BamWriter.cpp

   1 // ***************************************************************************
   2 // BamWriter (c) 2009 Michael Strömberg
   3 // Marth Lab, Deptartment of Biology, Boston College
   4 // All rights reserved.
   5 // ---------------------------------------------------------------------------
   6 // Provides the basic functionality for producing BAM files
   7 // ***************************************************************************
   8
   9 #include "BamWriter.h"
  10
  11 // constructor
  12 BamWriter::BamWriter(void)
  13 {}
  14
  15 // destructor
  16 BamWriter::~BamWriter(void) {
  17         if(mBGZF.IsOpen) BgzfClose();
  18 }
  19
  20 // closes the BAM file
  21 void BamWriter::BgzfClose(void) {
  22
  23         mBGZF.IsOpen = false;
  24
  25         // flush the BGZF block
  26         BgzfFlushBlock();
  27
  28         // flush and close
  29         fflush(mBGZF.Stream);
  30         fclose(mBGZF.Stream);
  31 }
  32
  33 // compresses the current block
  34 int BamWriter::BgzfDeflateBlock(void) {
  35
  36         // initialize the gzip header
  37         char* buffer = mBGZF.CompressedBlock;
  38         unsigned int bufferSize = mBGZF.CompressedBlockSize;
  39
  40         memset(buffer, 0, 18);
  41         buffer[0]  = GZIP_ID1;
  42         buffer[1]  = (char)GZIP_ID2;
  43         buffer[2]  = CM_DEFLATE;
  44         buffer[3]  = FLG_FEXTRA;
  45         buffer[9]  = (char)OS_UNKNOWN;
  46         buffer[10] = BGZF_XLEN;
  47         buffer[12] = BGZF_ID1;
  48         buffer[13] = BGZF_ID2;
  49         buffer[14] = BGZF_LEN;
  50
  51         // loop to retry for blocks that do not compress enough
  52         int inputLength = mBGZF.BlockOffset;
  53         int compressedLength = 0;
  54
  55         while(true) {
  56
  57                 z_stream zs;
  58                 zs.zalloc    = NULL;
  59                 zs.zfree     = NULL;
  60                 zs.next_in   = (Bytef*)mBGZF.UncompressedBlock;
  61                 zs.avail_in  = inputLength;
  62                 zs.next_out  = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
  63                 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
  64
  65                 // initialize the zlib compression algorithm
  66                 if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
  67                         printf("ERROR: zlib deflate initialization failed.\n");
  68                         exit(1);
  69                 }
  70
  71                 // compress the data
  72                 int status = deflate(&zs, Z_FINISH);
  73                 if(status != Z_STREAM_END) {
  74                         deflateEnd(&zs);
  75
  76                         // reduce the input length and try again
  77                         if(status == Z_OK) {
  78                                 inputLength -= 1024;
  79                                 if(inputLength < 0) {
  80                                         printf("ERROR: input reduction failed.\n");
  81                                         exit(1);
  82                                 }
  83                                 continue;
  84                         }
  85
  86                         printf("ERROR: zlib deflate failed.\n");
  87                         exit(1);
  88                 }
  89
  90                 // finalize the compression routine
  91                 if(deflateEnd(&zs) != Z_OK) {
  92                         printf("ERROR: deflate end failed.\n");
  93                         exit(1);
  94                 }
  95
  96                 compressedLength = zs.total_out;
  97                 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
  98
  99                 if(compressedLength > MAX_BLOCK_SIZE) {
 100                         printf("ERROR: deflate overflow.\n");
 101                         exit(1);
 102                 }
 103
 104                 break;
 105         }
 106
 107         // store the compressed length
 108         BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
 109
 110         // store the CRC32 checksum
 111         unsigned int crc = crc32(0, NULL, 0);
 112         crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength);
 113         BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc);
 114         BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength);
 115
 116         // ensure that we have less than a block of data left
 117         int remaining = mBGZF.BlockOffset - inputLength;
 118         if(remaining > 0) {
 119                 if(remaining > inputLength) {
 120                         printf("ERROR: remainder too large.\n");
 121                         exit(1);
 122                 }
 123
 124                 memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining);
 125         }
 126
 127         mBGZF.BlockOffset = remaining;
 128         return compressedLength;
 129 }
 130
 131 // flushes the data in the BGZF block
 132 void BamWriter::BgzfFlushBlock(void) {
 133
 134         // flush all of the remaining blocks
 135         while(mBGZF.BlockOffset > 0) {
 136
 137                 // compress the data block
 138                 int blockLength = BgzfDeflateBlock();
 139
 140                 // flush the data to our output stream
 141                 int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream);
 142
 143                 if(numBytesWritten != blockLength) {
 144                         printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
 145                         exit(1);
 146                 }
 147
 148                 mBGZF.BlockAddress += blockLength;
 149         }
 150 }
 151
 152 // opens the BAM file for writing
 153 void BamWriter::BgzfOpen(const string& filename) {
 154
 155         mBGZF.Stream = fopen(filename.c_str(), "wb");
 156
 157         if(!mBGZF.Stream) {
 158                 printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str());
 159                 exit(1);
 160         }
 161
 162         mBGZF.IsOpen = true;
 163 }
 164
 165 // writes the supplied data into the BGZF buffer
 166 unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) {
 167
 168         // initialize
 169         unsigned int numBytesWritten = 0;
 170         const char* input = data;
 171         unsigned int blockLength = mBGZF.UncompressedBlockSize;
 172
 173         // copy the data to the buffer
 174         while(numBytesWritten < dataLen) {
 175                 unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten);
 176                 char* buffer = mBGZF.UncompressedBlock;
 177                 memcpy(buffer + mBGZF.BlockOffset, input, copyLength);
 178
 179                 mBGZF.BlockOffset += copyLength;
 180                 input             += copyLength;
 181                 numBytesWritten   += copyLength;
 182
 183                 if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock();
 184         }
 185
 186         return numBytesWritten;
 187 }
 188
 189 // closes the alignment archive
 190 void BamWriter::Close(void) {
 191         if(mBGZF.IsOpen) BgzfClose();
 192 }
 193
 194 // creates a cigar string from the supplied alignment
 195 void BamWriter::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
 196
 197         // initialize
 198         const unsigned int numCigarOperations = cigarOperations.size();
 199         packedCigar.resize(numCigarOperations * SIZEOF_INT);
 200
 201         // pack the cigar data into the string
 202         unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
 203
 204         unsigned int cigarOp;
 205         vector<CigarOp>::const_iterator coIter;
 206         for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
 207
 208                 switch(coIter->Type) {
 209           case 'M':
 210                   cigarOp = BAM_CMATCH;
 211                   break;
 212           case 'I':
 213                   cigarOp = BAM_CINS;
 214                   break;
 215           case 'D':
 216                   cigarOp = BAM_CDEL;
 217                   break;
 218           case 'N':
 219                   cigarOp = BAM_CREF_SKIP;
 220                   break;
 221           case 'S':
 222                   cigarOp = BAM_CSOFT_CLIP;
 223                   break;
 224           case 'H':
 225                   cigarOp = BAM_CHARD_CLIP;
 226                   break;
 227           case 'P':
 228                   cigarOp = BAM_CPAD;
 229                   break;
 230           default:
 231                   printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
 232                   exit(1);
 233                 }
 234
 235                 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
 236                 pPackedCigar++;
 237         }
 238 }
 239
 240 // encodes the supplied query sequence into 4-bit notation
 241 void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) {
 242
 243         // prepare the encoded query string
 244         const unsigned int queryLen = query.size();
 245         const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
 246         encodedQuery.resize(encodedQueryLen);
 247         char* pEncodedQuery = (char*)encodedQuery.data();
 248         const char* pQuery = (const char*)query.data();
 249
 250         unsigned char nucleotideCode;
 251         bool useHighWord = true;
 252
 253         while(*pQuery) {
 254
 255                 switch(*pQuery) {
 256                         case '=':
 257                                 nucleotideCode = 0;
 258                                 break;
 259                         case 'A':
 260                                 nucleotideCode = 1;
 261                                 break;
 262                         case 'C':
 263                                 nucleotideCode = 2;
 264                                 break;
 265                         case 'G':
 266                                 nucleotideCode = 4;
 267                                 break;
 268                         case 'T':
 269                                 nucleotideCode = 8;
 270                                 break;
 271                         case 'N':
 272                                 nucleotideCode = 15;
 273                                 break;
 274                         default:
 275                                 printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
 276                                 exit(1);
 277                 }
 278
 279                 // pack the nucleotide code
 280                 if(useHighWord) {
 281                         *pEncodedQuery = nucleotideCode << 4;
 282                         useHighWord = false;
 283                 } else {
 284                         *pEncodedQuery |= nucleotideCode;
 285                         pEncodedQuery++;
 286                         useHighWord = true;
 287                 }
 288
 289                 // increment the query position
 290                 pQuery++;
 291         }
 292 }
 293
 294 // opens the alignment archive
 295 void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
 296
 297         // open the BGZF file for writing
 298         BgzfOpen(filename);
 299
 300         // ================
 301         // write the header
 302         // ================
 303
 304         // write the BAM signature
 305         const unsigned char SIGNATURE_LENGTH = 4;
 306         const char* BAM_SIGNATURE = "BAM\1";
 307         BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH);
 308
 309         // write the SAM header text length
 310         const unsigned int samHeaderLen = samHeader.size();
 311         BgzfWrite((char*)&samHeaderLen, SIZEOF_INT);
 312
 313         // write the SAM header text
 314         if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen);
 315
 316         // write the number of reference sequences
 317         const unsigned int numReferenceSequences = referenceSequences.size();
 318         BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT);
 319
 320         // =============================
 321         // write the sequence dictionary
 322         // =============================
 323
 324         RefVector::const_iterator rsIter;
 325         for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
 326
 327                 // write the reference sequence name length
 328                 const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;
 329                 BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT);
 330
 331                 // write the reference sequence name
 332                 BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen);
 333
 334                 // write the reference sequence length
 335                 BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT);
 336         }
 337 }
 338
 339 // saves the alignment to the alignment archive
 340 void BamWriter::SaveAlignment(const BamAlignment& al) {
 341
 342         // initialize
 343         const unsigned int nameLen            = al.Name.size() + 1;
 344         const unsigned int queryLen           = al.QueryBases.size();
 345         const unsigned int numCigarOperations = al.CigarData.size();
 346
 347         // create our packed cigar string
 348         string packedCigar;
 349         CreatePackedCigar(al.CigarData, packedCigar);
 350         const unsigned int packedCigarLen = packedCigar.size();
 351
 352         // encode the query
 353         string encodedQuery;
 354         EncodeQuerySequence(al.QueryBases, encodedQuery);
 355         const unsigned int encodedQueryLen = encodedQuery.size();
 356
 357         // create our read group tag
 358         // TODO: add read group tag support when it shows up in the BamAlignment struct
 359         //string readGroupTag;
 360         //const unsigned int readGroupTagLen = 3 + mReadGroupLUT[al.ReadGroupCode].NameLen + 1;
 361         //readGroupTag.resize(readGroupTagLen);
 362         //char* pReadGroupTag = (char*)readGroupTag.data();
 363         //sprintf(pReadGroupTag, "RGZ%s", mReadGroupLUT[al.ReadGroupCode].Name);
 364
 365         // assign the BAM core data
 366         unsigned int buffer[8];
 367         buffer[0] = al.RefID;
 368         buffer[1] = al.Position;
 369         buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
 370         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
 371         buffer[4] = queryLen;
 372         buffer[5] = al.MateRefID;
 373         buffer[6] = al.MatePosition;
 374         buffer[7] = al.InsertSize;
 375
 376         // write the block size
 377         const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen /* + readGroupTagLen*/;
 378         const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
 379
 380         BgzfWrite((char*)&blockSize, SIZEOF_INT);
 381
 382         // write the BAM core
 383         BgzfWrite((char*)&buffer, BAM_CORE_SIZE);
 384
 385         // write the query name
 386         BgzfWrite(al.Name.c_str(), nameLen);
 387
 388         // write the packed cigar
 389         BgzfWrite(packedCigar.data(), packedCigarLen);
 390
 391         // write the encoded query sequence
 392         BgzfWrite(encodedQuery.data(), encodedQueryLen);
 393
 394         // write the base qualities
 395         string baseQualities = al.Qualities;
 396         char* pBaseQualities = (char*)al.Qualities.data();
 397         for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33;
 398         BgzfWrite(pBaseQualities, queryLen);
 399
 400         // write the read group tag
 401         // TODO: add read group tag support when it shows up in the BamAlignment struct
 402         //BgzfWrite(readGroupTag.c_str(), readGroupTagLen);
 403 }