BamWriter.cpp

   1 // ***************************************************************************
   2 // BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett
   3 // Marth Lab, Department of Biology, Boston College
   4 // All rights reserved.
   5 // ---------------------------------------------------------------------------
   6 // Last modified: 24 June 2009 (DB)
   7 // ---------------------------------------------------------------------------
   8 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
   9 // Institute.
  10 // ---------------------------------------------------------------------------
  11 // Provides the basic functionality for producing BAM files
  12 // ***************************************************************************
  13
  14 #include "BamWriter.h"
  15 using namespace BamTools;
  16 using namespace std;
  17
  18 // constructor
  19 BamWriter::BamWriter(void)
  20 {}
  21
  22 // destructor
  23 BamWriter::~BamWriter(void) {
  24         if(mBGZF.IsOpen) BgzfClose();
  25 }
  26
  27 // closes the BAM file
  28 void BamWriter::BgzfClose(void) {
  29
  30         mBGZF.IsOpen = false;
  31
  32         // flush the BGZF block
  33         BgzfFlushBlock();
  34
  35         // flush and close
  36         fflush(mBGZF.Stream);
  37         fclose(mBGZF.Stream);
  38 }
  39
  40 // compresses the current block
  41 int BamWriter::BgzfDeflateBlock(void) {
  42
  43         // initialize the gzip header
  44         char* buffer = mBGZF.CompressedBlock;
  45         unsigned int bufferSize = mBGZF.CompressedBlockSize;
  46
  47         memset(buffer, 0, 18);
  48         buffer[0]  = GZIP_ID1;
  49         buffer[1]  = (char)GZIP_ID2;
  50         buffer[2]  = CM_DEFLATE;
  51         buffer[3]  = FLG_FEXTRA;
  52         buffer[9]  = (char)OS_UNKNOWN;
  53         buffer[10] = BGZF_XLEN;
  54         buffer[12] = BGZF_ID1;
  55         buffer[13] = BGZF_ID2;
  56         buffer[14] = BGZF_LEN;
  57
  58         // loop to retry for blocks that do not compress enough
  59         int inputLength = mBGZF.BlockOffset;
  60         int compressedLength = 0;
  61
  62         while(true) {
  63
  64                 z_stream zs;
  65                 zs.zalloc    = NULL;
  66                 zs.zfree     = NULL;
  67                 zs.next_in   = (Bytef*)mBGZF.UncompressedBlock;
  68                 zs.avail_in  = inputLength;
  69                 zs.next_out  = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
  70                 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
  71
  72                 // initialize the zlib compression algorithm
  73                 if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
  74                         printf("ERROR: zlib deflate initialization failed.\n");
  75                         exit(1);
  76                 }
  77
  78                 // compress the data
  79                 int status = deflate(&zs, Z_FINISH);
  80                 if(status != Z_STREAM_END) {
  81                         deflateEnd(&zs);
  82
  83                         // reduce the input length and try again
  84                         if(status == Z_OK) {
  85                                 inputLength -= 1024;
  86                                 if(inputLength < 0) {
  87                                         printf("ERROR: input reduction failed.\n");
  88                                         exit(1);
  89                                 }
  90                                 continue;
  91                         }
  92
  93                         printf("ERROR: zlib deflate failed.\n");
  94                         exit(1);
  95                 }
  96
  97                 // finalize the compression routine
  98                 if(deflateEnd(&zs) != Z_OK) {
  99                         printf("ERROR: deflate end failed.\n");
 100                         exit(1);
 101                 }
 102
 103                 compressedLength = zs.total_out;
 104                 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
 105
 106                 if(compressedLength > MAX_BLOCK_SIZE) {
 107                         printf("ERROR: deflate overflow.\n");
 108                         exit(1);
 109                 }
 110
 111                 break;
 112         }
 113
 114         // store the compressed length
 115         BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
 116
 117         // store the CRC32 checksum
 118         unsigned int crc = crc32(0, NULL, 0);
 119         crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength);
 120         BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc);
 121         BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength);
 122
 123         // ensure that we have less than a block of data left
 124         int remaining = mBGZF.BlockOffset - inputLength;
 125         if(remaining > 0) {
 126                 if(remaining > inputLength) {
 127                         printf("ERROR: remainder too large.\n");
 128                         exit(1);
 129                 }
 130
 131                 memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining);
 132         }
 133
 134         mBGZF.BlockOffset = remaining;
 135         return compressedLength;
 136 }
 137
 138 // flushes the data in the BGZF block
 139 void BamWriter::BgzfFlushBlock(void) {
 140
 141         // flush all of the remaining blocks
 142         while(mBGZF.BlockOffset > 0) {
 143
 144                 // compress the data block
 145                 int blockLength = BgzfDeflateBlock();
 146
 147                 // flush the data to our output stream
 148                 int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream);
 149
 150                 if(numBytesWritten != blockLength) {
 151                         printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
 152                         exit(1);
 153                 }
 154
 155                 mBGZF.BlockAddress += blockLength;
 156         }
 157 }
 158
 159 // opens the BAM file for writing
 160 void BamWriter::BgzfOpen(const string& filename) {
 161
 162         mBGZF.Stream = fopen(filename.c_str(), "wb");
 163
 164         if(!mBGZF.Stream) {
 165                 printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str());
 166                 exit(1);
 167         }
 168
 169         mBGZF.IsOpen = true;
 170 }
 171
 172 // writes the supplied data into the BGZF buffer
 173 unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) {
 174
 175         // initialize
 176         unsigned int numBytesWritten = 0;
 177         const char* input = data;
 178         unsigned int blockLength = mBGZF.UncompressedBlockSize;
 179
 180         // copy the data to the buffer
 181         while(numBytesWritten < dataLen) {
 182                 unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten);
 183                 char* buffer = mBGZF.UncompressedBlock;
 184                 memcpy(buffer + mBGZF.BlockOffset, input, copyLength);
 185
 186                 mBGZF.BlockOffset += copyLength;
 187                 input             += copyLength;
 188                 numBytesWritten   += copyLength;
 189
 190                 if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock();
 191         }
 192
 193         return numBytesWritten;
 194 }
 195
 196 // closes the alignment archive
 197 void BamWriter::Close(void) {
 198         if(mBGZF.IsOpen) BgzfClose();
 199 }
 200
 201 // creates a cigar string from the supplied alignment
 202 void BamWriter::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
 203
 204         // initialize
 205         const unsigned int numCigarOperations = cigarOperations.size();
 206         packedCigar.resize(numCigarOperations * SIZEOF_INT);
 207
 208         // pack the cigar data into the string
 209         unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
 210
 211         unsigned int cigarOp;
 212         vector<CigarOp>::const_iterator coIter;
 213         for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
 214
 215                 switch(coIter->Type) {
 216           case 'M':
 217                   cigarOp = BAM_CMATCH;
 218                   break;
 219           case 'I':
 220                   cigarOp = BAM_CINS;
 221                   break;
 222           case 'D':
 223                   cigarOp = BAM_CDEL;
 224                   break;
 225           case 'N':
 226                   cigarOp = BAM_CREF_SKIP;
 227                   break;
 228           case 'S':
 229                   cigarOp = BAM_CSOFT_CLIP;
 230                   break;
 231           case 'H':
 232                   cigarOp = BAM_CHARD_CLIP;
 233                   break;
 234           case 'P':
 235                   cigarOp = BAM_CPAD;
 236                   break;
 237           default:
 238                   printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
 239                   exit(1);
 240                 }
 241
 242                 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
 243                 pPackedCigar++;
 244         }
 245 }
 246
 247 // encodes the supplied query sequence into 4-bit notation
 248 void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) {
 249
 250         // prepare the encoded query string
 251         const unsigned int queryLen = query.size();
 252         const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
 253         encodedQuery.resize(encodedQueryLen);
 254         char* pEncodedQuery = (char*)encodedQuery.data();
 255         const char* pQuery = (const char*)query.data();
 256
 257         unsigned char nucleotideCode;
 258         bool useHighWord = true;
 259
 260         while(*pQuery) {
 261
 262                 switch(*pQuery) {
 263                         case '=':
 264                                 nucleotideCode = 0;
 265                                 break;
 266                         case 'A':
 267                                 nucleotideCode = 1;
 268                                 break;
 269                         case 'C':
 270                                 nucleotideCode = 2;
 271                                 break;
 272                         case 'G':
 273                                 nucleotideCode = 4;
 274                                 break;
 275                         case 'T':
 276                                 nucleotideCode = 8;
 277                                 break;
 278                         case 'N':
 279                                 nucleotideCode = 15;
 280                                 break;
 281                         default:
 282                                 printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
 283                                 exit(1);
 284                 }
 285
 286                 // pack the nucleotide code
 287                 if(useHighWord) {
 288                         *pEncodedQuery = nucleotideCode << 4;
 289                         useHighWord = false;
 290                 } else {
 291                         *pEncodedQuery |= nucleotideCode;
 292                         pEncodedQuery++;
 293                         useHighWord = true;
 294                 }
 295
 296                 // increment the query position
 297                 pQuery++;
 298         }
 299 }
 300
 301 // opens the alignment archive
 302 void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
 303
 304         // open the BGZF file for writing
 305         BgzfOpen(filename);
 306
 307         // ================
 308         // write the header
 309         // ================
 310
 311         // write the BAM signature
 312         const unsigned char SIGNATURE_LENGTH = 4;
 313         const char* BAM_SIGNATURE = "BAM\1";
 314         BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH);
 315
 316         // write the SAM header text length
 317         const unsigned int samHeaderLen = samHeader.size();
 318         BgzfWrite((char*)&samHeaderLen, SIZEOF_INT);
 319
 320         // write the SAM header text
 321         if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen);
 322
 323         // write the number of reference sequences
 324         const unsigned int numReferenceSequences = referenceSequences.size();
 325         BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT);
 326
 327         // =============================
 328         // write the sequence dictionary
 329         // =============================
 330
 331         RefVector::const_iterator rsIter;
 332         for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
 333
 334                 // write the reference sequence name length
 335                 const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;
 336                 BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT);
 337
 338                 // write the reference sequence name
 339                 BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen);
 340
 341                 // write the reference sequence length
 342                 BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT);
 343         }
 344 }
 345
 346 // saves the alignment to the alignment archive
 347 void BamWriter::SaveAlignment(const BamAlignment& al) {
 348
 349         // initialize
 350         const unsigned int nameLen            = al.Name.size() + 1;
 351         const unsigned int queryLen           = al.QueryBases.size();
 352         const unsigned int numCigarOperations = al.CigarData.size();
 353
 354         // create our packed cigar string
 355         string packedCigar;
 356         CreatePackedCigar(al.CigarData, packedCigar);
 357         const unsigned int packedCigarLen = packedCigar.size();
 358
 359         // encode the query
 360         string encodedQuery;
 361         EncodeQuerySequence(al.QueryBases, encodedQuery);
 362         const unsigned int encodedQueryLen = encodedQuery.size();
 363
 364         // store the tag data length
 365         const unsigned int tagDataLength = al.TagData.size() + 1;
 366
 367         // assign the BAM core data
 368         unsigned int buffer[8];
 369         buffer[0] = al.RefID;
 370         buffer[1] = al.Position;
 371         buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
 372         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
 373         buffer[4] = queryLen;
 374         buffer[5] = al.MateRefID;
 375         buffer[6] = al.MatePosition;
 376         buffer[7] = al.InsertSize;
 377
 378         // write the block size
 379         const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;
 380         const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
 381
 382         BgzfWrite((char*)&blockSize, SIZEOF_INT);
 383
 384         // write the BAM core
 385         BgzfWrite((char*)&buffer, BAM_CORE_SIZE);
 386
 387         // write the query name
 388         BgzfWrite(al.Name.c_str(), nameLen);
 389
 390         // write the packed cigar
 391         BgzfWrite(packedCigar.data(), packedCigarLen);
 392
 393         // write the encoded query sequence
 394         BgzfWrite(encodedQuery.data(), encodedQueryLen);
 395
 396         // write the base qualities
 397         string baseQualities = al.Qualities;
 398         char* pBaseQualities = (char*)al.Qualities.data();
 399         for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33;
 400         BgzfWrite(pBaseQualities, queryLen);
 401
 402         // write the read group tag
 403         BgzfWrite(al.TagData.data(), tagDataLength);
 404 }