BamReader.cpp

   1 // ***************************************************************************
   2 // BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
   3 // Marth Lab, Department of Biology, Boston College
   4 // All rights reserved.
   5 // ---------------------------------------------------------------------------
   6 // Last modified: 15 July 2009 (DB)
   7 // ---------------------------------------------------------------------------
   8 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
   9 // Institute.
  10 // ---------------------------------------------------------------------------
  11 // Provides the basic functionality for reading BAM files
  12 // ***************************************************************************
  13
  14 // BamTools includes
  15 #include "BamReader.h"
  16 using namespace BamTools;
  17 using namespace std;
  18
  19 // static character constants
  20 const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";
  21 const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
  22
  23 // constructor
  24 BamReader::BamReader(void)
  25         : m_index(NULL)
  26         , m_isIndexLoaded(false)
  27         , m_alignmentsBeginOffset(0)
  28         , m_isRegionSpecified(false)
  29         , m_currentRefID(0)
  30         , m_currentLeft(0)
  31 { }
  32
  33 // destructor
  34 BamReader::~BamReader(void) {
  35         Close();
  36 }
  37
  38 // checks BGZF block header
  39 bool BamReader::BgzfCheckBlockHeader(char* header) {
  40
  41         return (header[0] == GZIP_ID1 &&
  42             header[1] == (char)GZIP_ID2 &&
  43             header[2] == Z_DEFLATED &&
  44             (header[3] & FLG_FEXTRA) != 0 &&
  45             BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
  46             header[12] == BGZF_ID1 &&
  47             header[13] == BGZF_ID2 &&
  48             BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN
  49                    );
  50 }
  51
  52 // closes the BAM file
  53 void BamReader::BgzfClose(void) {
  54         m_BGZF.IsOpen = false;
  55         fclose(m_BGZF.Stream);
  56 }
  57
  58 // de-compresses the current block
  59 int BamReader::BgzfInflateBlock(int blockLength) {
  60
  61         // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
  62     z_stream zs;
  63     zs.zalloc    = NULL;
  64     zs.zfree     = NULL;
  65     zs.next_in   = (Bytef*)m_BGZF.CompressedBlock + 18;
  66     zs.avail_in  = blockLength - 16;
  67     zs.next_out  = (Bytef*)m_BGZF.UncompressedBlock;
  68     zs.avail_out = m_BGZF.UncompressedBlockSize;
  69
  70     int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
  71     if (status != Z_OK) {
  72         printf("inflateInit failed\n");
  73         exit(1);
  74     }
  75
  76     status = inflate(&zs, Z_FINISH);
  77     if (status != Z_STREAM_END) {
  78         inflateEnd(&zs);
  79         printf("inflate failed\n");
  80         exit(1);
  81     }
  82
  83     status = inflateEnd(&zs);
  84     if (status != Z_OK) {
  85         printf("inflateEnd failed\n");
  86         exit(1);
  87     }
  88
  89     return zs.total_out;
  90 }
  91
  92 // opens the BAM file for reading
  93 void BamReader::BgzfOpen(const string& filename) {
  94
  95         m_BGZF.Stream = fopen(filename.c_str(), "rb");
  96         if(!m_BGZF.Stream) {
  97                 printf("ERROR: Unable to open the BAM file %s for reading.\n", filename.c_str() );
  98                 exit(1);
  99         }
 100
 101         m_BGZF.IsOpen = true;
 102 }
 103
 104 // reads BGZF data into buffer
 105 unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) {
 106
 107     if (dataLength == 0) { return 0; }
 108
 109         char* output = data;
 110     unsigned int numBytesRead = 0;
 111     while (numBytesRead < dataLength) {
 112
 113         int bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
 114         if (bytesAvailable <= 0) {
 115             if ( BgzfReadBlock() != 0 ) { return -1; }
 116             bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
 117             if ( bytesAvailable <= 0 ) { break; }
 118         }
 119
 120                 char* buffer   = m_BGZF.UncompressedBlock;
 121         int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
 122         memcpy(output, buffer + m_BGZF.BlockOffset, copyLength);
 123
 124         m_BGZF.BlockOffset += copyLength;
 125         output             += copyLength;
 126         numBytesRead       += copyLength;
 127     }
 128
 129     if ( m_BGZF.BlockOffset == m_BGZF.BlockLength ) {
 130                 m_BGZF.BlockAddress = ftello64(m_BGZF.Stream);
 131         m_BGZF.BlockOffset  = 0;
 132         m_BGZF.BlockLength  = 0;
 133     }
 134
 135         return numBytesRead;
 136 }
 137
 138 int BamReader::BgzfReadBlock(void) {
 139
 140     char    header[BLOCK_HEADER_LENGTH];
 141     int64_t blockAddress = ftello(m_BGZF.Stream);
 142
 143     int count = fread(header, 1, sizeof(header), m_BGZF.Stream);
 144         if (count == 0) {
 145         m_BGZF.BlockLength = 0;
 146         return 0;
 147     }
 148
 149     if (count != sizeof(header)) {
 150         printf("read block failed - count != sizeof(header)\n");
 151         return -1;
 152     }
 153
 154     if (!BgzfCheckBlockHeader(header)) {
 155         printf("read block failed - CheckBgzfBlockHeader() returned false\n");
 156         return -1;
 157     }
 158
 159     int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1;
 160     char* compressedBlock = m_BGZF.CompressedBlock;
 161     memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
 162     int remaining = blockLength - BLOCK_HEADER_LENGTH;
 163
 164     count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF.Stream);
 165     if (count != remaining) {
 166         printf("read block failed - count != remaining\n");
 167         return -1;
 168     }
 169
 170     count = BgzfInflateBlock(blockLength);
 171     if (count < 0) { return -1; }
 172
 173     if (m_BGZF.BlockLength != 0) {
 174         m_BGZF.BlockOffset = 0;
 175     }
 176
 177     m_BGZF.BlockAddress = blockAddress;
 178     m_BGZF.BlockLength  = count;
 179     return 0;
 180 }
 181
 182 // move file pointer to specified offset
 183 bool BamReader::BgzfSeek(int64_t position) {
 184
 185         int     blockOffset  = (position & 0xFFFF);
 186     int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
 187     if (fseeko(m_BGZF.Stream, blockAddress, SEEK_SET) != 0) {
 188         printf("ERROR: Unable to seek in BAM file\n");
 189                 exit(1);
 190     }
 191
 192     m_BGZF.BlockLength  = 0;
 193     m_BGZF.BlockAddress = blockAddress;
 194     m_BGZF.BlockOffset  = blockOffset;
 195         return true;
 196 }
 197
 198 // get file position in BAM file
 199 int64_t BamReader::BgzfTell(void) {
 200         return ( (m_BGZF.BlockAddress << 16) | (m_BGZF.BlockOffset & 0xFFFF) );
 201 }
 202
 203 int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
 204
 205         // get region boundaries
 206         uint32_t begin = left;
 207         uint32_t end   = m_references.at(refID).RefLength - 1;
 208
 209         // initialize list, bin '0' always a valid bin
 210         int i = 0;
 211         list[i++] = 0;
 212
 213         // get rest of bins that contain this region
 214         unsigned int k;
 215         for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { list[i++] = k; }
 216         for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { list[i++] = k; }
 217         for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { list[i++] = k; }
 218         for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { list[i++] = k; }
 219         for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
 220
 221         // return number of bins stored
 222         return i;
 223 }
 224
 225 unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
 226
 227         // initialize alignment end to starting position
 228         unsigned int alignEnd = position;
 229
 230         // iterate over cigar operations
 231         vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
 232         vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
 233         for ( ; cigarIter != cigarEnd; ++cigarIter) {
 234                 char cigarType = (*cigarIter).Type;
 235                 if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
 236                         alignEnd += (*cigarIter).Length;
 237                 }
 238         }
 239         return alignEnd;
 240 }
 241
 242 void BamReader::ClearIndex(void) {
 243
 244         if ( m_index ) {
 245                 // iterate over references
 246                 vector<RefIndex*>::iterator refIter = m_index->begin();
 247                 vector<RefIndex*>::iterator refEnd  = m_index->end();
 248                 for ( ; refIter != refEnd; ++refIter) {
 249                         RefIndex* aRef = (*refIter);
 250                         if ( aRef ) {
 251                                 // clear out BAM bins
 252                                 if ( aRef->first ) {
 253                                         BinVector::iterator binIter = (aRef->first)->begin();
 254                                         BinVector::iterator binEnd  = (aRef->first)->end();
 255                                         for ( ; binIter != binEnd; ++binIter ) {
 256                                                 ChunkVector* chunks = (*binIter).second;
 257                                                 if ( chunks ) { delete chunks; chunks = NULL;}
 258                                         }
 259                                         delete aRef->first;
 260                                         aRef->first = NULL;
 261                                 }
 262                                 // clear BAM linear offsets
 263                                 if ( aRef->second ) { delete aRef->second; aRef->second = NULL; }
 264                                 delete aRef;
 265                                 aRef = NULL;
 266                         }
 267                 }
 268                 delete m_index;
 269                 m_index = NULL;
 270         }
 271 }
 272
 273 // closes the BAM file
 274 void BamReader::Close(void) {
 275         if(m_BGZF.IsOpen) { BgzfClose(); }
 276         ClearIndex();
 277         m_headerText.clear();
 278         m_isRegionSpecified = false;
 279 }
 280
 281 const string BamReader::GetHeaderText(void) const {
 282         return m_headerText;
 283 }
 284
 285 const int BamReader::GetReferenceCount(void) const {
 286         return m_references.size();
 287 }
 288
 289 const RefVector BamReader::GetReferenceData(void) const {
 290         return m_references;
 291 }
 292
 293 const int BamReader::GetReferenceID(const string& refName) const {
 294
 295         // retrieve names from reference data
 296         vector<string> refNames;
 297         RefVector::const_iterator refIter = m_references.begin();
 298     RefVector::const_iterator refEnd  = m_references.end();
 299     for ( ; refIter != refEnd; ++refIter) {
 300                 refNames.push_back( (*refIter).RefName );
 301     }
 302
 303         // return 'index-of' refName ( if not found, returns refNames.size() )
 304         return Index( refNames.begin(), refNames.end(), refName );
 305 }
 306
 307 // get next alignment (from specified region, if given)
 308 bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
 309
 310         // if valid alignment available
 311         if ( LoadNextAlignment(bAlignment) ) {
 312
 313                 // if region not specified, return success
 314                 if ( !m_isRegionSpecified ) { return true; }
 315
 316                 // load next alignment until region overlap is found
 317                 while ( !IsOverlap(bAlignment) ) {
 318                         // if no valid alignment available (likely EOF) return failure
 319                         if ( !LoadNextAlignment(bAlignment) ) { return false; }
 320                 }
 321
 322                 // return success (alignment found that overlaps region)
 323                 return true;
 324         }
 325
 326         // no valid alignment
 327         else { return false; }
 328 }
 329
 330 int64_t BamReader::GetOffset(int refID, unsigned int left) {
 331
 332         // calculate which bins overlap this region
 333         uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
 334         int numBins = BinsFromRegion(refID, left, bins);
 335
 336         // get bins for this reference
 337         RefIndex* refIndex = m_index->at(refID);
 338         BinVector* refBins = refIndex->first;
 339
 340         // get minimum offset to consider
 341         LinearOffsetVector* linearOffsets = refIndex->second;
 342         uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
 343
 344         // store offsets to beginning of alignment 'chunks'
 345         std::vector<int64_t> chunkStarts;
 346
 347         // reference bin iterators
 348         BinVector::const_iterator binIter;
 349         BinVector::const_iterator binBegin = refBins->begin();
 350         BinVector::const_iterator binEnd   = refBins->end();
 351
 352         // store all alignment 'chunk' starts for bins in this region
 353         for (int i = 0; i < numBins; ++i ) {
 354                 binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare<uint32_t, ChunkVector*>() );
 355                 if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) {
 356                         ChunkVector* binChunks = (*binIter).second;
 357                         ChunkVector::const_iterator chunkIter = binChunks->begin();
 358                         ChunkVector::const_iterator chunkEnd  = binChunks->end();
 359                         for ( ; chunkIter != chunkEnd; ++chunkIter) {
 360                                 if ( (*chunkIter).second > minOffset ) {
 361                                         chunkStarts.push_back( (*chunkIter).first );
 362                                 }
 363                         }
 364                 }
 365         }
 366
 367         // clean up memory
 368         free(bins);
 369
 370         // if no alignments found
 371         if ( chunkStarts.empty() ) { return -1; }
 372
 373         // else return smallest offset for alignment starts
 374         else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
 375 }
 376
 377 bool BamReader::IsOverlap(BamAlignment& bAlignment) {
 378
 379         // if on different reference sequence, quit
 380         if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }
 381
 382         // read starts after left boundary
 383         if ( bAlignment.Position >= m_currentLeft) { return true; }
 384
 385         // return whether alignment end overlaps left boundary
 386         return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
 387 }
 388
 389 bool BamReader::Jump(int refID, unsigned int position) {
 390
 391         // if index available, and region is valid
 392         if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (position <= m_references.at(refID).RefLength) ) {
 393                 m_currentRefID = refID;
 394                 m_currentLeft  = position;
 395                 m_isRegionSpecified = true;
 396
 397                 int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
 398                 if ( offset == -1 ) { return false; }
 399                 else { return BgzfSeek(offset); }
 400         }
 401         return false;
 402 }
 403
 404 void BamReader::LoadHeaderData(void) {
 405
 406         // check to see if proper BAM header
 407         char buffer[4];
 408         if (BgzfRead(buffer, 4) != 4) {
 409                 printf("Could not read header type\n");
 410                 exit(1);
 411         }
 412         if (strncmp(buffer, "BAM\001", 4)) {
 413                 printf("wrong header type!\n");
 414                 exit(1);
 415         }
 416
 417         // get BAM header text length
 418         BgzfRead(buffer, 4);
 419         const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer);
 420
 421         // get BAM header text
 422         char* headerText = (char*)calloc(headerTextLength + 1, 1);
 423         BgzfRead(headerText, headerTextLength);
 424         m_headerText = (string)((const char*)headerText);
 425
 426         // clean up calloc-ed temp variable
 427         free(headerText);
 428 }
 429
 430 void BamReader::LoadIndexData(FILE* indexStream) {
 431
 432         // see if index is valid BAM index
 433         char magic[4];
 434         fread(magic, 1, 4, indexStream);
 435         if (strncmp(magic, "BAI\1", 4)) {
 436                 printf("Problem with index file - invalid format.\n");
 437                 fclose(indexStream);
 438                 exit(1);
 439         }
 440
 441         // get number of reference sequences
 442         uint32_t numRefSeqs;
 443         fread(&numRefSeqs, 4, 1, indexStream);
 444
 445         // intialize BamIndex data structure
 446         m_index = new BamIndex;
 447         m_index->reserve(numRefSeqs);
 448
 449         // iterate over reference sequences
 450         for (unsigned int i = 0; i < numRefSeqs; ++i) {
 451
 452                 // get number of bins for this reference sequence
 453                 int32_t numBins;
 454                 fread(&numBins, 4, 1, indexStream);
 455
 456                 if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
 457
 458                 // intialize BinVector
 459                 BinVector* bins = new BinVector;
 460                 bins->reserve(numBins);
 461
 462                 // iterate over bins for that reference sequence
 463                 for (int j = 0; j < numBins; ++j) {
 464
 465                         // get binID
 466                         uint32_t binID;
 467                         fread(&binID, 4, 1, indexStream);
 468
 469                         // get number of regionChunks in this bin
 470                         uint32_t numChunks;
 471                         fread(&numChunks, 4, 1, indexStream);
 472
 473                         // intialize ChunkVector
 474                         ChunkVector* regionChunks = new ChunkVector;
 475                         regionChunks->reserve(numChunks);
 476
 477                         // iterate over regionChunks in this bin
 478                         for (unsigned int k = 0; k < numChunks; ++k) {
 479
 480                                 // get chunk boundaries (left, right)
 481                                 uint64_t left;
 482                                 uint64_t right;
 483                                 fread(&left, 8, 1, indexStream);
 484                                 fread(&right, 8, 1, indexStream);
 485
 486                                 // save ChunkPair
 487                                 regionChunks->push_back( ChunkPair(left, right) );
 488                         }
 489
 490                         // sort chunks for this bin
 491                         sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare<uint64_t, uint64_t>() );
 492
 493                         // save binID, chunkVector for this bin
 494                         bins->push_back( BamBin(binID, regionChunks) );
 495                 }
 496
 497                 // sort bins by binID
 498                 sort(bins->begin(), bins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
 499
 500                 // load linear index for this reference sequence
 501
 502                 // get number of linear offsets
 503                 int32_t numLinearOffsets;
 504                 fread(&numLinearOffsets, 4, 1, indexStream);
 505
 506                 // intialize LinearOffsetVector
 507                 LinearOffsetVector* linearOffsets = new LinearOffsetVector;
 508                 linearOffsets->reserve(numLinearOffsets);
 509
 510                 // iterate over linear offsets for this reference sequeence
 511                 for (int j = 0; j < numLinearOffsets; ++j) {
 512                         // get a linear offset
 513                         uint64_t linearOffset;
 514                         fread(&linearOffset, 8, 1, indexStream);
 515                         // store linear offset
 516                         linearOffsets->push_back(linearOffset);
 517                 }
 518
 519                 // sort linear offsets
 520                 sort( linearOffsets->begin(), linearOffsets->end() );
 521
 522                 // store index data for that reference sequence
 523                 m_index->push_back( new RefIndex(bins, linearOffsets) );
 524         }
 525
 526         // close index file (.bai) and return
 527         fclose(indexStream);
 528 }
 529
 530 bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
 531
 532         // read in the 'block length' value, make sure it's not zero
 533         char buffer[4];
 534         BgzfRead(buffer, 4);
 535         const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer);
 536         if ( blockLength == 0 ) { return false; }
 537
 538         // keep track of bytes read as method progresses
 539         int bytesRead = 4;
 540
 541         // read in core alignment data, make sure the right size of data was read
 542         char x[BAM_CORE_SIZE];
 543         if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
 544         bytesRead += BAM_CORE_SIZE;
 545
 546         // set BamAlignment 'core' data and character data lengths
 547         unsigned int tempValue;
 548         unsigned int queryNameLength;
 549         unsigned int numCigarOperations;
 550         unsigned int querySequenceLength;
 551
 552         bAlignment.RefID    = BgzfUnpackUnsignedInt(&x[0]);
 553         bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]);
 554
 555         tempValue             = BgzfUnpackUnsignedInt(&x[8]);
 556         bAlignment.Bin        = tempValue >> 16;
 557         bAlignment.MapQuality = tempValue >> 8 & 0xff;
 558         queryNameLength       = tempValue & 0xff;
 559
 560         tempValue                = BgzfUnpackUnsignedInt(&x[12]);
 561         bAlignment.AlignmentFlag = tempValue >> 16;
 562         numCigarOperations       = tempValue & 0xffff;
 563
 564         querySequenceLength     = BgzfUnpackUnsignedInt(&x[16]);
 565         bAlignment.MateRefID    = BgzfUnpackUnsignedInt(&x[20]);
 566         bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]);
 567         bAlignment.InsertSize   = BgzfUnpackUnsignedInt(&x[28]);
 568
 569         // calculate lengths/offsets
 570         const unsigned int dataLength      = blockLength - BAM_CORE_SIZE;
 571         const unsigned int cigarDataOffset = queryNameLength;
 572         const unsigned int seqDataOffset   = cigarDataOffset + (numCigarOperations * 4);
 573         const unsigned int qualDataOffset  = seqDataOffset + (querySequenceLength+1)/2;
 574         const unsigned int tagDataOffset   = qualDataOffset + querySequenceLength;
 575         const unsigned int tagDataLen      = dataLength - tagDataOffset;
 576
 577         // set up destination buffers for character data
 578         char* allCharData   = (char*)calloc(sizeof(char), dataLength);
 579         uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
 580         char* seqData       = ((char*)allCharData) + seqDataOffset;
 581         char* qualData      = ((char*)allCharData) + qualDataOffset;
 582         char* tagData       = ((char*)allCharData) + tagDataOffset;
 583
 584         // get character data - make sure proper data size was read
 585         if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; }
 586         else {
 587
 588                 bytesRead += dataLength;
 589
 590                 // clear out any previous string data
 591                 bAlignment.Name.clear();
 592                 bAlignment.QueryBases.clear();
 593                 bAlignment.Qualities.clear();
 594                 bAlignment.AlignedBases.clear();
 595                 bAlignment.CigarData.clear();
 596                 bAlignment.TagData.clear();
 597
 598                 // save name
 599                 bAlignment.Name = (string)((const char*)(allCharData));
 600
 601                 // save query sequence
 602                 for (unsigned int i = 0; i < querySequenceLength; ++i) {
 603                         char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
 604                         bAlignment.QueryBases.append( 1, singleBase );
 605                 }
 606
 607                 // save sequence length
 608                 bAlignment.Length = bAlignment.QueryBases.length();
 609
 610                 // save qualities
 611                 for (unsigned int i = 0; i < querySequenceLength; ++i) {
 612                         char singleQuality = (char)(qualData[i]+33);                    // conversion from QV to FASTQ character
 613                         bAlignment.Qualities.append( 1, singleQuality );
 614                 }
 615
 616                 // save CIGAR-related data;
 617                 int k = 0;
 618                 for (unsigned int i = 0; i < numCigarOperations; ++i) {
 619
 620                         // build CigarOp struct
 621                         CigarOp op;
 622                         op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
 623                         op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
 624
 625                         // save CigarOp
 626                         bAlignment.CigarData.push_back(op);
 627
 628                         // build AlignedBases string
 629                         switch (op.Type) {
 630
 631                                 case ('M') :
 632                                 case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) );      // for 'M', 'I' - write bases
 633                                 case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases
 634                                                          break;
 635
 636                                 case ('D') : bAlignment.AlignedBases.append( op.Length, '-' );  // for 'D' - write gap character
 637                                                          break;
 638
 639                                 case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'P' - write padding character;
 640                                                          break;
 641
 642                                 case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence
 643                                                          k += op.Length;
 644                                                          break;
 645
 646                                 case ('H') : break;                                                                                     // for 'H' - do nothing, move to next op
 647
 648                                 default    : printf("ERROR: Invalid Cigar op type\n");                  // shouldn't get here
 649                                                          exit(1);
 650                         }
 651                 }
 652
 653                 // read in the tag data
 654                 bAlignment.TagData.resize(tagDataLen);
 655                 memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
 656         }
 657
 658         free(allCharData);
 659         return true;
 660 }
 661
 662 void BamReader::LoadReferenceData(void) {
 663
 664         // get number of reference sequences
 665         char buffer[4];
 666         BgzfRead(buffer, 4);
 667         const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer);
 668         if (numberRefSeqs == 0) { return; }
 669         m_references.reserve((int)numberRefSeqs);
 670
 671         // iterate over all references in header
 672         for (unsigned int i = 0; i != numberRefSeqs; ++i) {
 673
 674                 // get length of reference name
 675                 BgzfRead(buffer, 4);
 676                 const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer);
 677                 char* refName = (char*)calloc(refNameLength, 1);
 678
 679                 // get reference name and reference sequence length
 680                 BgzfRead(refName, refNameLength);
 681                 BgzfRead(buffer, 4);
 682                 const unsigned int refLength = BgzfUnpackUnsignedInt(buffer);
 683
 684                 // store data for reference
 685                 RefData aReference;
 686                 aReference.RefName   = (string)((const char*)refName);
 687                 aReference.RefLength = refLength;
 688                 m_references.push_back(aReference);
 689
 690                 // clean up calloc-ed temp variable
 691                 free(refName);
 692         }
 693 }
 694
 695 // opens BAM file (and index)
 696 void BamReader::Open(const string& filename, const string& indexFilename) {
 697
 698         // open the BGZF file for reading, retrieve header text & reference data
 699         BgzfOpen(filename);
 700         LoadHeaderData();
 701         LoadReferenceData();
 702
 703         // store file offset of first alignment
 704         m_alignmentsBeginOffset = BgzfTell();
 705
 706         // open index file & load index data (if exists)
 707         OpenIndex(indexFilename);
 708 }
 709
 710 void BamReader::OpenIndex(const string& indexFilename) {
 711
 712         // if index file exists
 713         if (!indexFilename.empty()) {
 714
 715                 // open index
 716                 FILE* indexStream = fopen(indexFilename.c_str(), "rb");
 717
 718                 // abort on error
 719                 if(!indexStream) {
 720                         printf("ERROR: Unable to open the BAM index file %s for reading.\n", indexFilename.c_str() );
 721                         exit(1);
 722                 }
 723
 724                 // build up index data structure
 725                 LoadIndexData(indexStream);
 726         }
 727 }
 728
 729 bool BamReader::Rewind(void) {
 730
 731         // find first reference that has alignments in the BAM file
 732         int refID = 0;
 733         int refCount = m_references.size();
 734         for ( ; refID < refCount; ++refID ) {
 735                 if ( m_references.at(refID).RefHasAlignments ) { break; }
 736         }
 737
 738         // store default bounds for first alignment
 739         m_currentRefID = refID;
 740         m_currentLeft = 0;
 741         m_isRegionSpecified = false;
 742
 743         // return success/failure of seek
 744         return BgzfSeek(m_alignmentsBeginOffset);
 745 }