]> git.donarmstrong.com Git - bamtools.git/blobdiff - BamReader.cpp
Minor formatting cleanup in BamIndex.*
[bamtools.git] / BamReader.cpp
index f2a1e11071dd8d16b25f7cd4fa8ced759bffc94c..b1b1b5de2ab79a76416bd5267f6dc5cc1ecccd02 100644 (file)
-// ***************************************************************************
-// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 15 July 2009 (DB)
-// ---------------------------------------------------------------------------
-// The BGZF routines were adapted from the bgzf.c code developed at the Broad
-// Institute.
-// ---------------------------------------------------------------------------
-// Provides the basic functionality for reading BAM files
-// ***************************************************************************
-
-// BamTools includes
-#include "BamReader.h"
-using namespace BamTools;
-using namespace std;
-
-// static character constants
-const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";
-const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
-
-// constructor
-BamReader::BamReader(void)
-       : m_BGZF(NULL)
-       , m_index(NULL)
-       , m_isIndexLoaded(false)
-       , m_alignmentsBeginOffset(0)
-       , m_isRegionSpecified(false)
-       , m_currentRefID(0)
-       , m_currentLeft(0)
-{ }
-
-// destructor
-BamReader::~BamReader(void) {
-       Close();
-}
-
-// checks BGZF block header
-bool BamReader::BgzfCheckBlockHeader(char* header) {
-
-       return (header[0] == GZIP_ID1 &&
-            header[1] == (char)GZIP_ID2 &&
-            header[2] == Z_DEFLATED &&
-            (header[3] & FLG_FEXTRA) != 0 &&
-            BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
-            header[12] == BGZF_ID1 &&
-            header[13] == BGZF_ID2 &&
-            BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN
-                  );
-}
-
-// closes the BAM file
-void BamReader::BgzfClose(void) {
-       fflush(m_BGZF->Stream);
-       fclose(m_BGZF->Stream);
-       m_BGZF->IsOpen = false;
-}
-
-// de-compresses the current block
-int BamReader::BgzfInflateBlock(int blockLength) {
-       
-       // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
-    z_stream zs;
-    zs.zalloc    = NULL;
-    zs.zfree     = NULL;
-    zs.next_in   = (Bytef*)m_BGZF->CompressedBlock + 18;
-    zs.avail_in  = blockLength - 16;
-    zs.next_out  = (Bytef*)m_BGZF->UncompressedBlock;
-    zs.avail_out = m_BGZF->UncompressedBlockSize;
-
-    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
-    if (status != Z_OK) {
-        printf("inflateInit failed\n");
-        exit(1);
-    }
-
-    status = inflate(&zs, Z_FINISH);
-    if (status != Z_STREAM_END) {
-        inflateEnd(&zs);
-        printf("inflate failed\n");
-        exit(1);
-    }
-
-    status = inflateEnd(&zs);
-    if (status != Z_OK) {
-        printf("inflateEnd failed\n");
-        exit(1);
-    }
-
-    return zs.total_out;
-}
-
-// opens the BAM file for reading
-void BamReader::BgzfOpen(const string& filename) {
-
-       m_BGZF->Stream = fopen(filename.c_str(), "rb");
-       if(!m_BGZF->Stream) {
-               printf("ERROR: Unable to open the BAM file %s for reading.\n", filename.c_str() );
-               exit(1);
-       }
-
-       m_BGZF->IsOpen = true;
-}
-
-// reads BGZF data into buffer
-unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) {
-
-    if (dataLength == 0) { return 0; }
-
-       char* output = data;
-       unsigned int numBytesRead = 0;
-       while (numBytesRead < dataLength) {
-
-        int bytesAvailable = m_BGZF->BlockLength - m_BGZF->BlockOffset;
-        if (bytesAvailable <= 0) {
-            if ( BgzfReadBlock() != 0 ) { return -1; }
-            bytesAvailable = m_BGZF->BlockLength - m_BGZF->BlockOffset;
-            if ( bytesAvailable <= 0 ) { break; }
-        }
-
-       char* buffer   = m_BGZF->UncompressedBlock;
-        int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
-        memcpy(output, buffer + m_BGZF->BlockOffset, copyLength);
-
-        m_BGZF->BlockOffset += copyLength;
-        output             += copyLength;
-        numBytesRead       += copyLength;
-    }
-
-    if ( m_BGZF->BlockOffset == m_BGZF->BlockLength ) {
-       m_BGZF->BlockAddress = ftello(m_BGZF->Stream);                                          
-        m_BGZF->BlockOffset  = 0;
-        m_BGZF->BlockLength  = 0;
-    }
-
-       return numBytesRead;
-}
-
-int BamReader::BgzfReadBlock(void) {
-
-    char    header[BLOCK_HEADER_LENGTH];
-    int64_t blockAddress = ftello(m_BGZF->Stream);
-
-    int count = fread(header, 1, sizeof(header), m_BGZF->Stream);
-       if (count == 0) {
-        m_BGZF->BlockLength = 0;
-        return 0;
-    }
-
-    if (count != sizeof(header)) {
-        printf("read block failed - count != sizeof(header)\n");
-        return -1;
-    }
-
-    if (!BgzfCheckBlockHeader(header)) {
-        printf("read block failed - CheckBgzfBlockHeader() returned false\n");
-        return -1;
-    }
-
-    int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1;
-    char* compressedBlock = m_BGZF->CompressedBlock;
-    memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
-    int remaining = blockLength - BLOCK_HEADER_LENGTH;
-
-    count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF->Stream);
-    if (count != remaining) {
-        printf("read block failed - count != remaining\n");
-        return -1;
-    }
-
-    count = BgzfInflateBlock(blockLength);
-    if (count < 0) { return -1; }
-
-    if (m_BGZF->BlockLength != 0) {
-        m_BGZF->BlockOffset = 0;
-    }
-
-    m_BGZF->BlockAddress = blockAddress;
-    m_BGZF->BlockLength  = count;
-    return 0;
-}
-
-// move file pointer to specified offset
-bool BamReader::BgzfSeek(int64_t position) {
-
-       int     blockOffset  = (position & 0xFFFF);
-       int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
-       if (fseeko(m_BGZF->Stream, blockAddress, SEEK_SET) != 0) {
-        printf("ERROR: Unable to seek in BAM file\n");
-               exit(1);
-       }
-
-       m_BGZF->BlockLength  = 0;
-       m_BGZF->BlockAddress = blockAddress;
-       m_BGZF->BlockOffset  = blockOffset;
-       return true;
-}
-
-// get file position in BAM file
-int64_t BamReader::BgzfTell(void) {
-       return ( (m_BGZF->BlockAddress << 16) | (m_BGZF->BlockOffset & 0xFFFF) );
-}
-
-int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
-
-       // get region boundaries
-       uint32_t begin = left;
-       uint32_t end   = m_references.at(refID).RefLength - 1;
-
-       // initialize list, bin '0' always a valid bin
-       int i = 0;
-       list[i++] = 0;
-
-       // get rest of bins that contain this region
-       unsigned int k;
-       for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { list[i++] = k; }
-       for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { list[i++] = k; }
-       for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { list[i++] = k; }
-       for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { list[i++] = k; }
-       for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
-       
-       // return number of bins stored
-       return i;
-}
-
-unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
-
-       // initialize alignment end to starting position
-       unsigned int alignEnd = position;
-
-       // iterate over cigar operations
-       vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
-       vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
-       for ( ; cigarIter != cigarEnd; ++cigarIter) {
-               char cigarType = (*cigarIter).Type;
-               if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
-                       alignEnd += (*cigarIter).Length;
-               }
-       }
-       return alignEnd;
-}
-
-void BamReader::ClearIndex(void) {
-
-       if ( m_index ) {
-               // iterate over references
-               vector<RefIndex*>::iterator refIter = m_index->begin();
-               vector<RefIndex*>::iterator refEnd  = m_index->end();
-               for ( ; refIter != refEnd; ++refIter) {
-                       RefIndex* aRef = (*refIter);
-                       if ( aRef ) {
-                               // clear out BAM bins
-                               if ( aRef->first ) {
-                                       BinVector::iterator binIter = (aRef->first)->begin();
-                                       BinVector::iterator binEnd  = (aRef->first)->end();
-                                       for ( ; binIter != binEnd; ++binIter ) {
-                                               ChunkVector* chunks = (*binIter).second;
-                                               if ( chunks ) { delete chunks; chunks = NULL;}
-                                       }
-                                       delete aRef->first;
-                                       aRef->first = NULL;
-                               }
-                               // clear BAM linear offsets
-                               if ( aRef->second ) { delete aRef->second; aRef->second = NULL; }
-                               delete aRef;
-                               aRef = NULL;
-                       }
-               }
-               delete m_index;
-               m_index = NULL;
-       }
-}
-
-// closes the BAM file
-void BamReader::Close(void) {
-       
-       if (m_BGZF!=NULL && m_BGZF->IsOpen) { 
-               BgzfClose();
-               delete m_BGZF;
-               m_BGZF = NULL; 
-       }       
-       ClearIndex();
-       m_headerText.clear();
-       m_isRegionSpecified = false;
-}
-
-const string BamReader::GetHeaderText(void) const {
-       return m_headerText;
-}
-
-const int BamReader::GetReferenceCount(void) const {
-       return m_references.size();
-}
-
-const RefVector BamReader::GetReferenceData(void) const {
-       return m_references;
-}
-
-const int BamReader::GetReferenceID(const string& refName) const {
-
-       // retrieve names from reference data
-       vector<string> refNames;
-       RefVector::const_iterator refIter = m_references.begin();
-    RefVector::const_iterator refEnd  = m_references.end();
-    for ( ; refIter != refEnd; ++refIter) {
-               refNames.push_back( (*refIter).RefName );
-    }
-
-       // return 'index-of' refName ( if not found, returns refNames.size() )
-       return Index( refNames.begin(), refNames.end(), refName );
-}
-
-// get next alignment (from specified region, if given)
-bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
-
-       // if valid alignment available
-       if ( LoadNextAlignment(bAlignment) ) {
-               
-               // if region not specified, return success
-               if ( !m_isRegionSpecified ) { return true; }
-               
-               // load next alignment until region overlap is found
-               while ( !IsOverlap(bAlignment) ) {
-                       // if no valid alignment available (likely EOF) return failure
-                       if ( !LoadNextAlignment(bAlignment) ) { return false; }
-               }
-               
-               // return success (alignment found that overlaps region)
-               return true;
-       } 
-       
-       // no valid alignment
-       else { return false; }
-}
-
-int64_t BamReader::GetOffset(int refID, unsigned int left) {
-
-       // calculate which bins overlap this region
-       uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
-       int numBins = BinsFromRegion(refID, left, bins);
-
-       // get bins for this reference
-       RefIndex* refIndex = m_index->at(refID);
-       BinVector* refBins = refIndex->first;
-
-       // get minimum offset to consider
-       LinearOffsetVector* linearOffsets = refIndex->second;
-       uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
-
-       // store offsets to beginning of alignment 'chunks' 
-       std::vector<int64_t> chunkStarts;
-       
-       // reference bin iterators
-       BinVector::const_iterator binIter;
-       BinVector::const_iterator binBegin = refBins->begin();
-       BinVector::const_iterator binEnd   = refBins->end();
-       
-       // store all alignment 'chunk' starts for bins in this region
-       for (int i = 0; i < numBins; ++i ) {
-               binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare<uint32_t, ChunkVector*>() );
-               if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) {
-                       ChunkVector* binChunks = (*binIter).second;
-                       ChunkVector::const_iterator chunkIter = binChunks->begin();
-                       ChunkVector::const_iterator chunkEnd  = binChunks->end();
-                       for ( ; chunkIter != chunkEnd; ++chunkIter) {
-                               if ( (*chunkIter).second > minOffset ) {
-                                       chunkStarts.push_back( (*chunkIter).first );
-                               }       
-                       }
-               }
-       }
-       
-       // clean up memory
-       free(bins);
-       
-       // if no alignments found
-       if ( chunkStarts.empty() ) { return -1; }
-       
-       // else return smallest offset for alignment starts 
-       else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
-}
-
-bool BamReader::IsOverlap(BamAlignment& bAlignment) {
-
-       // if on different reference sequence, quit
-       if ( bAlignment.RefID != m_currentRefID ) { return false; }
-
-       // read starts after left boundary
-       if ( bAlignment.Position >= (signed int) m_currentLeft) { return true; }
-
-       // return whether alignment end overlaps left boundary
-       return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
-}
-
-bool BamReader::Jump(int refID, unsigned int position) {
-
-       // if index available, and region is valid
-       if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (position <= m_references.at(refID).RefLength) ) { 
-               m_currentRefID = refID;
-               m_currentLeft  = position;
-               m_isRegionSpecified = true;
-               
-               int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
-               if ( offset == -1 ) { return false; }
-               else { return BgzfSeek(offset); }
-       }
-       return false;
-}
-
-void BamReader::LoadHeaderData(void) {
-       
-       // check to see if proper BAM header
-       char buffer[4];
-       if (BgzfRead(buffer, 4) != 4) { 
-               printf("Could not read header type\n");
-               exit(1); 
-       }
-
-       if (strncmp(buffer, "BAM\001", 4)) {
-               printf("wrong header type!\n");
-               exit(1);
-       }
-       
-       // get BAM header text length
-       BgzfRead(buffer, 4);
-       const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer);
-
-       // get BAM header text
-       char* headerText = (char*)calloc(headerTextLength + 1, 1);
-       BgzfRead(headerText, headerTextLength);
-       m_headerText = (string)((const char*)headerText);
-       
-       // clean up calloc-ed temp variable
-       free(headerText);
-}
-
-void BamReader::LoadIndexData(FILE* indexStream) {
-
-       // see if index is valid BAM index
-       char magic[4];
-       fread(magic, 1, 4, indexStream);
-       if (strncmp(magic, "BAI\1", 4)) {
-               printf("Problem with index file - invalid format.\n");
-               fclose(indexStream);
-               exit(1);
-       }
-
-       // get number of reference sequences
-       uint32_t numRefSeqs;
-       fread(&numRefSeqs, 4, 1, indexStream);
-       
-       // intialize BamIndex data structure
-       m_index = new BamIndex;
-       m_index->reserve(numRefSeqs);
-
-       // iterate over reference sequences
-       for (unsigned int i = 0; i < numRefSeqs; ++i) {
-               
-               // get number of bins for this reference sequence
-               int32_t numBins;
-               fread(&numBins, 4, 1, indexStream);
-               
-               if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
-
-               // intialize BinVector
-               BinVector* bins = new BinVector;
-               bins->reserve(numBins);
-               
-               // iterate over bins for that reference sequence
-               for (int j = 0; j < numBins; ++j) {
-                       
-                       // get binID 
-                       uint32_t binID;
-                       fread(&binID, 4, 1, indexStream);
-                       
-                       // get number of regionChunks in this bin
-                       uint32_t numChunks;
-                       fread(&numChunks, 4, 1, indexStream);
-                       
-                       // intialize ChunkVector
-                       ChunkVector* regionChunks = new ChunkVector;
-                       regionChunks->reserve(numChunks);
-                       
-                       // iterate over regionChunks in this bin
-                       for (unsigned int k = 0; k < numChunks; ++k) {
-                               
-                               // get chunk boundaries (left, right) 
-                               uint64_t left;
-                               uint64_t right;
-                               fread(&left, 8, 1, indexStream);
-                               fread(&right, 8, 1, indexStream);
-                               
-                               // save ChunkPair
-                               regionChunks->push_back( ChunkPair(left, right) );
-                       }
-                       
-                       // sort chunks for this bin
-                       sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare<uint64_t, uint64_t>() );
-
-                       // save binID, chunkVector for this bin
-                       bins->push_back( BamBin(binID, regionChunks) );
-               }
-               
-               // sort bins by binID
-               sort(bins->begin(), bins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
-
-               // load linear index for this reference sequence
-               
-               // get number of linear offsets
-               int32_t numLinearOffsets;
-               fread(&numLinearOffsets, 4, 1, indexStream);
-               
-               // intialize LinearOffsetVector
-               LinearOffsetVector* linearOffsets = new LinearOffsetVector;
-               linearOffsets->reserve(numLinearOffsets);
-               
-               // iterate over linear offsets for this reference sequeence
-               for (int j = 0; j < numLinearOffsets; ++j) {
-                       // get a linear offset
-                       uint64_t linearOffset;
-                       fread(&linearOffset, 8, 1, indexStream);
-                       // store linear offset
-                       linearOffsets->push_back(linearOffset);
-               }
-               
-               // sort linear offsets
-               sort( linearOffsets->begin(), linearOffsets->end() );
-
-               // store index data for that reference sequence
-               m_index->push_back( new RefIndex(bins, linearOffsets) );
-       }
-       
-       // close index file (.bai) and return
-       fclose(indexStream);
-}
-
-bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
-
-       // read in the 'block length' value, make sure it's not zero
-       char buffer[4];
-       BgzfRead(buffer, 4);
-       const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer);
-       if ( blockLength == 0 ) { return false; }
-
-       // keep track of bytes read as method progresses
-       int bytesRead = 4;
-
-       // read in core alignment data, make sure the right size of data was read 
-       char x[BAM_CORE_SIZE];
-       if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
-       bytesRead += BAM_CORE_SIZE;
-
-       // set BamAlignment 'core' data and character data lengths
-       unsigned int tempValue;
-       unsigned int queryNameLength;
-       unsigned int numCigarOperations;
-       unsigned int querySequenceLength;
-
-       //bAlignment.RefID    = BgzfUnpackUnsignedInt(&x[0]);
-       //bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]);
-       bAlignment.RefID    = BgzfUnpackSignedInt(&x[0]);
-       bAlignment.Position = BgzfUnpackSignedInt(&x[4]);
-
-       tempValue             = BgzfUnpackUnsignedInt(&x[8]);           
-       bAlignment.Bin        = tempValue >> 16;
-       bAlignment.MapQuality = tempValue >> 8 & 0xff;
-       queryNameLength       = tempValue & 0xff;
-
-       tempValue                = BgzfUnpackUnsignedInt(&x[12]);       
-       bAlignment.AlignmentFlag = tempValue >> 16;
-       numCigarOperations       = tempValue & 0xffff;
-
-       querySequenceLength     = BgzfUnpackUnsignedInt(&x[16]);
-       //bAlignment.MateRefID    = BgzfUnpackUnsignedInt(&x[20]);
-       //bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]);
-       //bAlignment.InsertSize   = BgzfUnpackUnsignedInt(&x[28]);
-       bAlignment.MateRefID    = BgzfUnpackSignedInt(&x[20]);
-       bAlignment.MatePosition = BgzfUnpackSignedInt(&x[24]);
-       bAlignment.InsertSize   = BgzfUnpackSignedInt(&x[28]);
-
-       // calculate lengths/offsets
-       const unsigned int dataLength      = blockLength - BAM_CORE_SIZE;
-       const unsigned int cigarDataOffset = queryNameLength;
-       const unsigned int seqDataOffset   = cigarDataOffset + (numCigarOperations * 4);
-       const unsigned int qualDataOffset  = seqDataOffset + (querySequenceLength+1)/2;
-       const unsigned int tagDataOffset   = qualDataOffset + querySequenceLength;
-       const unsigned int tagDataLen      = dataLength - tagDataOffset;
-       
-       // set up destination buffers for character data
-       char* allCharData   = (char*)calloc(sizeof(char), dataLength);
-       uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
-       char* seqData       = ((char*)allCharData) + seqDataOffset;
-       char* qualData      = ((char*)allCharData) + qualDataOffset;
-       char* tagData       = ((char*)allCharData) + tagDataOffset;
-       
-       // get character data - make sure proper data size was read
-       if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; }
-       else {
-               
-               bytesRead += dataLength;
-               
-               // clear out any previous string data
-               bAlignment.Name.clear();
-               bAlignment.QueryBases.clear();
-               bAlignment.Qualities.clear();
-               bAlignment.AlignedBases.clear();
-               bAlignment.CigarData.clear();
-               bAlignment.TagData.clear();
-               
-               // save name
-               bAlignment.Name = (string)((const char*)(allCharData));
-               
-               // save query sequence
-               for (unsigned int i = 0; i < querySequenceLength; ++i) {        
-                       char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
-                       bAlignment.QueryBases.append( 1, singleBase );
-               }
-               
-               // save sequence length
-               bAlignment.Length = bAlignment.QueryBases.length();
-               
-               // save qualities
-               for (unsigned int i = 0; i < querySequenceLength; ++i) { 
-                       char singleQuality = (char)(qualData[i]+33);                    // conversion from QV to FASTQ character
-                       bAlignment.Qualities.append( 1, singleQuality );
-               }
-               
-               // save CIGAR-related data;
-               int k = 0;
-               for (unsigned int i = 0; i < numCigarOperations; ++i) {
-                       
-                       // build CigarOp struct
-                       CigarOp op;
-                       op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
-                       op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
-                       
-                       // save CigarOp
-                       bAlignment.CigarData.push_back(op);
-                       
-                       // build AlignedBases string
-                       switch (op.Type) {
-                               
-                               case ('M') : 
-                               case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) );      // for 'M', 'I' - write bases
-                               case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases
-                                                        break;
-                                                        
-                               case ('D') : bAlignment.AlignedBases.append( op.Length, '-' );  // for 'D' - write gap character
-                                                        break;
-                                                       
-                               case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'P' - write padding character;
-                                                        break;
-                                                        
-                               case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence
-                                                        k += op.Length;
-                                                        break;
-                                                        
-                               case ('H') : break;                                                                                     // for 'H' - do nothing, move to next op
-                                                        
-                               default    : printf("ERROR: Invalid Cigar op type\n");                  // shouldn't get here
-                                                        exit(1);
-                       }
-               }
-               
-               // read in the tag data
-               bAlignment.TagData.resize(tagDataLen);
-               memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
-       }
-
-       free(allCharData);
-       return true;
-}
-
-void BamReader::LoadReferenceData(void) {
-
-       // get number of reference sequences
-       char buffer[4];
-       BgzfRead(buffer, 4);
-       const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer);
-       if (numberRefSeqs == 0) { return; }
-       m_references.reserve((int)numberRefSeqs);
-       
-       // iterate over all references in header
-       for (unsigned int i = 0; i != numberRefSeqs; ++i) {
-
-               // get length of reference name
-               BgzfRead(buffer, 4);
-               const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer);
-               char* refName = (char*)calloc(refNameLength, 1);
-               
-               // get reference name and reference sequence length
-               BgzfRead(refName, refNameLength);
-               BgzfRead(buffer, 4);
-               const unsigned int refLength = BgzfUnpackUnsignedInt(buffer);
-               
-               // store data for reference
-               RefData aReference;
-               aReference.RefName   = (string)((const char*)refName);
-               aReference.RefLength = refLength;
-               m_references.push_back(aReference);
-               
-               // clean up calloc-ed temp variable
-               free(refName);
-       }
-}
-
-// opens BAM file (and index)
-void BamReader::Open(const string& filename, const string& indexFilename) {
-
-       // open the BGZF file for reading, retrieve header text & reference data
-       m_BGZF = new BgzfData;
-       BgzfOpen(filename);
-       LoadHeaderData();       
-       LoadReferenceData();
-
-       // store file offset of first alignment
-       m_alignmentsBeginOffset = BgzfTell();
-
-       // open index file & load index data (if exists)
-       OpenIndex(indexFilename);
-}
-
-void BamReader::OpenIndex(const string& indexFilename) {
-
-       // if index file exists
-       if (!indexFilename.empty()) {
-
-               // open index
-               FILE* indexStream = fopen(indexFilename.c_str(), "rb");
-               
-               // abort on error
-               if(!indexStream) {
-                       printf("ERROR: Unable to open the BAM index file %s for reading.\n", indexFilename.c_str() );
-                       exit(1);
-               }
-       
-               // build up index data structure
-               LoadIndexData(indexStream);
-       }
-}
-
-bool BamReader::Rewind(void) {
-
-       // find first reference that has alignments in the BAM file
-       int refID = 0;
-       int refCount = m_references.size();
-       for ( ; refID < refCount; ++refID ) {
-               if ( m_references.at(refID).RefHasAlignments ) { break; } 
-       }
-
-       // store default bounds for first alignment
-       m_currentRefID = refID;
-       m_currentLeft = 0;
-       m_isRegionSpecified = false;
-
-       // return success/failure of seek
-       return BgzfSeek(m_alignmentsBeginOffset);
-}      
+// ***************************************************************************\r
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 15 July 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading BAM files\r
+// ***************************************************************************\r
+\r
+// C++ includes\r
+#include <algorithm>\r
+#include <iterator>\r
+#include <string>\r
+#include <vector>\r
+#include <iostream>\r
+\r
+// BamTools includes\r
+#include "BGZF.h"\r
+#include "BamReader.h"\r
+#include "BamIndex.h"\r
+using namespace BamTools;\r
+using namespace std;\r
+\r
+struct BamReader::BamReaderPrivate {\r
+\r
+    // -------------------------------\r
+    // structs, enums, typedefs\r
+    // -------------------------------\r
+    enum RegionState { BEFORE_REGION = 0\r
+                     , WITHIN_REGION\r
+                     , AFTER_REGION\r
+                     };\r
+\r
+    // -------------------------------\r
+    // data members\r
+    // -------------------------------\r
+\r
+    // general file data\r
+    BgzfData  mBGZF;\r
+    string    HeaderText;\r
+    //BamIndex  Index;\r
+    BamIndex* NewIndex;\r
+    RefVector References;\r
+    bool      IsIndexLoaded;\r
+    int64_t   AlignmentsBeginOffset;\r
+    string    Filename;\r
+    string    IndexFilename;\r
+    \r
+    // system data\r
+    bool IsBigEndian;\r
+\r
+    // user-specified region values\r
+    BamRegion Region;\r
+    bool IsLeftBoundSpecified;\r
+    bool IsRightBoundSpecified;\r
+    \r
+    bool IsRegionSpecified;\r
+    int  CurrentRefID;\r
+    int  CurrentLeft;\r
+\r
+    // parent BamReader\r
+    BamReader* Parent;\r
+    \r
+    // BAM character constants\r
+    const char* DNA_LOOKUP;\r
+    const char* CIGAR_LOOKUP;\r
+\r
+    // -------------------------------\r
+    // constructor & destructor\r
+    // -------------------------------\r
+    BamReaderPrivate(BamReader* parent);\r
+    ~BamReaderPrivate(void);\r
+\r
+    // -------------------------------\r
+    // "public" interface\r
+    // -------------------------------\r
+\r
+    // file operations\r
+    void Close(void);\r
+    bool Jump(int refID, int position = 0);\r
+    bool Open(const string& filename, const string& indexFilename = "");\r
+    bool Rewind(void);\r
+    bool SetRegion(const BamRegion& region);\r
+\r
+    // access alignment data\r
+    bool GetNextAlignment(BamAlignment& bAlignment);\r
+    bool GetNextAlignmentCore(BamAlignment& bAlignment);\r
+\r
+    // access auxiliary data\r
+    int GetReferenceID(const string& refName) const;\r
+\r
+    // index operations\r
+    bool CreateIndex(bool useDefaultIndex);\r
+\r
+    // -------------------------------\r
+    // internal methods\r
+    // -------------------------------\r
+\r
+    // *** reading alignments and auxiliary data *** //\r
+\r
+    // fills out character data for BamAlignment data\r
+    bool BuildCharData(BamAlignment& bAlignment);\r
+    // checks to see if alignment overlaps current region\r
+    RegionState IsOverlap(BamAlignment& bAlignment);\r
+    // retrieves header text from BAM file\r
+    void LoadHeaderData(void);\r
+    // retrieves BAM alignment under file pointer\r
+    bool LoadNextAlignment(BamAlignment& bAlignment);\r
+    // builds reference data structure from BAM file\r
+    void LoadReferenceData(void);\r
+\r
+    // *** index file handling *** //\r
+\r
+    // clear out inernal index data structure\r
+    void ClearIndex(void);\r
+    // loads index from BAM index file\r
+    bool LoadIndex(void);\r
+};\r
+\r
+// -----------------------------------------------------\r
+// BamReader implementation (wrapper around BRPrivate)\r
+// -----------------------------------------------------\r
+// constructor\r
+BamReader::BamReader(void) {\r
+    d = new BamReaderPrivate(this);\r
+}\r
+\r
+// destructor\r
+BamReader::~BamReader(void) {\r
+    delete d;\r
+    d = 0;\r
+}\r
+\r
+// file operations\r
+void BamReader::Close(void) { d->Close(); }\r
+bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; }\r
+bool BamReader::Jump(int refID, int position) { \r
+    d->Region.LeftRefID = refID;\r
+    d->Region.LeftPosition = position;\r
+    d->IsLeftBoundSpecified = true;\r
+    d->IsRightBoundSpecified = false;\r
+    return d->Jump(refID, position); \r
+}\r
+bool BamReader::Open(const string& filename, const string& indexFilename) { return d->Open(filename, indexFilename); }\r
+bool BamReader::Rewind(void) { return d->Rewind(); }\r
+bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); }\r
+bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) {\r
+    return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) );\r
+}\r
+\r
+// access alignment data\r
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); }\r
+bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); }\r
+\r
+// access auxiliary data\r
+const string BamReader::GetHeaderText(void) const { return d->HeaderText; }\r
+int BamReader::GetReferenceCount(void) const { return d->References.size(); }\r
+const RefVector& BamReader::GetReferenceData(void) const { return d->References; }\r
+int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }\r
+const std::string BamReader::GetFilename(void) const { return d->Filename; }\r
+\r
+// index operations\r
+bool BamReader::CreateIndex(bool useDefaultIndex) { return d->CreateIndex(useDefaultIndex); }\r
+\r
+// -----------------------------------------------------\r
+// BamReaderPrivate implementation\r
+// -----------------------------------------------------\r
+\r
+// constructor\r
+BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent)\r
+    : NewIndex(0)\r
+    , IsIndexLoaded(false)\r
+    , AlignmentsBeginOffset(0)\r
+    , IsLeftBoundSpecified(false)\r
+    , IsRightBoundSpecified(false)\r
+    , IsRegionSpecified(false)\r
+    , CurrentRefID(0)\r
+    , CurrentLeft(0)\r
+    , Parent(parent)\r
+    , DNA_LOOKUP("=ACMGRSVTWYHKDBN")\r
+    , CIGAR_LOOKUP("MIDNSHP")\r
+{ \r
+    IsBigEndian = SystemIsBigEndian();\r
+}\r
+\r
+// destructor\r
+BamReader::BamReaderPrivate::~BamReaderPrivate(void) {\r
+    Close();\r
+}\r
+\r
+bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {\r
+  \r
+    // calculate character lengths/offsets\r
+    const unsigned int dataLength      = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
+    const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;\r
+    const unsigned int seqDataOffset   = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);\r
+    const unsigned int qualDataOffset  = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;\r
+    const unsigned int tagDataOffset   = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;\r
+    const unsigned int tagDataLength   = dataLength - tagDataOffset;\r
+      \r
+    // set up char buffers\r
+    const char*     allCharData = bAlignment.SupportData.AllCharData.data();\r
+          uint32_t* cigarData   = (uint32_t*)(allCharData + cigarDataOffset);\r
+    const char*     seqData     = ((const char*)allCharData) + seqDataOffset;\r
+    const char*     qualData    = ((const char*)allCharData) + qualDataOffset;\r
+          char*     tagData     = ((char*)allCharData) + tagDataOffset;\r
+  \r
+    // store alignment name (depends on null char as terminator)\r
+    bAlignment.Name.assign((const char*)(allCharData));    \r
+          \r
+    // save CigarOps \r
+    CigarOp op;\r
+    bAlignment.CigarData.clear();\r
+    bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);\r
+    for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {\r
+\r
+        // swap if necessary\r
+        if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
+      \r
+        // build CigarOp structure\r
+        op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
+        op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
+\r
+        // save CigarOp\r
+        bAlignment.CigarData.push_back(op);\r
+    }\r
+          \r
+          \r
+    // save query sequence\r
+    bAlignment.QueryBases.clear();\r
+    bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+    for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
+        char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
+        bAlignment.QueryBases.append(1, singleBase);\r
+    }\r
+  \r
+    // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character\r
+    bAlignment.Qualities.clear();\r
+    bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+    for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
+        char singleQuality = (char)(qualData[i]+33);\r
+        bAlignment.Qualities.append(1, singleQuality);\r
+    }\r
+    \r
+    // if QueryBases is empty (and this is a allowed case)\r
+    if ( bAlignment.QueryBases.empty() ) \r
+        bAlignment.AlignedBases = bAlignment.QueryBases;\r
+    \r
+    // if QueryBases contains data, then build AlignedBases using CIGAR data\r
+    else {\r
+    \r
+        // resize AlignedBases\r
+        bAlignment.AlignedBases.clear();\r
+        bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+      \r
+        // iterate over CigarOps\r
+        int k = 0;\r
+        vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
+        vector<CigarOp>::const_iterator cigarEnd  = bAlignment.CigarData.end();\r
+        for ( ; cigarIter != cigarEnd; ++cigarIter ) {\r
+            \r
+            const CigarOp& op = (*cigarIter);\r
+            switch(op.Type) {\r
+              \r
+                case ('M') :\r
+                case ('I') :\r
+                    bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases\r
+                    // fall through\r
+                \r
+                case ('S') :\r
+                    k += op.Length;                                     // for 'S' - soft clip, skip over query bases\r
+                    break;\r
+                    \r
+                case ('D') :\r
+                    bAlignment.AlignedBases.append(op.Length, '-');     // for 'D' - write gap character\r
+                    break;\r
+                    \r
+                case ('P') :\r
+                    bAlignment.AlignedBases.append( op.Length, '*' );   // for 'P' - write padding character\r
+                    break;\r
+                    \r
+                case ('N') :\r
+                    bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in original query sequence\r
+                    break;\r
+                    \r
+                case ('H') :\r
+                    break;  // for 'H' - hard clip, do nothing to AlignedBases, move to next op\r
+                    \r
+                default:\r
+                    printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
+                    exit(1);\r
+            }\r
+        }\r
+    }\r
\r
+    // -----------------------\r
+    // Added: 3-25-2010 DB\r
+    // Fixed: endian-correctness for tag data\r
+    // -----------------------\r
+    if ( IsBigEndian ) {\r
+        int i = 0;\r
+        while ( (unsigned int)i < tagDataLength ) {\r
+          \r
+            i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
+            uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning \r
+            ++i;                                    // skip value type\r
+    \r
+            switch (type) {\r
+                \r
+                case('A') :\r
+                case('C') : \r
+                    ++i;\r
+                    break;\r
+\r
+                case('S') : \r
+                    SwapEndian_16p(&tagData[i]); \r
+                    i += sizeof(uint16_t);\r
+                    break;\r
+                    \r
+                case('F') :\r
+                case('I') : \r
+                    SwapEndian_32p(&tagData[i]);\r
+                    i += sizeof(uint32_t);\r
+                    break;\r
+                \r
+                case('D') : \r
+                    SwapEndian_64p(&tagData[i]);\r
+                    i += sizeof(uint64_t);\r
+                    break;\r
+                \r
+                case('H') :\r
+                case('Z') : \r
+                    while (tagData[i]) { ++i; }\r
+                    ++i; // increment one more for null terminator\r
+                    break;\r
+                \r
+                default : \r
+                    printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+                    exit(1);\r
+            }\r
+        }\r
+    }\r
+    \r
+    // store TagData\r
+    bAlignment.TagData.clear();\r
+    bAlignment.TagData.resize(tagDataLength);\r
+    memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);\r
+    \r
+    // clear the core-only flag\r
+    bAlignment.SupportData.HasCoreOnly = false;\r
+    \r
+    // return success\r
+    return true;\r
+}\r
+\r
+// clear index data structure\r
+void BamReader::BamReaderPrivate::ClearIndex(void) {\r
+    delete NewIndex;\r
+    NewIndex = 0;\r
+}\r
+\r
+// closes the BAM file\r
+void BamReader::BamReaderPrivate::Close(void) {\r
+    \r
+    // close BGZF file stream\r
+    mBGZF.Close();\r
+    \r
+    // clear out index data\r
+    ClearIndex();\r
+    \r
+    // clear out header data\r
+    HeaderText.clear();\r
+    \r
+    // clear out region flags\r
+    IsLeftBoundSpecified  = false;\r
+    IsRightBoundSpecified = false;\r
+    IsRegionSpecified     = false;\r
+}\r
+\r
+// create BAM index from BAM file (keep structure in memory) and write to default index output file\r
+bool BamReader::BamReaderPrivate::CreateIndex(bool useDefaultIndex) {\r
+\r
+    // clear out prior index data\r
+    ClearIndex();\r
+    \r
+    // create default index\r
+    if ( useDefaultIndex )\r
+        NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
+    // create BamTools 'custom' index\r
+    else\r
+        NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
+    \r
+    bool ok = true;\r
+    ok &= NewIndex->Build();\r
+    ok &= NewIndex->Write(Filename); \r
+    \r
+    // return success/fail\r
+    return ok;\r
+}\r
+\r
+// get next alignment (from specified region, if given)\r
+bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {\r
+\r
+    // if valid alignment found, attempt to parse char data, and return success/failure\r
+    if ( GetNextAlignmentCore(bAlignment) )\r
+        return BuildCharData(bAlignment);\r
+    \r
+    // no valid alignment found\r
+    else\r
+        return false;\r
+}\r
+\r
+// retrieves next available alignment core data (returns success/fail)\r
+// ** DOES NOT parse any character data (read name, bases, qualities, tag data)\r
+//    these can be accessed, if necessary, from the supportData \r
+// useful for operations requiring ONLY positional or other alignment-related information\r
+bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {\r
+\r
+    // if valid alignment available\r
+    if ( LoadNextAlignment(bAlignment) ) {\r
+\r
+        // set core-only flag\r
+        bAlignment.SupportData.HasCoreOnly = true;\r
+      \r
+        // if region not specified, return success\r
+        if ( !IsLeftBoundSpecified ) return true;\r
+\r
+        // determine region state (before, within, after)\r
+        BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment);\r
+      \r
+        // if alignment lies after region, return false\r
+        if ( state == AFTER_REGION ) \r
+            return false;\r
+\r
+        while ( state != WITHIN_REGION ) {\r
+            // if no valid alignment available (likely EOF) return failure\r
+            if ( !LoadNextAlignment(bAlignment) ) return false;\r
+            // if alignment lies after region, return false (no available read within region)\r
+            state = IsOverlap(bAlignment);\r
+            if ( state == AFTER_REGION) return false;\r
+            \r
+        }\r
+\r
+        // return success (alignment found that overlaps region)\r
+        return true;\r
+    }\r
+\r
+    // no valid alignment\r
+    else\r
+        return false;\r
+}\r
+\r
+// returns RefID for given RefName (returns References.size() if not found)\r
+int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {\r
+\r
+    // retrieve names from reference data\r
+    vector<string> refNames;\r
+    RefVector::const_iterator refIter = References.begin();\r
+    RefVector::const_iterator refEnd  = References.end();\r
+    for ( ; refIter != refEnd; ++refIter) {\r
+        refNames.push_back( (*refIter).RefName );\r
+    }\r
+\r
+    // return 'index-of' refName ( if not found, returns refNames.size() )\r
+    return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));\r
+}\r
+\r
+// returns region state - whether alignment ends before, overlaps, or starts after currently specified region\r
+// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true\r
+BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {\r
+    \r
+    // --------------------------------------------------\r
+    // check alignment start against right bound cutoff\r
+    \r
+    // if full region of interest was given\r
+    if ( IsRightBoundSpecified ) {\r
+      \r
+        // read starts on right bound reference, but AFTER right bound position\r
+        if ( bAlignment.RefID == Region.RightRefID && bAlignment.Position > Region.RightPosition )\r
+            return AFTER_REGION;\r
+      \r
+        // if read starts on reference AFTER right bound, return false\r
+        if ( bAlignment.RefID > Region.RightRefID ) \r
+            return AFTER_REGION;\r
+    }\r
+  \r
+    // --------------------------------------------------------\r
+    // no right bound given OR read starts before right bound\r
+    // so, check if it overlaps left bound \r
+  \r
+    // if read starts on left bound reference AND after left boundary, return success\r
+    if ( bAlignment.RefID == Region.LeftRefID && bAlignment.Position >= Region.LeftPosition)\r
+        return WITHIN_REGION;\r
+  \r
+    // if read is on any reference sequence before left bound, return false\r
+    if ( bAlignment.RefID < Region.LeftRefID )\r
+        return BEFORE_REGION;\r
+\r
+    // --------------------------------------------------------\r
+    // read is on left bound reference, but starts before left bound position\r
+\r
+    // if it overlaps, return WITHIN_REGION\r
+    if ( bAlignment.GetEndPosition() >= Region.LeftPosition )\r
+        return WITHIN_REGION;\r
+    // else begins before left bound position\r
+    else\r
+        return BEFORE_REGION;\r
+}\r
+\r
+// jumps to specified region(refID, leftBound) in BAM file, returns success/fail\r
+bool BamReader::BamReaderPrivate::Jump(int refID, int position) {\r
+\r
+    // -----------------------------------------------------------------------\r
+    // check for existing index \r
+    if ( NewIndex == 0 ) return false; \r
+    // see if reference has alignments\r
+    if ( !NewIndex->HasAlignments(refID) ) return false; \r
+    // make sure position is valid\r
+    if ( position > References.at(refID).RefLength ) return false;\r
+    \r
+    // determine possible offsets\r
+    vector<int64_t> offsets;\r
+    if ( !NewIndex->GetOffsets(Region, IsRightBoundSpecified, offsets) ) {\r
+        printf("ERROR: Could not jump: unable to calculate offset for specified region.\n");\r
+        return false;\r
+    }\r
+      \r
+    // iterate through offsets\r
+    BamAlignment bAlignment;\r
+    bool result = true;\r
+    for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {\r
+        \r
+        // attempt seek & load first available alignment\r
+        result &= mBGZF.Seek(*o);\r
+        LoadNextAlignment(bAlignment);\r
+        \r
+        // if this alignment corresponds to desired position\r
+        // return success of seeking back to 'current offset'\r
+        if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) ) {\r
+            if ( o != offsets.begin() ) --o;\r
+            return mBGZF.Seek(*o);\r
+        }\r
+    }\r
+    \r
+    return result;\r
+}\r
+\r
+// load BAM header data\r
+void BamReader::BamReaderPrivate::LoadHeaderData(void) {\r
+\r
+    // check to see if proper BAM header\r
+    char buffer[4];\r
+    if (mBGZF.Read(buffer, 4) != 4) {\r
+        printf("Could not read header type\n");\r
+        exit(1);\r
+    }\r
+\r
+    if (strncmp(buffer, "BAM\001", 4)) {\r
+        printf("wrong header type!\n");\r
+        exit(1);\r
+    }\r
+\r
+    // get BAM header text length\r
+    mBGZF.Read(buffer, 4);\r
+    unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
+    if ( IsBigEndian ) { SwapEndian_32(headerTextLength); }\r
+    \r
+    // get BAM header text\r
+    char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
+    mBGZF.Read(headerText, headerTextLength);\r
+    HeaderText = (string)((const char*)headerText);\r
+\r
+    // clean up calloc-ed temp variable\r
+    free(headerText);\r
+}\r
+\r
+// load existing index data from BAM index file (".bai"), return success/fail\r
+bool BamReader::BamReaderPrivate::LoadIndex(void) {\r
+\r
+    // clear out any existing index data\r
+    ClearIndex();\r
+\r
+    // skip if index file empty\r
+    if ( IndexFilename.empty() )\r
+        return false;\r
+\r
+    // check supplied filename for index type\r
+    size_t defaultExtensionFound = IndexFilename.find(".bai");\r
+    size_t customExtensionFound  = IndexFilename.find(".bti");\r
+    \r
+    // if SAM/BAM default (".bai")\r
+    if ( defaultExtensionFound != string::npos )\r
+        NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
+    \r
+    // if BamTools custom index (".bti")\r
+    else if ( customExtensionFound != string::npos )\r
+        NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
+    \r
+    // else unknown\r
+    else {\r
+        printf("ERROR: Unknown index file extension.\n");\r
+        return false;\r
+    }\r
+    \r
+    // return success of loading index data\r
+    return NewIndex->Load(IndexFilename);\r
+}\r
+\r
+// populates BamAlignment with alignment data under file pointer, returns success/fail\r
+bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {\r
+\r
+    // read in the 'block length' value, make sure it's not zero\r
+    char buffer[4];\r
+    mBGZF.Read(buffer, 4);\r
+    bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
+    if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }\r
+    if ( bAlignment.SupportData.BlockLength == 0 ) { return false; }\r
+\r
+    // read in core alignment data, make sure the right size of data was read\r
+    char x[BAM_CORE_SIZE];\r
+    if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
+\r
+    if ( IsBigEndian ) {\r
+        for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { \r
+          SwapEndian_32p(&x[i]); \r
+        }\r
+    }\r
+    \r
+    // set BamAlignment 'core' and 'support' data\r
+    bAlignment.RefID    = BgzfData::UnpackSignedInt(&x[0]);  \r
+    bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);\r
+    \r
+    unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);\r
+    bAlignment.Bin        = tempValue >> 16;\r
+    bAlignment.MapQuality = tempValue >> 8 & 0xff;\r
+    bAlignment.SupportData.QueryNameLength = tempValue & 0xff;\r
+\r
+    tempValue = BgzfData::UnpackUnsignedInt(&x[12]);\r
+    bAlignment.AlignmentFlag = tempValue >> 16;\r
+    bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;\r
+\r
+    bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
+    bAlignment.MateRefID    = BgzfData::UnpackSignedInt(&x[20]);\r
+    bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
+    bAlignment.InsertSize   = BgzfData::UnpackSignedInt(&x[28]);\r
+    \r
+    // set BamAlignment length\r
+    bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;\r
+    \r
+    // read in character data - make sure proper data size was read\r
+    bool readCharDataOK = false;\r
+    const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
+    char* allCharData = (char*)calloc(sizeof(char), dataLength);\r
+    \r
+    if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { \r
+      \r
+        // store 'allCharData' in supportData structure\r
+        bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);\r
+        \r
+        // set success flag\r
+        readCharDataOK = true;\r
+    }\r
+\r
+    free(allCharData);\r
+    return readCharDataOK;\r
+}\r
+\r
+// loads reference data from BAM file\r
+void BamReader::BamReaderPrivate::LoadReferenceData(void) {\r
+\r
+    // get number of reference sequences\r
+    char buffer[4];\r
+    mBGZF.Read(buffer, 4);\r
+    unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
+    if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); }\r
+    if (numberRefSeqs == 0) { return; }\r
+    References.reserve((int)numberRefSeqs);\r
+\r
+    // iterate over all references in header\r
+    for (unsigned int i = 0; i != numberRefSeqs; ++i) {\r
+\r
+        // get length of reference name\r
+        mBGZF.Read(buffer, 4);\r
+        unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
+        if ( IsBigEndian ) { SwapEndian_32(refNameLength); }\r
+        char* refName = (char*)calloc(refNameLength, 1);\r
+\r
+        // get reference name and reference sequence length\r
+        mBGZF.Read(refName, refNameLength);\r
+        mBGZF.Read(buffer, 4);\r
+        int refLength = BgzfData::UnpackSignedInt(buffer);\r
+        if ( IsBigEndian ) { SwapEndian_32(refLength); }\r
+\r
+        // store data for reference\r
+        RefData aReference;\r
+        aReference.RefName   = (string)((const char*)refName);\r
+        aReference.RefLength = refLength;\r
+        References.push_back(aReference);\r
+\r
+        // clean up calloc-ed temp variable\r
+        free(refName);\r
+    }\r
+}\r
+\r
+// opens BAM file (and index)\r
+bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) {\r
+\r
+    Filename = filename;\r
+    IndexFilename = indexFilename;\r
+\r
+    // open the BGZF file for reading, return false on failure\r
+    if ( !mBGZF.Open(filename, "rb") ) \r
+        return false;\r
+    \r
+    // retrieve header text & reference data\r
+    LoadHeaderData();\r
+    LoadReferenceData();\r
+\r
+    // store file offset of first alignment\r
+    AlignmentsBeginOffset = mBGZF.Tell();\r
+\r
+    // open index file & load index data (if exists)\r
+    if ( !IndexFilename.empty() )\r
+        LoadIndex();\r
+    \r
+    // return success\r
+    return true;\r
+}\r
+\r
+// returns BAM file pointer to beginning of alignment data\r
+bool BamReader::BamReaderPrivate::Rewind(void) {\r
+   \r
+    // rewind to first alignment\r
+    if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false;\r
+  \r
+    // retrieve first alignment data\r
+    BamAlignment al;\r
+    if ( !LoadNextAlignment(al) ) return false;\r
+      \r
+    // reset default region info using first alignment in file\r
+    Region.LeftRefID      = al.RefID;\r
+    Region.LeftPosition   = al.Position;\r
+    Region.RightRefID     = -1;\r
+    Region.RightPosition  = -1;\r
+    IsLeftBoundSpecified  = false;\r
+    IsRightBoundSpecified = false; \r
+\r
+    // rewind back to before first alignment\r
+    // return success/fail of seek\r
+    return mBGZF.Seek(AlignmentsBeginOffset);\r
+}\r
+\r
+// sets a region of interest (with left & right bound reference/position)\r
+// attempts a Jump() to left bound as well\r
+// returns success/failure of Jump()\r
+bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) {\r
+    \r
+    // save region of interest\r
+    Region = region;\r
+    \r
+    // set flags\r
+    if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) \r
+        IsLeftBoundSpecified = true;\r
+    if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) \r
+        IsRightBoundSpecified = true;\r
+    \r
+    // attempt jump to beginning of region, return success/fail of Jump()\r
+    return Jump( Region.LeftRefID, Region.LeftPosition );\r
+}\r