]> git.donarmstrong.com Git - bamtools.git/commitdiff
Added tag data support.
authormikaels <mikaels@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Sat, 11 Apr 2009 03:32:53 +0000 (03:32 +0000)
committermikaels <mikaels@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Sat, 11 Apr 2009 03:32:53 +0000 (03:32 +0000)
git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@11 9efb377e-2e27-44b9-b91a-ec4abb80ed8b

BamReader.cpp
BamWriter.cpp

index 8fba1a920f7ebb81d62c07cac1168bdaf8627330..4b50d7e2348c9a59b2b33cd3e247b059f6c0f614 100644 (file)
-// BamReader.cpp\r
-\r
-// Derek Barnett\r
-// Marth Lab, Boston College\r
-// Last modified: 6 April 2009\r
-\r
-#include "BamReader.h"\r
-#include <iostream>\r
-using std::cerr;\r
-using std::endl;\r
-\r
-// static character constants\r
-const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";\r
-const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";\r
-\r
-BamReader::BamReader(const char* filename, const char* indexFilename) \r
-       : m_filename( (char*)filename )\r
-       , m_indexFilename( (char*)indexFilename )\r
-       , m_file(0)\r
-       , m_index(NULL)\r
-       , m_headerText("")\r
-       , m_isOpen(false)\r
-       , m_isIndexLoaded(false)\r
-       , m_isRegionSpecified(false)\r
-       , m_isCalculateAlignedBases(true)\r
-       , m_currentRefID(0)\r
-       , m_currentLeft(0)\r
-       , m_alignmentsBeginOffset(0)\r
-{\r
-       Open();\r
-}\r
-\r
-BamReader::~BamReader(void) {\r
-\r
-       // close file\r
-       if ( m_isOpen ) { Close(); }\r
-}\r
-\r
-// open BAM file\r
-bool BamReader::Open(void) {\r
-\r
-       if (!m_isOpen && m_filename != NULL ) {\r
-\r
-               // open file\r
-               m_file = bam_open(m_filename, "r"); \r
-               \r
-               // get header info && index info\r
-               if ( (m_file != NULL) && LoadHeader() ) {\r
-\r
-                       // save file offset where alignments start\r
-                       m_alignmentsBeginOffset = bam_tell(m_file);\r
-                       \r
-                       // set open flag\r
-                       m_isOpen = true;\r
-               }\r
-\r
-               // try to open (and load) index data, if index file given\r
-               if ( m_indexFilename != NULL ) {\r
-                       OpenIndex();\r
-               }\r
-       }\r
-\r
-       return m_isOpen;\r
-}\r
-\r
-bool BamReader::OpenIndex(void) {\r
-\r
-       if ( m_indexFilename && !m_isIndexLoaded ) {\r
-               m_isIndexLoaded = LoadIndex();\r
-       }\r
-       return m_isIndexLoaded;\r
-}\r
-\r
-// close BAM file\r
-bool BamReader::Close(void) {\r
-       \r
-       if (m_isOpen) {\r
-\r
-               // close file\r
-               int ret = bam_close(m_file);\r
-               \r
-               // delete index info\r
-               if ( m_index != NULL) { delete m_index; }\r
-\r
-               // clear open flag\r
-               m_isOpen = false;\r
-\r
-               // clear index flag\r
-               m_isIndexLoaded = false;\r
-\r
-               // clear region flag\r
-               m_isRegionSpecified = false;\r
-\r
-               // return success/fail of bam_close\r
-               return (ret == 0);\r
-       } \r
-\r
-       return true;\r
-}\r
-\r
-// get BAM filename\r
-const char* BamReader::Filename(void) const { \r
-       return (const char*)m_filename; \r
-}\r
-\r
-// set BAM filename\r
-void BamReader::SetFilename(const char* filename) {\r
-       m_filename = (char*)filename;\r
-}\r
-\r
-// get BAM Index filename\r
-const char* BamReader::IndexFilename(void) const { \r
-       return (const char*)m_indexFilename; \r
-}\r
-\r
-// set BAM Index filename\r
-void BamReader::SetIndexFilename(const char* indexFilename) {\r
-       m_indexFilename = (char*)indexFilename;\r
-}\r
-\r
-// return full header text\r
-const string BamReader::GetHeaderText(void) const { \r
-       return m_headerText; \r
-}\r
-\r
-// return number of reference sequences in BAM file\r
-const int BamReader::GetReferenceCount(void) const { \r
-       return m_references.size();\r
-}\r
-\r
-// get RefID from reference name\r
-const int BamReader::GetRefID(string refName) const { \r
-       \r
-       vector<string> refNames;\r
-       RefVector::const_iterator refIter = m_references.begin();\r
-    RefVector::const_iterator refEnd  = m_references.end();\r
-    for ( ; refIter != refEnd; ++refIter) {\r
-               refNames.push_back( (*refIter).RefName );\r
-    }\r
-\r
-       // return 'index-of' refName (if not found, returns refNames.size())\r
-       return Index( refNames.begin(), refNames.end(), refName );\r
-}\r
-\r
-const RefVector BamReader::GetReferenceData(void) const {\r
-       return m_references;\r
-}\r
-\r
-bool BamReader::Jump(int refID, unsigned int left) {\r
-\r
-       // if index available, and region is valid\r
-       if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { \r
-               m_currentRefID = refID;\r
-               m_currentLeft  = left;\r
-               m_isRegionSpecified = true;\r
-               return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 );\r
-       }\r
-       return false;\r
-}\r
-\r
-bool BamReader::Rewind(void) {\r
-\r
-       int refID = 0;\r
-       int refCount = m_references.size();\r
-       for ( ; refID < refCount; ++refID ) {\r
-               if ( m_references.at(refID).RefHasAlignments ) { break; } \r
-       }\r
-\r
-       m_currentRefID = refID;\r
-       m_currentLeft = 0;\r
-       m_isRegionSpecified = false;\r
-\r
-       return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );\r
-}      \r
-\r
-// get next alignment from specified region\r
-bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {\r
-\r
-       // try to load 'next' read\r
-       if ( LoadNextAlignment(bAlignment) ) {\r
-\r
-               // if specified region, check for region overlap\r
-               if ( m_isRegionSpecified ) {\r
-\r
-                       // if overlap, return true\r
-                       if ( IsOverlap(bAlignment) ) { return true; }\r
-                       // if not, try the next alignment\r
-                       else { return GetNextAlignment(bAlignment); }\r
-               } \r
-\r
-               // not using region, valid read detected, return success\r
-               else { return true; }\r
-       }\r
-\r
-       // no valid alignment to load\r
-       return false;\r
-}\r
-\r
-void BamReader::SetCalculateAlignedBases(bool flag) {\r
-       m_isCalculateAlignedBases = flag;\r
-}\r
-\r
-int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {\r
-\r
-       // get region boundaries\r
-       uint32_t begin = left;\r
-       uint32_t end   = m_references.at(refID).RefLength - 1;\r
-\r
-       // initialize list, bin '0' always a valid bin\r
-       int i = 0;\r
-       list[i++] = 0;\r
-\r
-       // get rest of bins that contain this region\r
-       unsigned int k;\r
-       for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { list[i++] = k; }\r
-       for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { list[i++] = k; }\r
-       for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { list[i++] = k; }\r
-       for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { list[i++] = k; }\r
-       for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }\r
-       \r
-       // return number of bins stored\r
-       return i;\r
-}\r
-\r
-uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {\r
-\r
-       // initialize alignment end to starting position\r
-       uint32_t alignEnd = position;\r
-\r
-       // iterate over cigar operations\r
-       vector<CigarOp>::const_iterator cigarIter = cigarData.begin();\r
-       vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();\r
-       for ( ; cigarIter != cigarEnd; ++cigarIter) {\r
-               if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {\r
-                       alignEnd += (*cigarIter).Length;\r
-               }\r
-       }\r
-       return alignEnd;\r
-}\r
-\r
-uint64_t BamReader::GetOffset(int refID, unsigned int left) {\r
-\r
-       //  make space for bins\r
-       uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);                 \r
-       \r
-       // returns number of bins overlapping (left, right)\r
-       // stores indices of those bins in 'bins'\r
-       int numBins = BinsFromRegion(refID, left, bins);                                \r
-\r
-       // access index data for refID\r
-       RefIndex* refIndex = m_index->at(refID);\r
-\r
-       // get list of BAM bins for this reference sequence\r
-       BinVector* refBins = refIndex->first;\r
-\r
-       sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );\r
-\r
-       // declare ChunkVector\r
-       ChunkVector regionChunks;\r
-\r
-       // declaure LinearOffsetVector\r
-       LinearOffsetVector* linearOffsets = refIndex->second;\r
-\r
-       // some sort of linear offset vs bin index voodoo... not completely sure what's going here\r
-       uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);\r
-\r
-       BinVector::iterator binBegin = refBins->begin();\r
-       BinVector::iterator binEnd   = refBins->end();\r
-\r
-       // iterate over bins overlapping region, count chunks\r
-       for (int i = 0; i < numBins; ++i) {\r
-               \r
-               // look for bin with ID=bin[i]\r
-               BinVector::iterator binIter = binBegin;\r
-\r
-               for ( ; binIter != binEnd; ++binIter ) {\r
-               \r
-                       // if found, increment n_off by number of chunks for each bin\r
-                       if ( (*binIter).first == (uint32_t)bins[i] ) { \r
-                               \r
-                               // iterate over chunks in that bin\r
-                               ChunkVector* binChunks = (*binIter).second;\r
-                               \r
-                               ChunkVector::iterator chunkIter = binChunks->begin();\r
-                               ChunkVector::iterator chunkEnd  = binChunks->end();\r
-                               for ( ; chunkIter != chunkEnd; ++chunkIter) {\r
-                               \r
-                                       // if right bound of pair is greater than min_off (linear offset value), store pair\r
-                                       if ( (*chunkIter).second > minOffset) { \r
-                                               regionChunks.push_back( (*chunkIter) ); \r
-                                       }\r
-                               }\r
-                       }\r
-               }\r
-       }\r
-\r
-       // clean up temp array\r
-       free(bins);\r
-\r
-       // there should be at least 1\r
-       assert(regionChunks.size() > 0);\r
-\r
-       // sort chunks by start position\r
-       sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );\r
-\r
-       // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing\r
-       int numOffsets = regionChunks.size();   \r
-       for (int i = 1; i < numOffsets; ++i) {\r
-               if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {\r
-                       regionChunks.at(i-1).second = regionChunks.at(i).first;\r
-               }\r
-       }\r
-       \r
-       // merge adjacent chunks\r
-       int l = 0;\r
-       for (int i = 1; i < numOffsets; ++i) {\r
-               // if adjacent, expand boundaries of (merged) chunk\r
-               if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {\r
-                       regionChunks.at(l).second = regionChunks.at(i).second;\r
-               }\r
-               // else, move on to next (merged) chunk index\r
-               else { regionChunks.at(++l) = regionChunks.at(i); }\r
-       }\r
-\r
-       // return beginning file offset of first chunk for region\r
-       return regionChunks.at(0).first;\r
-}\r
-\r
-bool BamReader::IsOverlap(BamAlignment& bAlignment) {\r
-\r
-       // if on different reference sequence, quit\r
-       if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }\r
-\r
-       // read starts after left boundary\r
-       if ( bAlignment.Position >= m_currentLeft) { return true; }\r
-\r
-       // get alignment end\r
-       uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);\r
-\r
-       // return whether alignment end overlaps left boundary\r
-       return ( alignEnd >= m_currentLeft );\r
-}\r
-\r
-bool BamReader::LoadHeader(void) {\r
-\r
-       // check to see if proper BAM header\r
-       char buf[4];\r
-       if (bam_read(m_file, buf, 4) != 4) { return false; }\r
-       if (strncmp(buf, "BAM\001", 4)) {\r
-               fprintf(stderr, "wrong header type!\n");\r
-               return false;\r
-       }\r
-       \r
-       // get BAM header text length\r
-       int32_t headerTextLength;\r
-       bam_read(m_file, &headerTextLength, 4);\r
-\r
-       // get BAM header text\r
-       char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
-       bam_read(m_file, headerText, headerTextLength);\r
-       m_headerText = (string)((const char*)headerText);\r
-       \r
-       // clean up calloc-ed temp variable\r
-       free(headerText);\r
-\r
-       // get number of reference sequences\r
-       int32_t numberRefSeqs;\r
-       bam_read(m_file, &numberRefSeqs, 4);\r
-       if (numberRefSeqs == 0) { return false; }\r
-\r
-       m_references.reserve((int)numberRefSeqs);\r
-       \r
-       // reference variables\r
-       int32_t  refNameLength;\r
-       char*    refName;\r
-       uint32_t refLength;\r
-\r
-       // iterate over all references in header\r
-       for (int i = 0; i != numberRefSeqs; ++i) {\r
-\r
-               // get length of reference name\r
-               bam_read(m_file, &refNameLength, 4);\r
-               refName = (char*)calloc(refNameLength, 1);\r
-\r
-               // get reference name and reference sequence length\r
-               bam_read(m_file, refName, refNameLength);\r
-               bam_read(m_file, &refLength, 4);\r
-\r
-               // store data for reference\r
-               RefData aReference;\r
-               aReference.RefName   = (string)((const char*)refName);\r
-               aReference.RefLength = refLength;\r
-               m_references.push_back(aReference);\r
-\r
-               // clean up calloc-ed temp variable\r
-               free(refName);\r
-       }\r
-       \r
-       return true;\r
-}\r
-\r
-bool BamReader::LoadIndex(void) {\r
-\r
-       // check to see if index file exists\r
-       FILE* indexFile;\r
-       if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {\r
-               fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");\r
-               return false;\r
-       }\r
-\r
-       // see if index is valid BAM index\r
-       char magic[4];\r
-       fread(magic, 1, 4, indexFile);\r
-       if (strncmp(magic, "BAI\1", 4)) {\r
-               fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");\r
-               fclose(indexFile);\r
-               return false;\r
-       }\r
-\r
-       // get number of reference sequences\r
-       uint32_t numRefSeqs;\r
-       fread(&numRefSeqs, 4, 1, indexFile);\r
-       \r
-       // intialize BamIndex data structure\r
-       m_index = new BamIndex;\r
-       m_index->reserve(numRefSeqs);\r
-\r
-       // iterate over reference sequences\r
-       for (unsigned int i = 0; i < numRefSeqs; ++i) {\r
-               \r
-               // get number of bins for this reference sequence\r
-               int32_t numBins;\r
-               fread(&numBins, 4, 1, indexFile);\r
-               \r
-               if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }\r
-\r
-               // intialize BinVector\r
-               BinVector* bins = new BinVector;\r
-               bins->reserve(numBins);\r
-               \r
-               // iterate over bins for that reference sequence\r
-               for (int j = 0; j < numBins; ++j) {\r
-                       \r
-                       // get binID \r
-                       uint32_t binID;\r
-                       fread(&binID, 4, 1, indexFile);\r
-                       \r
-                       // get number of regionChunks in this bin\r
-                       uint32_t numChunks;\r
-                       fread(&numChunks, 4, 1, indexFile);\r
-                       \r
-                       // intialize ChunkVector\r
-                       ChunkVector* regionChunks = new ChunkVector;\r
-                       regionChunks->reserve(numChunks);\r
-                       \r
-                       // iterate over regionChunks in this bin\r
-                       for (unsigned int k = 0; k < numChunks; ++k) {\r
-                               \r
-                               // get chunk boundaries (left, right) \r
-                               uint64_t left;\r
-                               uint64_t right;\r
-                               fread(&left, 8, 1, indexFile);\r
-                               fread(&right, 8, 1, indexFile);\r
-                               \r
-                               // save ChunkPair\r
-                               regionChunks->push_back( ChunkPair(left, right) );\r
-                       }\r
-                       \r
-                       // save binID, chunkVector for this bin\r
-                       bins->push_back( BamBin(binID, regionChunks) );\r
-               }\r
-               \r
-               // load linear index for this reference sequence\r
-               \r
-               // get number of linear offsets\r
-               int32_t numLinearOffsets;\r
-               fread(&numLinearOffsets, 4, 1, indexFile);\r
-               \r
-               // intialize LinearOffsetVector\r
-               LinearOffsetVector* linearOffsets = new LinearOffsetVector;\r
-               linearOffsets->reserve(numLinearOffsets);\r
-               \r
-               // iterate over linear offsets for this reference sequeence\r
-               for (int j = 0; j < numLinearOffsets; ++j) {\r
-                       // get a linear offset\r
-                       uint64_t linearOffset;\r
-                       fread(&linearOffset, 8, 1, indexFile);\r
-                       // store linear offset\r
-                       linearOffsets->push_back(linearOffset);\r
-               }\r
-               \r
-               // store index data for that reference sequence\r
-               m_index->push_back( new RefIndex(bins, linearOffsets) );\r
-       }\r
-       \r
-       // close index file (.bai) and return\r
-       fclose(indexFile);\r
-       return true;\r
-}\r
-\r
-bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {\r
-\r
-       // check valid alignment block header data\r
-       int32_t block_len;\r
-       int32_t ret;\r
-       uint32_t x[8];\r
-\r
-       int32_t bytesRead = 0;\r
-\r
-       // read in the 'block length' value, make sure it's not zero\r
-       if ( (ret = bam_read(m_file, &block_len, 4)) == 0 )        { return false; }\r
-       bytesRead += 4;\r
-\r
-       // read in core alignment data, make the right size of data was read \r
-       if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
-       bytesRead += BAM_CORE_SIZE;\r
-\r
-       // set BamAlignment 'core' data\r
-       bAlignment.RefID         = x[0]; \r
-       bAlignment.Position      = x[1];\r
-       bAlignment.Bin           = x[2]>>16; \r
-       bAlignment.MapQuality    = x[2]>>8&0xff; \r
-       bAlignment.AlignmentFlag = x[3]>>16; \r
-       bAlignment.MateRefID     = x[5]; \r
-       bAlignment.MatePosition  = x[6]; \r
-       bAlignment.InsertSize    = x[7];\r
-\r
-       // fetch & store often-used lengths for character data parsing\r
-       unsigned int queryNameLength     = x[2]&0xff;\r
-       unsigned int numCigarOperations  = x[3]&0xffff;\r
-       unsigned int querySequenceLength = x[4];\r
-       \r
-       // get length of character data\r
-       int dataLength = block_len - BAM_CORE_SIZE;\r
-\r
-       // set up destination buffers for character data\r
-       uint8_t*  allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);\r
-       uint32_t* cigarData   = (uint32_t*)(allCharData+queryNameLength);\r
-       \r
-       // get character data - make sure proper data size was read\r
-       if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }\r
-       else {\r
-\r
-               bytesRead += dataLength;\r
-\r
-               // clear out bases, qualities, aligned bases, and CIGAR\r
-               bAlignment.QueryBases.clear();\r
-               bAlignment.Qualities.clear();\r
-               bAlignment.AlignedBases.clear();\r
-               bAlignment.CigarData.clear();\r
-\r
-               // save name\r
-               bAlignment.Name = (string)((const char*)(allCharData));\r
-               \r
-               // save bases\r
-               char singleBase[2];\r
-               uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);\r
-               for (unsigned int i = 0; i < querySequenceLength; ++i) { \r
-                       // numeric to char conversion\r
-                       sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );\r
-                       // append character data to Bases\r
-                       bAlignment.QueryBases.append( (const char*)singleBase );\r
-               }\r
-               \r
-               // save sequence length\r
-               bAlignment.Length = bAlignment.QueryBases.length();\r
-               \r
-               // save qualities\r
-               char singleQuality[4];\r
-               uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);\r
-               for (unsigned int i = 0; i < querySequenceLength; ++i) { \r
-                       // numeric to char conversion\r
-                       sprintf( singleQuality, "%c", ( t[i]+33 ) ); \r
-                       // append character data to Qualities\r
-                       bAlignment.Qualities.append( (const char*)singleQuality );\r
-               }\r
-               \r
-               // save CIGAR-related data;\r
-               int k = 0;\r
-               for (unsigned int i = 0; i < numCigarOperations; ++i) {\r
-\r
-                       // build CigarOp struct\r
-                       CigarOp op;\r
-                       op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
-                       op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
-\r
-                       // save CigarOp\r
-                       bAlignment.CigarData.push_back(op);\r
-\r
-                       // can skip this step if user wants to ignore\r
-                       if (m_isCalculateAlignedBases) {\r
-\r
-                               // build AlignedBases string\r
-                               switch (op.Type) {\r
-                                       \r
-                                       case ('M') : \r
-                                       case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) );      // for 'M', 'I' - write bases\r
-                                       case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases\r
-                                                                break;\r
-                                       \r
-                                       case ('D') : \r
-                                       case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'D', 'P' - write padding;\r
-                                                                break;\r
-                                       \r
-                                       case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence\r
-                                                                k += op.Length;\r
-                                                                break;\r
-\r
-                                       case ('H') : break;                                                                                     // for 'H' - do nothing, move to next op\r
-                                       \r
-                                       default    : assert(false);     // shouldn't get here\r
-                                                                break;\r
-                               }\r
-                       }\r
-               }       \r
-       }\r
-       free(allCharData);\r
-\r
-/*\r
-       // (optional) read tag parsing data\r
-       string tag;\r
-       char data;\r
-       int i = 0;\r
-\r
-       // still data to read (auxiliary tags)\r
-       while ( bytesRead < block_len ) {\r
-\r
-               if ( bam_read(m_file, &data, 1) == 1 ) { \r
-                       \r
-                       ++bytesRead;\r
-\r
-                       if (bytesRead == block_len && data != '\0') {\r
-                               fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");\r
-                               exit(1);\r
-                       }\r
-\r
-                       tag.append(1, data);\r
-                       if ( data == '\0' ) {\r
-                               bAlignment.Tags.push_back(tag);\r
-                               tag = "";\r
-                               i = 0;\r
-                       } else {\r
-                               if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }\r
-                               ++i;\r
-                       }\r
-               } else {\r
-                       fprintf(stderr, "ERROR: Could not read tag info.\n");\r
-                       exit(1);\r
-               }\r
-       }\r
-*/\r
-       return true;\r
-}\r
+// BamReader.cpp
+
+// Derek Barnett
+// Marth Lab, Boston College
+// Last modified: 6 April 2009
+
+#include "BamReader.h"
+#include <iostream>
+using std::cerr;
+using std::endl;
+
+// static character constants
+const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";
+const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
+
+BamReader::BamReader(const char* filename, const char* indexFilename) 
+       : m_filename( (char*)filename )
+       , m_indexFilename( (char*)indexFilename )
+       , m_file(0)
+       , m_index(NULL)
+       , m_headerText("")
+       , m_isOpen(false)
+       , m_isIndexLoaded(false)
+       , m_isRegionSpecified(false)
+       , m_isCalculateAlignedBases(true)
+       , m_currentRefID(0)
+       , m_currentLeft(0)
+       , m_alignmentsBeginOffset(0)
+{
+       Open();
+}
+
+BamReader::~BamReader(void) {
+
+       // close file
+       if ( m_isOpen ) { Close(); }
+}
+
+// open BAM file
+bool BamReader::Open(void) {
+
+       if (!m_isOpen && m_filename != NULL ) {
+
+               // open file
+               m_file = bam_open(m_filename, "r"); 
+               
+               // get header info && index info
+               if ( (m_file != NULL) && LoadHeader() ) {
+
+                       // save file offset where alignments start
+                       m_alignmentsBeginOffset = bam_tell(m_file);
+                       
+                       // set open flag
+                       m_isOpen = true;
+               }
+
+               // try to open (and load) index data, if index file given
+               if ( m_indexFilename != NULL ) {
+                       OpenIndex();
+               }
+       }
+
+       return m_isOpen;
+}
+
+bool BamReader::OpenIndex(void) {
+
+       if ( m_indexFilename && !m_isIndexLoaded ) {
+               m_isIndexLoaded = LoadIndex();
+       }
+       return m_isIndexLoaded;
+}
+
+// close BAM file
+bool BamReader::Close(void) {
+       
+       if (m_isOpen) {
+
+               // close file
+               int ret = bam_close(m_file);
+               
+               // delete index info
+               if ( m_index != NULL) { delete m_index; }
+
+               // clear open flag
+               m_isOpen = false;
+
+               // clear index flag
+               m_isIndexLoaded = false;
+
+               // clear region flag
+               m_isRegionSpecified = false;
+
+               // return success/fail of bam_close
+               return (ret == 0);
+       } 
+
+       return true;
+}
+
+// get BAM filename
+const char* BamReader::Filename(void) const { 
+       return (const char*)m_filename; 
+}
+
+// set BAM filename
+void BamReader::SetFilename(const char* filename) {
+       m_filename = (char*)filename;
+}
+
+// get BAM Index filename
+const char* BamReader::IndexFilename(void) const { 
+       return (const char*)m_indexFilename; 
+}
+
+// set BAM Index filename
+void BamReader::SetIndexFilename(const char* indexFilename) {
+       m_indexFilename = (char*)indexFilename;
+}
+
+// return full header text
+const string BamReader::GetHeaderText(void) const { 
+       return m_headerText; 
+}
+
+// return number of reference sequences in BAM file
+const int BamReader::GetReferenceCount(void) const { 
+       return m_references.size();
+}
+
+// get RefID from reference name
+const int BamReader::GetRefID(string refName) const { 
+       
+       vector<string> refNames;
+       RefVector::const_iterator refIter = m_references.begin();
+    RefVector::const_iterator refEnd  = m_references.end();
+    for ( ; refIter != refEnd; ++refIter) {
+               refNames.push_back( (*refIter).RefName );
+    }
+
+       // return 'index-of' refName (if not found, returns refNames.size())
+       return Index( refNames.begin(), refNames.end(), refName );
+}
+
+const RefVector BamReader::GetReferenceData(void) const {
+       return m_references;
+}
+
+bool BamReader::Jump(int refID, unsigned int left) {
+
+       // if index available, and region is valid
+       if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { 
+               m_currentRefID = refID;
+               m_currentLeft  = left;
+               m_isRegionSpecified = true;
+               return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 );
+       }
+       return false;
+}
+
+bool BamReader::Rewind(void) {
+
+       int refID = 0;
+       int refCount = m_references.size();
+       for ( ; refID < refCount; ++refID ) {
+               if ( m_references.at(refID).RefHasAlignments ) { break; } 
+       }
+
+       m_currentRefID = refID;
+       m_currentLeft = 0;
+       m_isRegionSpecified = false;
+
+       return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );
+}      
+
+// get next alignment from specified region
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
+
+       // try to load 'next' read
+       if ( LoadNextAlignment(bAlignment) ) {
+
+               // if specified region, check for region overlap
+               if ( m_isRegionSpecified ) {
+
+                       // if overlap, return true
+                       if ( IsOverlap(bAlignment) ) { return true; }
+                       // if not, try the next alignment
+                       else { return GetNextAlignment(bAlignment); }
+               } 
+
+               // not using region, valid read detected, return success
+               else { return true; }
+       }
+
+       // no valid alignment to load
+       return false;
+}
+
+void BamReader::SetCalculateAlignedBases(bool flag) {
+       m_isCalculateAlignedBases = flag;
+}
+
+int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
+
+       // get region boundaries
+       uint32_t begin = left;
+       uint32_t end   = m_references.at(refID).RefLength - 1;
+
+       // initialize list, bin '0' always a valid bin
+       int i = 0;
+       list[i++] = 0;
+
+       // get rest of bins that contain this region
+       unsigned int k;
+       for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { list[i++] = k; }
+       for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { list[i++] = k; }
+       for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { list[i++] = k; }
+       for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { list[i++] = k; }
+       for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
+       
+       // return number of bins stored
+       return i;
+}
+
+uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
+
+       // initialize alignment end to starting position
+       uint32_t alignEnd = position;
+
+       // iterate over cigar operations
+       vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
+       vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
+       for ( ; cigarIter != cigarEnd; ++cigarIter) {
+               if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {
+                       alignEnd += (*cigarIter).Length;
+               }
+       }
+       return alignEnd;
+}
+
+uint64_t BamReader::GetOffset(int refID, unsigned int left) {
+
+       //  make space for bins
+       uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);                 
+       
+       // returns number of bins overlapping (left, right)
+       // stores indices of those bins in 'bins'
+       int numBins = BinsFromRegion(refID, left, bins);                                
+
+       // access index data for refID
+       RefIndex* refIndex = m_index->at(refID);
+
+       // get list of BAM bins for this reference sequence
+       BinVector* refBins = refIndex->first;
+
+       sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
+
+       // declare ChunkVector
+       ChunkVector regionChunks;
+
+       // declaure LinearOffsetVector
+       LinearOffsetVector* linearOffsets = refIndex->second;
+
+       // some sort of linear offset vs bin index voodoo... not completely sure what's going here
+       uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
+
+       BinVector::iterator binBegin = refBins->begin();
+       BinVector::iterator binEnd   = refBins->end();
+
+       // iterate over bins overlapping region, count chunks
+       for (int i = 0; i < numBins; ++i) {
+               
+               // look for bin with ID=bin[i]
+               BinVector::iterator binIter = binBegin;
+
+               for ( ; binIter != binEnd; ++binIter ) {
+               
+                       // if found, increment n_off by number of chunks for each bin
+                       if ( (*binIter).first == (uint32_t)bins[i] ) { 
+                               
+                               // iterate over chunks in that bin
+                               ChunkVector* binChunks = (*binIter).second;
+                               
+                               ChunkVector::iterator chunkIter = binChunks->begin();
+                               ChunkVector::iterator chunkEnd  = binChunks->end();
+                               for ( ; chunkIter != chunkEnd; ++chunkIter) {
+                               
+                                       // if right bound of pair is greater than min_off (linear offset value), store pair
+                                       if ( (*chunkIter).second > minOffset) { 
+                                               regionChunks.push_back( (*chunkIter) ); 
+                                       }
+                               }
+                       }
+               }
+       }
+
+       // clean up temp array
+       free(bins);
+
+       // there should be at least 1
+       assert(regionChunks.size() > 0);
+
+       // sort chunks by start position
+       sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );
+
+       // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+       int numOffsets = regionChunks.size();   
+       for (int i = 1; i < numOffsets; ++i) {
+               if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {
+                       regionChunks.at(i-1).second = regionChunks.at(i).first;
+               }
+       }
+       
+       // merge adjacent chunks
+       int l = 0;
+       for (int i = 1; i < numOffsets; ++i) {
+               // if adjacent, expand boundaries of (merged) chunk
+               if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {
+                       regionChunks.at(l).second = regionChunks.at(i).second;
+               }
+               // else, move on to next (merged) chunk index
+               else { regionChunks.at(++l) = regionChunks.at(i); }
+       }
+
+       // return beginning file offset of first chunk for region
+       return regionChunks.at(0).first;
+}
+
+bool BamReader::IsOverlap(BamAlignment& bAlignment) {
+
+       // if on different reference sequence, quit
+       if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }
+
+       // read starts after left boundary
+       if ( bAlignment.Position >= m_currentLeft) { return true; }
+
+       // get alignment end
+       uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);
+
+       // return whether alignment end overlaps left boundary
+       return ( alignEnd >= m_currentLeft );
+}
+
+bool BamReader::LoadHeader(void) {
+
+       // check to see if proper BAM header
+       char buf[4];
+       if (bam_read(m_file, buf, 4) != 4) { return false; }
+       if (strncmp(buf, "BAM\001", 4)) {
+               fprintf(stderr, "wrong header type!\n");
+               return false;
+       }
+       
+       // get BAM header text length
+       int32_t headerTextLength;
+       bam_read(m_file, &headerTextLength, 4);
+
+       // get BAM header text
+       char* headerText = (char*)calloc(headerTextLength + 1, 1);
+       bam_read(m_file, headerText, headerTextLength);
+       m_headerText = (string)((const char*)headerText);
+       
+       // clean up calloc-ed temp variable
+       free(headerText);
+
+       // get number of reference sequences
+       int32_t numberRefSeqs;
+       bam_read(m_file, &numberRefSeqs, 4);
+       if (numberRefSeqs == 0) { return false; }
+
+       m_references.reserve((int)numberRefSeqs);
+       
+       // reference variables
+       int32_t  refNameLength;
+       char*    refName;
+       uint32_t refLength;
+
+       // iterate over all references in header
+       for (int i = 0; i != numberRefSeqs; ++i) {
+
+               // get length of reference name
+               bam_read(m_file, &refNameLength, 4);
+               refName = (char*)calloc(refNameLength, 1);
+
+               // get reference name and reference sequence length
+               bam_read(m_file, refName, refNameLength);
+               bam_read(m_file, &refLength, 4);
+
+               // store data for reference
+               RefData aReference;
+               aReference.RefName   = (string)((const char*)refName);
+               aReference.RefLength = refLength;
+               m_references.push_back(aReference);
+
+               // clean up calloc-ed temp variable
+               free(refName);
+       }
+       
+       return true;
+}
+
+bool BamReader::LoadIndex(void) {
+
+       // check to see if index file exists
+       FILE* indexFile;
+       if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {
+               fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");
+               return false;
+       }
+
+       // see if index is valid BAM index
+       char magic[4];
+       fread(magic, 1, 4, indexFile);
+       if (strncmp(magic, "BAI\1", 4)) {
+               fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");
+               fclose(indexFile);
+               return false;
+       }
+
+       // get number of reference sequences
+       uint32_t numRefSeqs;
+       fread(&numRefSeqs, 4, 1, indexFile);
+       
+       // intialize BamIndex data structure
+       m_index = new BamIndex;
+       m_index->reserve(numRefSeqs);
+
+       // iterate over reference sequences
+       for (unsigned int i = 0; i < numRefSeqs; ++i) {
+               
+               // get number of bins for this reference sequence
+               int32_t numBins;
+               fread(&numBins, 4, 1, indexFile);
+               
+               if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
+
+               // intialize BinVector
+               BinVector* bins = new BinVector;
+               bins->reserve(numBins);
+               
+               // iterate over bins for that reference sequence
+               for (int j = 0; j < numBins; ++j) {
+                       
+                       // get binID 
+                       uint32_t binID;
+                       fread(&binID, 4, 1, indexFile);
+                       
+                       // get number of regionChunks in this bin
+                       uint32_t numChunks;
+                       fread(&numChunks, 4, 1, indexFile);
+                       
+                       // intialize ChunkVector
+                       ChunkVector* regionChunks = new ChunkVector;
+                       regionChunks->reserve(numChunks);
+                       
+                       // iterate over regionChunks in this bin
+                       for (unsigned int k = 0; k < numChunks; ++k) {
+                               
+                               // get chunk boundaries (left, right) 
+                               uint64_t left;
+                               uint64_t right;
+                               fread(&left, 8, 1, indexFile);
+                               fread(&right, 8, 1, indexFile);
+                               
+                               // save ChunkPair
+                               regionChunks->push_back( ChunkPair(left, right) );
+                       }
+                       
+                       // save binID, chunkVector for this bin
+                       bins->push_back( BamBin(binID, regionChunks) );
+               }
+               
+               // load linear index for this reference sequence
+               
+               // get number of linear offsets
+               int32_t numLinearOffsets;
+               fread(&numLinearOffsets, 4, 1, indexFile);
+               
+               // intialize LinearOffsetVector
+               LinearOffsetVector* linearOffsets = new LinearOffsetVector;
+               linearOffsets->reserve(numLinearOffsets);
+               
+               // iterate over linear offsets for this reference sequeence
+               for (int j = 0; j < numLinearOffsets; ++j) {
+                       // get a linear offset
+                       uint64_t linearOffset;
+                       fread(&linearOffset, 8, 1, indexFile);
+                       // store linear offset
+                       linearOffsets->push_back(linearOffset);
+               }
+               
+               // store index data for that reference sequence
+               m_index->push_back( new RefIndex(bins, linearOffsets) );
+       }
+       
+       // close index file (.bai) and return
+       fclose(indexFile);
+       return true;
+}
+
+bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
+
+       // check valid alignment block header data
+       int32_t block_len;
+       int32_t ret;
+       uint32_t x[8];
+
+       int32_t bytesRead = 0;
+
+       // read in the 'block length' value, make sure it's not zero
+       if ( (ret = bam_read(m_file, &block_len, 4)) == 0 )        { return false; }
+       bytesRead += 4;
+
+       // read in core alignment data, make the right size of data was read 
+       if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
+       bytesRead += BAM_CORE_SIZE;
+
+       // set BamAlignment 'core' data
+       bAlignment.RefID         = x[0]; 
+       bAlignment.Position      = x[1];
+       bAlignment.Bin           = x[2]>>16; 
+       bAlignment.MapQuality    = x[2]>>8&0xff; 
+       bAlignment.AlignmentFlag = x[3]>>16; 
+       bAlignment.MateRefID     = x[5]; 
+       bAlignment.MatePosition  = x[6]; 
+       bAlignment.InsertSize    = x[7];
+
+       // fetch & store often-used lengths for character data parsing
+       unsigned int queryNameLength     = x[2]&0xff;
+       unsigned int numCigarOperations  = x[3]&0xffff;
+       unsigned int querySequenceLength = x[4];
+       
+       // get length of character data
+       int dataLength = block_len - BAM_CORE_SIZE;
+
+       // set up destination buffers for character data
+       uint8_t*  allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);
+       uint32_t* cigarData   = (uint32_t*)(allCharData+queryNameLength);
+
+       const unsigned int tagDataOffset = (numCigarOperations * 4) + queryNameLength + (querySequenceLength + 1) / 2 + querySequenceLength;
+       const unsigned int tagDataLen = dataLength - tagDataOffset;
+       char* tagData = ((char*)allCharData) + tagDataOffset;
+       
+       // get character data - make sure proper data size was read
+       if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }
+       else {
+
+               bytesRead += dataLength;
+
+               // clear out bases, qualities, aligned bases, CIGAR, and tag data
+               bAlignment.QueryBases.clear();
+               bAlignment.Qualities.clear();
+               bAlignment.AlignedBases.clear();
+               bAlignment.CigarData.clear();
+               bAlignment.TagData.clear();
+
+               // save name
+               bAlignment.Name = (string)((const char*)(allCharData));
+               
+               // save bases
+               char singleBase[2];
+               uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);
+               for (unsigned int i = 0; i < querySequenceLength; ++i) { 
+                       // numeric to char conversion
+                       sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );
+                       // append character data to Bases
+                       bAlignment.QueryBases.append( (const char*)singleBase );
+               }
+               
+               // save sequence length
+               bAlignment.Length = bAlignment.QueryBases.length();
+               
+               // save qualities
+               char singleQuality[4];
+               uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);
+               for (unsigned int i = 0; i < querySequenceLength; ++i) { 
+                       // numeric to char conversion
+                       sprintf( singleQuality, "%c", ( t[i]+33 ) ); 
+                       // append character data to Qualities
+                       bAlignment.Qualities.append( (const char*)singleQuality );
+               }
+               
+               // save CIGAR-related data;
+               int k = 0;
+               for (unsigned int i = 0; i < numCigarOperations; ++i) {
+
+                       // build CigarOp struct
+                       CigarOp op;
+                       op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
+                       op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
+
+                       // save CigarOp
+                       bAlignment.CigarData.push_back(op);
+
+                       // can skip this step if user wants to ignore
+                       if (m_isCalculateAlignedBases) {
+
+                               // build AlignedBases string
+                               switch (op.Type) {
+                                       
+                                       case ('M') : 
+                                       case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) );      // for 'M', 'I' - write bases
+                                       case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases
+                                                                break;
+                                       
+                                       case ('D') : 
+                                       case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'D', 'P' - write padding;
+                                                                break;
+                                       
+                                       case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence
+                                                                k += op.Length;
+                                                                break;
+
+                                       case ('H') : break;                                                                                     // for 'H' - do nothing, move to next op
+                                       
+                                       default    : assert(false);     // shouldn't get here
+                                                                break;
+                               }
+                       }
+               }
+
+               // read in the tag data
+               bAlignment.TagData.resize(tagDataLen);
+               memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
+       }
+       free(allCharData);
+
+/*
+       // (optional) read tag parsing data
+       string tag;
+       char data;
+       int i = 0;
+
+       // still data to read (auxiliary tags)
+       while ( bytesRead < block_len ) {
+
+               if ( bam_read(m_file, &data, 1) == 1 ) { 
+                       
+                       ++bytesRead;
+
+                       if (bytesRead == block_len && data != '\0') {
+                               fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");
+                               exit(1);
+                       }
+
+                       tag.append(1, data);
+                       if ( data == '\0' ) {
+                               bAlignment.Tags.push_back(tag);
+                               tag = "";
+                               i = 0;
+                       } else {
+                               if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }
+                               ++i;
+                       }
+               } else {
+                       fprintf(stderr, "ERROR: Could not read tag info.\n");
+                       exit(1);
+               }
+       }
+*/
+       return true;
+}
index f0c67e22ceeb104ab5b7edac58bbdff4387817df..4f6a37f3592d8426be5d52da0a4e6c3257b90295 100644 (file)
@@ -357,13 +357,8 @@ void BamWriter::SaveAlignment(const BamAlignment& al) {
        EncodeQuerySequence(al.QueryBases, encodedQuery);
        const unsigned int encodedQueryLen = encodedQuery.size();
 
-       // create our read group tag
-       // TODO: add read group tag support when it shows up in the BamAlignment struct
-       //string readGroupTag;
-       //const unsigned int readGroupTagLen = 3 + mReadGroupLUT[al.ReadGroupCode].NameLen + 1;
-       //readGroupTag.resize(readGroupTagLen);
-       //char* pReadGroupTag = (char*)readGroupTag.data();
-       //sprintf(pReadGroupTag, "RGZ%s", mReadGroupLUT[al.ReadGroupCode].Name);
+       // store the tag data length
+       const unsigned int tagDataLength = al.TagData.size();
 
        // assign the BAM core data
        unsigned int buffer[8];
@@ -377,7 +372,7 @@ void BamWriter::SaveAlignment(const BamAlignment& al) {
        buffer[7] = al.InsertSize;
 
        // write the block size
-       const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen /* + readGroupTagLen*/;
+       const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;
        const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
 
        BgzfWrite((char*)&blockSize, SIZEOF_INT);
@@ -401,6 +396,5 @@ void BamWriter::SaveAlignment(const BamAlignment& al) {
        BgzfWrite(pBaseQualities, queryLen);
 
        // write the read group tag
-       // TODO: add read group tag support when it shows up in the BamAlignment struct
-       //BgzfWrite(readGroupTag.c_str(), readGroupTagLen);
+       BgzfWrite(al.TagData.data(), tagDataLength);
 }