1 // ***************************************************************************
2 // BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 15 July 2009 (DB)
7 // ---------------------------------------------------------------------------
8 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
10 // ---------------------------------------------------------------------------
11 // Provides the basic functionality for reading BAM files
12 // ***************************************************************************
15 #include "BamReader.h"
16 using namespace BamTools;
19 // static character constants
20 const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN";
21 const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
24 BamReader::BamReader(void)
27 , m_isIndexLoaded(false)
28 , m_alignmentsBeginOffset(0)
29 , m_isRegionSpecified(false)
35 BamReader::~BamReader(void) {
39 // checks BGZF block header
40 bool BamReader::BgzfCheckBlockHeader(char* header) {
42 return (header[0] == GZIP_ID1 &&
43 header[1] == (char)GZIP_ID2 &&
44 header[2] == Z_DEFLATED &&
45 (header[3] & FLG_FEXTRA) != 0 &&
46 BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
47 header[12] == BGZF_ID1 &&
48 header[13] == BGZF_ID2 &&
49 BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN
53 // closes the BAM file
54 void BamReader::BgzfClose(void) {
55 fflush(m_BGZF->Stream);
56 fclose(m_BGZF->Stream);
57 m_BGZF->IsOpen = false;
60 // de-compresses the current block
61 int BamReader::BgzfInflateBlock(int blockLength) {
63 // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
67 zs.next_in = (Bytef*)m_BGZF->CompressedBlock + 18;
68 zs.avail_in = blockLength - 16;
69 zs.next_out = (Bytef*)m_BGZF->UncompressedBlock;
70 zs.avail_out = m_BGZF->UncompressedBlockSize;
72 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
74 printf("inflateInit failed\n");
78 status = inflate(&zs, Z_FINISH);
79 if (status != Z_STREAM_END) {
81 printf("inflate failed\n");
85 status = inflateEnd(&zs);
87 printf("inflateEnd failed\n");
94 // opens the BAM file for reading
95 void BamReader::BgzfOpen(const string& filename) {
97 m_BGZF->Stream = fopen(filename.c_str(), "rb");
99 printf("ERROR: Unable to open the BAM file %s for reading.\n", filename.c_str() );
103 m_BGZF->IsOpen = true;
106 // reads BGZF data into buffer
107 unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) {
109 if (dataLength == 0) { return 0; }
112 unsigned int numBytesRead = 0;
113 while (numBytesRead < dataLength) {
115 int bytesAvailable = m_BGZF->BlockLength - m_BGZF->BlockOffset;
116 if (bytesAvailable <= 0) {
117 if ( BgzfReadBlock() != 0 ) { return -1; }
118 bytesAvailable = m_BGZF->BlockLength - m_BGZF->BlockOffset;
119 if ( bytesAvailable <= 0 ) { break; }
122 char* buffer = m_BGZF->UncompressedBlock;
123 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
124 memcpy(output, buffer + m_BGZF->BlockOffset, copyLength);
126 m_BGZF->BlockOffset += copyLength;
127 output += copyLength;
128 numBytesRead += copyLength;
131 if ( m_BGZF->BlockOffset == m_BGZF->BlockLength ) {
132 m_BGZF->BlockAddress = ftello(m_BGZF->Stream);
133 m_BGZF->BlockOffset = 0;
134 m_BGZF->BlockLength = 0;
140 int BamReader::BgzfReadBlock(void) {
142 char header[BLOCK_HEADER_LENGTH];
143 int64_t blockAddress = ftello(m_BGZF->Stream);
145 int count = fread(header, 1, sizeof(header), m_BGZF->Stream);
147 m_BGZF->BlockLength = 0;
151 if (count != sizeof(header)) {
152 printf("read block failed - count != sizeof(header)\n");
156 if (!BgzfCheckBlockHeader(header)) {
157 printf("read block failed - CheckBgzfBlockHeader() returned false\n");
161 int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1;
162 char* compressedBlock = m_BGZF->CompressedBlock;
163 memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
164 int remaining = blockLength - BLOCK_HEADER_LENGTH;
166 count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF->Stream);
167 if (count != remaining) {
168 printf("read block failed - count != remaining\n");
172 count = BgzfInflateBlock(blockLength);
173 if (count < 0) { return -1; }
175 if (m_BGZF->BlockLength != 0) {
176 m_BGZF->BlockOffset = 0;
179 m_BGZF->BlockAddress = blockAddress;
180 m_BGZF->BlockLength = count;
184 // move file pointer to specified offset
185 bool BamReader::BgzfSeek(int64_t position) {
187 int blockOffset = (position & 0xFFFF);
188 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
189 if (fseeko(m_BGZF->Stream, blockAddress, SEEK_SET) != 0) {
190 printf("ERROR: Unable to seek in BAM file\n");
194 m_BGZF->BlockLength = 0;
195 m_BGZF->BlockAddress = blockAddress;
196 m_BGZF->BlockOffset = blockOffset;
200 // get file position in BAM file
201 int64_t BamReader::BgzfTell(void) {
202 return ( (m_BGZF->BlockAddress << 16) | (m_BGZF->BlockOffset & 0xFFFF) );
205 int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
207 // get region boundaries
208 uint32_t begin = left;
209 uint32_t end = m_references.at(refID).RefLength - 1;
211 // initialize list, bin '0' always a valid bin
215 // get rest of bins that contain this region
217 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }
218 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }
219 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }
220 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }
221 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
223 // return number of bins stored
227 unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
229 // initialize alignment end to starting position
230 unsigned int alignEnd = position;
232 // iterate over cigar operations
233 vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
234 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
235 for ( ; cigarIter != cigarEnd; ++cigarIter) {
236 char cigarType = (*cigarIter).Type;
237 if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
238 alignEnd += (*cigarIter).Length;
244 void BamReader::ClearIndex(void) {
247 // iterate over references
248 vector<RefIndex*>::iterator refIter = m_index->begin();
249 vector<RefIndex*>::iterator refEnd = m_index->end();
250 for ( ; refIter != refEnd; ++refIter) {
251 RefIndex* aRef = (*refIter);
253 // clear out BAM bins
255 BinVector::iterator binIter = (aRef->first)->begin();
256 BinVector::iterator binEnd = (aRef->first)->end();
257 for ( ; binIter != binEnd; ++binIter ) {
258 ChunkVector* chunks = (*binIter).second;
259 if ( chunks ) { delete chunks; chunks = NULL;}
264 // clear BAM linear offsets
265 if ( aRef->second ) { delete aRef->second; aRef->second = NULL; }
275 // closes the BAM file
276 void BamReader::Close(void) {
278 if (m_BGZF!=NULL && m_BGZF->IsOpen) {
284 m_headerText.clear();
285 m_isRegionSpecified = false;
288 const string BamReader::GetHeaderText(void) const {
292 const int BamReader::GetReferenceCount(void) const {
293 return m_references.size();
296 const RefVector BamReader::GetReferenceData(void) const {
300 const int BamReader::GetReferenceID(const string& refName) const {
302 // retrieve names from reference data
303 vector<string> refNames;
304 RefVector::const_iterator refIter = m_references.begin();
305 RefVector::const_iterator refEnd = m_references.end();
306 for ( ; refIter != refEnd; ++refIter) {
307 refNames.push_back( (*refIter).RefName );
310 // return 'index-of' refName ( if not found, returns refNames.size() )
311 return Index( refNames.begin(), refNames.end(), refName );
314 // get next alignment (from specified region, if given)
315 bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
317 // if valid alignment available
318 if ( LoadNextAlignment(bAlignment) ) {
320 // if region not specified, return success
321 if ( !m_isRegionSpecified ) { return true; }
323 // load next alignment until region overlap is found
324 while ( !IsOverlap(bAlignment) ) {
325 // if no valid alignment available (likely EOF) return failure
326 if ( !LoadNextAlignment(bAlignment) ) { return false; }
329 // return success (alignment found that overlaps region)
333 // no valid alignment
334 else { return false; }
337 int64_t BamReader::GetOffset(int refID, unsigned int left) {
339 // calculate which bins overlap this region
340 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
341 int numBins = BinsFromRegion(refID, left, bins);
343 // get bins for this reference
344 RefIndex* refIndex = m_index->at(refID);
345 BinVector* refBins = refIndex->first;
347 // get minimum offset to consider
348 LinearOffsetVector* linearOffsets = refIndex->second;
349 uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
351 // store offsets to beginning of alignment 'chunks'
352 std::vector<int64_t> chunkStarts;
354 // reference bin iterators
355 BinVector::const_iterator binIter;
356 BinVector::const_iterator binBegin = refBins->begin();
357 BinVector::const_iterator binEnd = refBins->end();
359 // store all alignment 'chunk' starts for bins in this region
360 for (int i = 0; i < numBins; ++i ) {
361 binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare<uint32_t, ChunkVector*>() );
362 if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) {
363 ChunkVector* binChunks = (*binIter).second;
364 ChunkVector::const_iterator chunkIter = binChunks->begin();
365 ChunkVector::const_iterator chunkEnd = binChunks->end();
366 for ( ; chunkIter != chunkEnd; ++chunkIter) {
367 if ( (*chunkIter).second > minOffset ) {
368 chunkStarts.push_back( (*chunkIter).first );
377 // if no alignments found
378 if ( chunkStarts.empty() ) { return -1; }
380 // else return smallest offset for alignment starts
381 else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
384 bool BamReader::IsOverlap(BamAlignment& bAlignment) {
386 // if on different reference sequence, quit
387 if ( bAlignment.RefID != m_currentRefID ) { return false; }
389 // read starts after left boundary
390 if ( bAlignment.Position >= (signed int) m_currentLeft) { return true; }
392 // return whether alignment end overlaps left boundary
393 return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
396 bool BamReader::Jump(int refID, unsigned int position) {
398 // if index available, and region is valid
399 if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (position <= m_references.at(refID).RefLength) ) {
400 m_currentRefID = refID;
401 m_currentLeft = position;
402 m_isRegionSpecified = true;
404 int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
405 if ( offset == -1 ) { return false; }
406 else { return BgzfSeek(offset); }
411 void BamReader::LoadHeaderData(void) {
413 // check to see if proper BAM header
415 if (BgzfRead(buffer, 4) != 4) {
416 printf("Could not read header type\n");
420 if (strncmp(buffer, "BAM\001", 4)) {
421 printf("wrong header type!\n");
425 // get BAM header text length
427 const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer);
429 // get BAM header text
430 char* headerText = (char*)calloc(headerTextLength + 1, 1);
431 BgzfRead(headerText, headerTextLength);
432 m_headerText = (string)((const char*)headerText);
434 // clean up calloc-ed temp variable
438 void BamReader::LoadIndexData(FILE* indexStream) {
440 // see if index is valid BAM index
442 fread(magic, 1, 4, indexStream);
443 if (strncmp(magic, "BAI\1", 4)) {
444 printf("Problem with index file - invalid format.\n");
449 // get number of reference sequences
451 fread(&numRefSeqs, 4, 1, indexStream);
453 // intialize BamIndex data structure
454 m_index = new BamIndex;
455 m_index->reserve(numRefSeqs);
457 // iterate over reference sequences
458 for (unsigned int i = 0; i < numRefSeqs; ++i) {
460 // get number of bins for this reference sequence
462 fread(&numBins, 4, 1, indexStream);
464 if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
466 // intialize BinVector
467 BinVector* bins = new BinVector;
468 bins->reserve(numBins);
470 // iterate over bins for that reference sequence
471 for (int j = 0; j < numBins; ++j) {
475 fread(&binID, 4, 1, indexStream);
477 // get number of regionChunks in this bin
479 fread(&numChunks, 4, 1, indexStream);
481 // intialize ChunkVector
482 ChunkVector* regionChunks = new ChunkVector;
483 regionChunks->reserve(numChunks);
485 // iterate over regionChunks in this bin
486 for (unsigned int k = 0; k < numChunks; ++k) {
488 // get chunk boundaries (left, right)
491 fread(&left, 8, 1, indexStream);
492 fread(&right, 8, 1, indexStream);
495 regionChunks->push_back( ChunkPair(left, right) );
498 // sort chunks for this bin
499 sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare<uint64_t, uint64_t>() );
501 // save binID, chunkVector for this bin
502 bins->push_back( BamBin(binID, regionChunks) );
505 // sort bins by binID
506 sort(bins->begin(), bins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
508 // load linear index for this reference sequence
510 // get number of linear offsets
511 int32_t numLinearOffsets;
512 fread(&numLinearOffsets, 4, 1, indexStream);
514 // intialize LinearOffsetVector
515 LinearOffsetVector* linearOffsets = new LinearOffsetVector;
516 linearOffsets->reserve(numLinearOffsets);
518 // iterate over linear offsets for this reference sequeence
519 for (int j = 0; j < numLinearOffsets; ++j) {
520 // get a linear offset
521 uint64_t linearOffset;
522 fread(&linearOffset, 8, 1, indexStream);
523 // store linear offset
524 linearOffsets->push_back(linearOffset);
527 // sort linear offsets
528 sort( linearOffsets->begin(), linearOffsets->end() );
530 // store index data for that reference sequence
531 m_index->push_back( new RefIndex(bins, linearOffsets) );
534 // close index file (.bai) and return
538 bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
540 // read in the 'block length' value, make sure it's not zero
543 const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer);
544 if ( blockLength == 0 ) { return false; }
546 // keep track of bytes read as method progresses
549 // read in core alignment data, make sure the right size of data was read
550 char x[BAM_CORE_SIZE];
551 if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
552 bytesRead += BAM_CORE_SIZE;
554 // set BamAlignment 'core' data and character data lengths
555 unsigned int tempValue;
556 unsigned int queryNameLength;
557 unsigned int numCigarOperations;
558 unsigned int querySequenceLength;
560 //bAlignment.RefID = BgzfUnpackUnsignedInt(&x[0]);
561 //bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]);
562 bAlignment.RefID = BgzfUnpackSignedInt(&x[0]);
563 bAlignment.Position = BgzfUnpackSignedInt(&x[4]);
565 tempValue = BgzfUnpackUnsignedInt(&x[8]);
566 bAlignment.Bin = tempValue >> 16;
567 bAlignment.MapQuality = tempValue >> 8 & 0xff;
568 queryNameLength = tempValue & 0xff;
570 tempValue = BgzfUnpackUnsignedInt(&x[12]);
571 bAlignment.AlignmentFlag = tempValue >> 16;
572 numCigarOperations = tempValue & 0xffff;
574 querySequenceLength = BgzfUnpackUnsignedInt(&x[16]);
575 //bAlignment.MateRefID = BgzfUnpackUnsignedInt(&x[20]);
576 //bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]);
577 //bAlignment.InsertSize = BgzfUnpackUnsignedInt(&x[28]);
578 bAlignment.MateRefID = BgzfUnpackSignedInt(&x[20]);
579 bAlignment.MatePosition = BgzfUnpackSignedInt(&x[24]);
580 bAlignment.InsertSize = BgzfUnpackSignedInt(&x[28]);
582 // calculate lengths/offsets
583 const unsigned int dataLength = blockLength - BAM_CORE_SIZE;
584 const unsigned int cigarDataOffset = queryNameLength;
585 const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4);
586 const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2;
587 const unsigned int tagDataOffset = qualDataOffset + querySequenceLength;
588 const unsigned int tagDataLen = dataLength - tagDataOffset;
590 // set up destination buffers for character data
591 char* allCharData = (char*)calloc(sizeof(char), dataLength);
592 uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
593 char* seqData = ((char*)allCharData) + seqDataOffset;
594 char* qualData = ((char*)allCharData) + qualDataOffset;
595 char* tagData = ((char*)allCharData) + tagDataOffset;
597 // get character data - make sure proper data size was read
598 if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; }
601 bytesRead += dataLength;
603 // clear out any previous string data
604 bAlignment.Name.clear();
605 bAlignment.QueryBases.clear();
606 bAlignment.Qualities.clear();
607 bAlignment.AlignedBases.clear();
608 bAlignment.CigarData.clear();
609 bAlignment.TagData.clear();
612 bAlignment.Name = (string)((const char*)(allCharData));
614 // save query sequence
615 for (unsigned int i = 0; i < querySequenceLength; ++i) {
616 char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
617 bAlignment.QueryBases.append( 1, singleBase );
620 // save sequence length
621 bAlignment.Length = bAlignment.QueryBases.length();
624 for (unsigned int i = 0; i < querySequenceLength; ++i) {
625 char singleQuality = (char)(qualData[i]+33); // conversion from QV to FASTQ character
626 bAlignment.Qualities.append( 1, singleQuality );
629 // save CIGAR-related data;
631 for (unsigned int i = 0; i < numCigarOperations; ++i) {
633 // build CigarOp struct
635 op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
636 op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
639 bAlignment.CigarData.push_back(op);
641 // build AlignedBases string
645 case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases
646 case ('S') : k += op.Length; // for 'S' - skip over query bases
649 case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character
652 case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character;
655 case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
659 case ('H') : break; // for 'H' - do nothing, move to next op
661 default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here
666 // read in the tag data
667 bAlignment.TagData.resize(tagDataLen);
668 memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
675 void BamReader::LoadReferenceData(void) {
677 // get number of reference sequences
680 const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer);
681 if (numberRefSeqs == 0) { return; }
682 m_references.reserve((int)numberRefSeqs);
684 // iterate over all references in header
685 for (unsigned int i = 0; i != numberRefSeqs; ++i) {
687 // get length of reference name
689 const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer);
690 char* refName = (char*)calloc(refNameLength, 1);
692 // get reference name and reference sequence length
693 BgzfRead(refName, refNameLength);
695 const unsigned int refLength = BgzfUnpackUnsignedInt(buffer);
697 // store data for reference
699 aReference.RefName = (string)((const char*)refName);
700 aReference.RefLength = refLength;
701 m_references.push_back(aReference);
703 // clean up calloc-ed temp variable
708 // opens BAM file (and index)
709 void BamReader::Open(const string& filename, const string& indexFilename) {
711 // open the BGZF file for reading, retrieve header text & reference data
712 m_BGZF = new BgzfData;
717 // store file offset of first alignment
718 m_alignmentsBeginOffset = BgzfTell();
720 // open index file & load index data (if exists)
721 OpenIndex(indexFilename);
724 void BamReader::OpenIndex(const string& indexFilename) {
726 // if index file exists
727 if (!indexFilename.empty()) {
730 FILE* indexStream = fopen(indexFilename.c_str(), "rb");
734 printf("ERROR: Unable to open the BAM index file %s for reading.\n", indexFilename.c_str() );
738 // build up index data structure
739 LoadIndexData(indexStream);
743 bool BamReader::Rewind(void) {
745 // find first reference that has alignments in the BAM file
747 int refCount = m_references.size();
748 for ( ; refID < refCount; ++refID ) {
749 if ( m_references.at(refID).RefHasAlignments ) { break; }
752 // store default bounds for first alignment
753 m_currentRefID = refID;
755 m_isRegionSpecified = false;
757 // return success/failure of seek
758 return BgzfSeek(m_alignmentsBeginOffset);