1 // ***************************************************************************
2 // BamReader (c) 2009 Derek Barnett
3 // Marth Lab, Deptartment of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Provides the basic functionality for reading BAM files
7 // ***************************************************************************
10 //#include "STLUtilities.h"
18 // static character constants
19 const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN";
20 const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
23 BamReader::BamReader(void)
25 , m_isIndexLoaded(false)
26 , m_alignmentsBeginOffset(0)
27 , m_isRegionSpecified(false)
33 BamReader::~BamReader(void) {
39 // checks BGZF block header
40 bool BamReader::BgzfCheckBlockHeader(char* header) {
42 return (header[0] == GZIP_ID1 &&
43 header[1] == (char)GZIP_ID2 &&
44 header[2] == Z_DEFLATED &&
45 (header[3] & FLG_FEXTRA) != 0 &&
46 BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
47 header[12] == BGZF_ID1 &&
48 header[13] == BGZF_ID2 &&
49 BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN);
52 // closes the BAM file
53 void BamReader::BgzfClose(void) {
54 m_BGZF.IsOpen = false;
55 fclose(m_BGZF.Stream);
58 // de-compresses the current block
59 int BamReader::BgzfInflateBlock(int blockLength) {
61 // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
65 zs.next_in = (Bytef*)m_BGZF.CompressedBlock + 18;
66 zs.avail_in = blockLength - 16;
67 zs.next_out = (Bytef*)m_BGZF.UncompressedBlock;
68 zs.avail_out = m_BGZF.UncompressedBlockSize;
70 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
72 printf("inflateInit failed\n");
76 status = inflate(&zs, Z_FINISH);
77 if (status != Z_STREAM_END) {
79 printf("inflate failed\n");
83 status = inflateEnd(&zs);
85 printf("inflateEnd failed\n");
92 // opens the BAM file for reading
93 void BamReader::BgzfOpen(const string& filename) {
95 m_BGZF.Stream = fopen(filename.c_str(), "rb");
97 cerr << "ERROR: Unable to open the BAM file " << filename << "for reading." << endl;
101 m_BGZF.IsOpen = true;
104 // reads BGZF data into buffer
105 unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) {
107 if (dataLength == 0) { return 0; }
110 unsigned int numBytesRead = 0;
111 while (numBytesRead < dataLength) {
113 int bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
114 if (bytesAvailable <= 0) {
115 if ( BgzfReadBlock() != 0 ) { return -1; }
116 bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
117 if ( bytesAvailable <= 0 ) { break; }
120 char* buffer = m_BGZF.UncompressedBlock;
121 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
122 memcpy(output, buffer + m_BGZF.BlockOffset, copyLength);
124 m_BGZF.BlockOffset += copyLength;
125 output += copyLength;
126 numBytesRead += copyLength;
129 if ( m_BGZF.BlockOffset == m_BGZF.BlockLength ) {
130 m_BGZF.BlockAddress = ftello64(m_BGZF.Stream);
131 m_BGZF.BlockOffset = 0;
132 m_BGZF.BlockLength = 0;
138 int BamReader::BgzfReadBlock(void) {
140 char header[BLOCK_HEADER_LENGTH];
141 int64_t blockAddress = ftello(m_BGZF.Stream);
143 int count = fread(header, 1, sizeof(header), m_BGZF.Stream);
145 m_BGZF.BlockLength = 0;
149 if (count != sizeof(header)) {
150 printf("read block failed - count != sizeof(header)\n");
154 if (!BgzfCheckBlockHeader(header)) {
155 printf("read block failed - CheckBgzfBlockHeader() returned false\n");
159 int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1;
160 char* compressedBlock = m_BGZF.CompressedBlock;
161 memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
162 int remaining = blockLength - BLOCK_HEADER_LENGTH;
164 count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF.Stream);
165 if (count != remaining) {
166 printf("read block failed - count != remaining\n");
170 count = BgzfInflateBlock(blockLength);
171 if (count < 0) { return -1; }
173 if (m_BGZF.BlockLength != 0) {
174 m_BGZF.BlockOffset = 0;
177 m_BGZF.BlockAddress = blockAddress;
178 m_BGZF.BlockLength = count;
182 // move file pointer to specified offset
183 bool BamReader::BgzfSeek(int64_t position) {
185 int blockOffset = (position & 0xFFFF);
186 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
187 if (fseeko(m_BGZF.Stream, blockAddress, SEEK_SET) != 0) {
188 printf("ERROR: Unable to seek in BAM file\n");
192 m_BGZF.BlockLength = 0;
193 m_BGZF.BlockAddress = blockAddress;
194 m_BGZF.BlockOffset = blockOffset;
198 // get file position in BAM file
199 int64_t BamReader::BgzfTell(void) {
200 return ( (m_BGZF.BlockAddress << 16) | (m_BGZF.BlockOffset & 0xFFFF) );
203 int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
205 // get region boundaries
206 uint32_t begin = left;
207 uint32_t end = m_references.at(refID).RefLength - 1;
209 // initialize list, bin '0' always a valid bin
213 // get rest of bins that contain this region
215 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }
216 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }
217 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }
218 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }
219 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
221 // return number of bins stored
225 unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
227 // initialize alignment end to starting position
228 unsigned int alignEnd = position;
230 // iterate over cigar operations
231 vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
232 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
233 for ( ; cigarIter != cigarEnd; ++cigarIter) {
234 char cigarType = (*cigarIter).Type;
235 if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
236 alignEnd += (*cigarIter).Length;
242 void BamReader::ClearIndex(void) {
245 // iterate over references
246 vector<RefIndex*>::iterator refIter = m_index->begin();
247 vector<RefIndex*>::iterator refEnd = m_index->end();
248 for ( ; refIter != refEnd; ++refIter) {
249 RefIndex* aRef = (*refIter);
251 // clear out BAM bins
253 BinVector::iterator binIter = (aRef->first)->begin();
254 BinVector::iterator binEnd = (aRef->first)->end();
255 for ( ; binIter != binEnd; ++binIter ) {
256 ChunkVector* chunks = (*binIter).second;
257 if ( chunks ) { delete chunks; }
261 // clear BAM linear offsets
262 if ( aRef->second ) { delete aRef->second; }
270 // closes the BAM file
271 void BamReader::Close(void) {
272 if(m_BGZF.IsOpen) { BgzfClose(); }
273 if(m_index) { delete m_index; m_index = NULL; }
274 m_isRegionSpecified = false;
277 const string BamReader::GetHeaderText(void) const {
281 const int BamReader::GetReferenceCount(void) const {
282 return m_references.size();
285 const RefVector BamReader::GetReferenceData(void) const {
289 const int BamReader::GetReferenceID(const string& refName) const {
291 // retrieve names from reference data
292 vector<string> refNames;
293 RefVector::const_iterator refIter = m_references.begin();
294 RefVector::const_iterator refEnd = m_references.end();
295 for ( ; refIter != refEnd; ++refIter) {
296 refNames.push_back( (*refIter).RefName );
299 // return 'index-of' refName ( if not found, returns refNames.size() )
300 return Index( refNames.begin(), refNames.end(), refName );
303 // get next alignment (from specified region, if given)
304 bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
306 // if valid alignment available
307 if ( LoadNextAlignment(bAlignment) ) {
309 // if region not specified, return success
310 if ( !m_isRegionSpecified ) { return true; }
312 // load next alignment until region overlap is found
313 while ( !IsOverlap(bAlignment) ) {
314 // if no valid alignment available (likely EOF) return failure
315 if ( !LoadNextAlignment(bAlignment) ) { return false; }
318 // return success (alignment found that overlaps region)
322 // no valid alignment
323 else { return false; }
326 int64_t BamReader::GetOffset(int refID, unsigned int left) {
328 // calculate which bins overlap this region
329 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
330 int numBins = BinsFromRegion(refID, left, bins);
332 // get bins for this reference
333 RefIndex* refIndex = m_index->at(refID);
334 BinVector* refBins = refIndex->first;
336 // get minimum offset to consider
337 LinearOffsetVector* linearOffsets = refIndex->second;
338 uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
340 // store offsets to beginning of alignment 'chunks'
341 std::vector<int64_t> chunkStarts;
343 // reference bin iterators
344 BinVector::const_iterator binIter;
345 BinVector::const_iterator binBegin = refBins->begin();
346 BinVector::const_iterator binEnd = refBins->end();
348 // store all alignment 'chunk' starts for bins in this region
349 for (int i = 0; i < numBins; ++i ) {
350 binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare<uint32_t, ChunkVector*>() );
351 if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) {
352 ChunkVector* binChunks = (*binIter).second;
353 ChunkVector::const_iterator chunkIter = binChunks->begin();
354 ChunkVector::const_iterator chunkEnd = binChunks->end();
355 for ( ; chunkIter != chunkEnd; ++chunkIter) {
356 if ( (*chunkIter).second > minOffset ) {
357 chunkStarts.push_back( (*chunkIter).first );
366 // if no alignments found
367 if ( chunkStarts.empty() ) { return -1; }
369 // else return smallest offset for alignment starts
370 else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
373 bool BamReader::IsOverlap(BamAlignment& bAlignment) {
375 // if on different reference sequence, quit
376 if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }
378 // read starts after left boundary
379 if ( bAlignment.Position >= m_currentLeft) { return true; }
381 // return whether alignment end overlaps left boundary
382 return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
385 bool BamReader::Jump(int refID, unsigned int left) {
387 // if index available, and region is valid
388 if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) {
389 m_currentRefID = refID;
390 m_currentLeft = left;
391 m_isRegionSpecified = true;
393 int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
394 if ( offset == -1 ) { return false; }
395 else { return BgzfSeek(offset); }
400 void BamReader::LoadHeaderData(void) {
402 // check to see if proper BAM header
404 if (BgzfRead(buffer, 4) != 4) {
405 printf("Could not read header type\n");
408 if (strncmp(buffer, "BAM\001", 4)) {
409 printf("wrong header type!\n");
413 // get BAM header text length
415 const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer);
417 // get BAM header text
418 char* headerText = (char*)calloc(headerTextLength + 1, 1);
419 BgzfRead(headerText, headerTextLength);
420 m_headerText = (string)((const char*)headerText);
422 // clean up calloc-ed temp variable
426 void BamReader::LoadIndexData(FILE* indexStream) {
428 // see if index is valid BAM index
430 fread(magic, 1, 4, indexStream);
431 if (strncmp(magic, "BAI\1", 4)) {
432 printf("Problem with index file - invalid format.\n");
437 // get number of reference sequences
439 fread(&numRefSeqs, 4, 1, indexStream);
441 // intialize BamIndex data structure
442 m_index = new BamIndex;
443 m_index->reserve(numRefSeqs);
445 // iterate over reference sequences
446 for (unsigned int i = 0; i < numRefSeqs; ++i) {
448 // get number of bins for this reference sequence
450 fread(&numBins, 4, 1, indexStream);
452 if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
454 // intialize BinVector
455 BinVector* bins = new BinVector;
456 bins->reserve(numBins);
458 // iterate over bins for that reference sequence
459 for (int j = 0; j < numBins; ++j) {
463 fread(&binID, 4, 1, indexStream);
465 // get number of regionChunks in this bin
467 fread(&numChunks, 4, 1, indexStream);
469 // intialize ChunkVector
470 ChunkVector* regionChunks = new ChunkVector;
471 regionChunks->reserve(numChunks);
473 // iterate over regionChunks in this bin
474 for (unsigned int k = 0; k < numChunks; ++k) {
476 // get chunk boundaries (left, right)
479 fread(&left, 8, 1, indexStream);
480 fread(&right, 8, 1, indexStream);
483 regionChunks->push_back( ChunkPair(left, right) );
486 // sort chunks for this bin
487 sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare<uint64_t, uint64_t>() );
489 // save binID, chunkVector for this bin
490 bins->push_back( BamBin(binID, regionChunks) );
493 // sort bins by binID
494 sort(bins->begin(), bins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
496 // load linear index for this reference sequence
498 // get number of linear offsets
499 int32_t numLinearOffsets;
500 fread(&numLinearOffsets, 4, 1, indexStream);
502 // intialize LinearOffsetVector
503 LinearOffsetVector* linearOffsets = new LinearOffsetVector;
504 linearOffsets->reserve(numLinearOffsets);
506 // iterate over linear offsets for this reference sequeence
507 for (int j = 0; j < numLinearOffsets; ++j) {
508 // get a linear offset
509 uint64_t linearOffset;
510 fread(&linearOffset, 8, 1, indexStream);
511 // store linear offset
512 linearOffsets->push_back(linearOffset);
515 // sort linear offsets
516 sort( linearOffsets->begin(), linearOffsets->end() );
518 // store index data for that reference sequence
519 m_index->push_back( new RefIndex(bins, linearOffsets) );
522 // close index file (.bai) and return
526 bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
528 // read in the 'block length' value, make sure it's not zero
531 const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer);
532 if ( blockLength == 0 ) { return false; }
534 // keep track of bytes read as method progresses
537 // read in core alignment data, make sure the right size of data was read
538 char x[BAM_CORE_SIZE];
539 if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
540 bytesRead += BAM_CORE_SIZE;
542 // set BamAlignment 'core' data and character data lengths
543 unsigned int tempValue;
544 unsigned int queryNameLength;
545 unsigned int numCigarOperations;
546 unsigned int querySequenceLength;
548 bAlignment.RefID = BgzfUnpackUnsignedInt(&x[0]);
549 bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]);
551 tempValue = BgzfUnpackUnsignedInt(&x[8]);
552 bAlignment.Bin = tempValue >> 16;
553 bAlignment.MapQuality = tempValue >> 8 & 0xff;
554 queryNameLength = tempValue & 0xff;
556 tempValue = BgzfUnpackUnsignedInt(&x[12]);
557 bAlignment.AlignmentFlag = tempValue >> 16;
558 numCigarOperations = tempValue & 0xffff;
560 querySequenceLength = BgzfUnpackUnsignedInt(&x[16]);
561 bAlignment.MateRefID = BgzfUnpackUnsignedInt(&x[20]);
562 bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]);
563 bAlignment.InsertSize = BgzfUnpackUnsignedInt(&x[28]);
565 // calculate lengths/offsets
566 const unsigned int dataLength = blockLength - BAM_CORE_SIZE;
567 const unsigned int cigarDataOffset = queryNameLength;
568 const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4);
569 const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2;
570 const unsigned int tagDataOffset = qualDataOffset + querySequenceLength;
571 const unsigned int tagDataLen = dataLength - tagDataOffset;
573 // set up destination buffers for character data
574 char* allCharData = (char*)calloc(sizeof(char), dataLength);
575 uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
576 char* seqData = ((char*)allCharData) + seqDataOffset;
577 char* qualData = ((char*)allCharData) + qualDataOffset;
578 char* tagData = ((char*)allCharData) + tagDataOffset;
580 // get character data - make sure proper data size was read
581 if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; }
584 bytesRead += dataLength;
586 // clear out any previous string data
587 bAlignment.Name.clear();
588 bAlignment.QueryBases.clear();
589 bAlignment.Qualities.clear();
590 bAlignment.AlignedBases.clear();
591 bAlignment.CigarData.clear();
592 bAlignment.TagData.clear();
595 bAlignment.Name = (string)((const char*)(allCharData));
597 // save query sequence
598 for (unsigned int i = 0; i < querySequenceLength; ++i) {
599 char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
600 bAlignment.QueryBases.append( 1, singleBase );
603 // save sequence length
604 bAlignment.Length = bAlignment.QueryBases.length();
607 for (unsigned int i = 0; i < querySequenceLength; ++i) {
608 char singleQuality = (char)(qualData[i]+33); // conversion from QV to FASTQ character
609 bAlignment.Qualities.append( 1, singleQuality );
612 // save CIGAR-related data;
614 for (unsigned int i = 0; i < numCigarOperations; ++i) {
616 // build CigarOp struct
618 op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
619 op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
622 bAlignment.CigarData.push_back(op);
624 // build AlignedBases string
628 case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases
629 case ('S') : k += op.Length; // for 'S' - skip over query bases
633 case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding;
636 case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
640 case ('H') : break; // for 'H' - do nothing, move to next op
642 default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here
647 // read in the tag data
648 bAlignment.TagData.resize(tagDataLen);
649 memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
656 void BamReader::LoadReferenceData(void) {
658 // get number of reference sequences
661 const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer);
662 if (numberRefSeqs == 0) { return; }
663 m_references.reserve((int)numberRefSeqs);
665 // iterate over all references in header
666 for (unsigned int i = 0; i != numberRefSeqs; ++i) {
668 // get length of reference name
670 const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer);
671 char* refName = (char*)calloc(refNameLength, 1);
673 // get reference name and reference sequence length
674 BgzfRead(refName, refNameLength);
676 const unsigned int refLength = BgzfUnpackUnsignedInt(buffer);
678 // store data for reference
680 aReference.RefName = (string)((const char*)refName);
681 aReference.RefLength = refLength;
682 m_references.push_back(aReference);
684 // clean up calloc-ed temp variable
689 // opens BAM file (and index)
690 void BamReader::Open(const string& filename, const string& indexFilename) {
692 // open the BGZF file for reading, retrieve header text & reference data
697 // store file offset of first alignment
698 m_alignmentsBeginOffset = BgzfTell();
700 // open index file & load index data (if exists)
701 OpenIndex(indexFilename);
704 void BamReader::OpenIndex(const string& indexFilename) {
706 // if index file exists
707 if (!indexFilename.empty()) {
710 FILE* indexStream = fopen(indexFilename.c_str(), "rb");
714 cerr << "ERROR: Unable to open the BAM index file " << indexFilename << "for reading." << endl;
718 // build up index data structure
719 LoadIndexData(indexStream);
723 bool BamReader::Rewind(void) {
725 // find first reference that has alignments in the BAM file
727 int refCount = m_references.size();
728 for ( ; refID < refCount; ++refID ) {
729 if ( m_references.at(refID).RefHasAlignments ) { break; }
732 // store default bounds for first alignment
733 m_currentRefID = refID;
735 m_isRegionSpecified = false;
737 // return success/failure of seek
738 return BgzfSeek(m_alignmentsBeginOffset);