1 // ***************************************************************************
2 // BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 15 July 2009 (DB)
7 // ---------------------------------------------------------------------------
8 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
10 // ---------------------------------------------------------------------------
11 // Provides the basic functionality for reading BAM files
12 // ***************************************************************************
15 #include "BamReader.h"
16 using namespace BamTools;
19 // static character constants
20 const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN";
21 const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
24 BamReader::BamReader(void)
26 , m_isIndexLoaded(false)
27 , m_alignmentsBeginOffset(0)
28 , m_isRegionSpecified(false)
34 BamReader::~BamReader(void) {
38 // checks BGZF block header
39 bool BamReader::BgzfCheckBlockHeader(char* header) {
41 return (header[0] == GZIP_ID1 &&
42 header[1] == (char)GZIP_ID2 &&
43 header[2] == Z_DEFLATED &&
44 (header[3] & FLG_FEXTRA) != 0 &&
45 BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
46 header[12] == BGZF_ID1 &&
47 header[13] == BGZF_ID2 &&
48 BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN
52 // closes the BAM file
53 void BamReader::BgzfClose(void) {
54 m_BGZF.IsOpen = false;
55 fclose(m_BGZF.Stream);
58 // de-compresses the current block
59 int BamReader::BgzfInflateBlock(int blockLength) {
61 // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
65 zs.next_in = (Bytef*)m_BGZF.CompressedBlock + 18;
66 zs.avail_in = blockLength - 16;
67 zs.next_out = (Bytef*)m_BGZF.UncompressedBlock;
68 zs.avail_out = m_BGZF.UncompressedBlockSize;
70 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
72 printf("inflateInit failed\n");
76 status = inflate(&zs, Z_FINISH);
77 if (status != Z_STREAM_END) {
79 printf("inflate failed\n");
83 status = inflateEnd(&zs);
85 printf("inflateEnd failed\n");
92 // opens the BAM file for reading
93 void BamReader::BgzfOpen(const string& filename) {
95 m_BGZF.Stream = fopen(filename.c_str(), "rb");
97 printf("ERROR: Unable to open the BAM file %s for reading.\n", filename.c_str() );
101 m_BGZF.IsOpen = true;
104 // reads BGZF data into buffer
105 unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) {
107 if (dataLength == 0) { return 0; }
110 unsigned int numBytesRead = 0;
111 while (numBytesRead < dataLength) {
113 int bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
114 if (bytesAvailable <= 0) {
115 if ( BgzfReadBlock() != 0 ) { return -1; }
116 bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
117 if ( bytesAvailable <= 0 ) { break; }
120 char* buffer = m_BGZF.UncompressedBlock;
121 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
122 memcpy(output, buffer + m_BGZF.BlockOffset, copyLength);
124 m_BGZF.BlockOffset += copyLength;
125 output += copyLength;
126 numBytesRead += copyLength;
129 if ( m_BGZF.BlockOffset == m_BGZF.BlockLength ) {
130 m_BGZF.BlockAddress = ftello64(m_BGZF.Stream);
131 m_BGZF.BlockOffset = 0;
132 m_BGZF.BlockLength = 0;
138 int BamReader::BgzfReadBlock(void) {
140 char header[BLOCK_HEADER_LENGTH];
141 int64_t blockAddress = ftello(m_BGZF.Stream);
143 int count = fread(header, 1, sizeof(header), m_BGZF.Stream);
145 m_BGZF.BlockLength = 0;
149 if (count != sizeof(header)) {
150 printf("read block failed - count != sizeof(header)\n");
154 if (!BgzfCheckBlockHeader(header)) {
155 printf("read block failed - CheckBgzfBlockHeader() returned false\n");
159 int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1;
160 char* compressedBlock = m_BGZF.CompressedBlock;
161 memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
162 int remaining = blockLength - BLOCK_HEADER_LENGTH;
164 count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF.Stream);
165 if (count != remaining) {
166 printf("read block failed - count != remaining\n");
170 count = BgzfInflateBlock(blockLength);
171 if (count < 0) { return -1; }
173 if (m_BGZF.BlockLength != 0) {
174 m_BGZF.BlockOffset = 0;
177 m_BGZF.BlockAddress = blockAddress;
178 m_BGZF.BlockLength = count;
182 // move file pointer to specified offset
183 bool BamReader::BgzfSeek(int64_t position) {
185 int blockOffset = (position & 0xFFFF);
186 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
187 if (fseeko(m_BGZF.Stream, blockAddress, SEEK_SET) != 0) {
188 printf("ERROR: Unable to seek in BAM file\n");
192 m_BGZF.BlockLength = 0;
193 m_BGZF.BlockAddress = blockAddress;
194 m_BGZF.BlockOffset = blockOffset;
198 // get file position in BAM file
199 int64_t BamReader::BgzfTell(void) {
200 return ( (m_BGZF.BlockAddress << 16) | (m_BGZF.BlockOffset & 0xFFFF) );
203 int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
205 // get region boundaries
206 uint32_t begin = left;
207 uint32_t end = m_references.at(refID).RefLength - 1;
209 // initialize list, bin '0' always a valid bin
213 // get rest of bins that contain this region
215 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }
216 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }
217 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }
218 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }
219 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
221 // return number of bins stored
225 unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
227 // initialize alignment end to starting position
228 unsigned int alignEnd = position;
230 // iterate over cigar operations
231 vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
232 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
233 for ( ; cigarIter != cigarEnd; ++cigarIter) {
234 char cigarType = (*cigarIter).Type;
235 if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
236 alignEnd += (*cigarIter).Length;
242 void BamReader::ClearIndex(void) {
245 // iterate over references
246 vector<RefIndex*>::iterator refIter = m_index->begin();
247 vector<RefIndex*>::iterator refEnd = m_index->end();
248 for ( ; refIter != refEnd; ++refIter) {
249 RefIndex* aRef = (*refIter);
251 // clear out BAM bins
253 BinVector::iterator binIter = (aRef->first)->begin();
254 BinVector::iterator binEnd = (aRef->first)->end();
255 for ( ; binIter != binEnd; ++binIter ) {
256 ChunkVector* chunks = (*binIter).second;
257 if ( chunks ) { delete chunks; chunks = NULL;}
262 // clear BAM linear offsets
263 if ( aRef->second ) { delete aRef->second; aRef->second = NULL; }
273 // closes the BAM file
274 void BamReader::Close(void) {
275 if(m_BGZF.IsOpen) { BgzfClose(); }
277 m_headerText.clear();
278 m_isRegionSpecified = false;
281 const string BamReader::GetHeaderText(void) const {
285 const int BamReader::GetReferenceCount(void) const {
286 return m_references.size();
289 const RefVector BamReader::GetReferenceData(void) const {
293 const int BamReader::GetReferenceID(const string& refName) const {
295 // retrieve names from reference data
296 vector<string> refNames;
297 RefVector::const_iterator refIter = m_references.begin();
298 RefVector::const_iterator refEnd = m_references.end();
299 for ( ; refIter != refEnd; ++refIter) {
300 refNames.push_back( (*refIter).RefName );
303 // return 'index-of' refName ( if not found, returns refNames.size() )
304 return Index( refNames.begin(), refNames.end(), refName );
307 // get next alignment (from specified region, if given)
308 bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
310 // if valid alignment available
311 if ( LoadNextAlignment(bAlignment) ) {
313 // if region not specified, return success
314 if ( !m_isRegionSpecified ) { return true; }
316 // load next alignment until region overlap is found
317 while ( !IsOverlap(bAlignment) ) {
318 // if no valid alignment available (likely EOF) return failure
319 if ( !LoadNextAlignment(bAlignment) ) { return false; }
322 // return success (alignment found that overlaps region)
326 // no valid alignment
327 else { return false; }
330 int64_t BamReader::GetOffset(int refID, unsigned int left) {
332 // calculate which bins overlap this region
333 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
334 int numBins = BinsFromRegion(refID, left, bins);
336 // get bins for this reference
337 RefIndex* refIndex = m_index->at(refID);
338 BinVector* refBins = refIndex->first;
340 // get minimum offset to consider
341 LinearOffsetVector* linearOffsets = refIndex->second;
342 uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
344 // store offsets to beginning of alignment 'chunks'
345 std::vector<int64_t> chunkStarts;
347 // reference bin iterators
348 BinVector::const_iterator binIter;
349 BinVector::const_iterator binBegin = refBins->begin();
350 BinVector::const_iterator binEnd = refBins->end();
352 // store all alignment 'chunk' starts for bins in this region
353 for (int i = 0; i < numBins; ++i ) {
354 binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare<uint32_t, ChunkVector*>() );
355 if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) {
356 ChunkVector* binChunks = (*binIter).second;
357 ChunkVector::const_iterator chunkIter = binChunks->begin();
358 ChunkVector::const_iterator chunkEnd = binChunks->end();
359 for ( ; chunkIter != chunkEnd; ++chunkIter) {
360 if ( (*chunkIter).second > minOffset ) {
361 chunkStarts.push_back( (*chunkIter).first );
370 // if no alignments found
371 if ( chunkStarts.empty() ) { return -1; }
373 // else return smallest offset for alignment starts
374 else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
377 bool BamReader::IsOverlap(BamAlignment& bAlignment) {
379 // if on different reference sequence, quit
380 if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }
382 // read starts after left boundary
383 if ( bAlignment.Position >= m_currentLeft) { return true; }
385 // return whether alignment end overlaps left boundary
386 return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
389 bool BamReader::Jump(int refID, unsigned int position) {
391 // if index available, and region is valid
392 if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (position <= m_references.at(refID).RefLength) ) {
393 m_currentRefID = refID;
394 m_currentLeft = position;
395 m_isRegionSpecified = true;
397 int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
398 if ( offset == -1 ) { return false; }
399 else { return BgzfSeek(offset); }
404 void BamReader::LoadHeaderData(void) {
406 // check to see if proper BAM header
408 if (BgzfRead(buffer, 4) != 4) {
409 printf("Could not read header type\n");
412 if (strncmp(buffer, "BAM\001", 4)) {
413 printf("wrong header type!\n");
417 // get BAM header text length
419 const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer);
421 // get BAM header text
422 char* headerText = (char*)calloc(headerTextLength + 1, 1);
423 BgzfRead(headerText, headerTextLength);
424 m_headerText = (string)((const char*)headerText);
426 // clean up calloc-ed temp variable
430 void BamReader::LoadIndexData(FILE* indexStream) {
432 // see if index is valid BAM index
434 fread(magic, 1, 4, indexStream);
435 if (strncmp(magic, "BAI\1", 4)) {
436 printf("Problem with index file - invalid format.\n");
441 // get number of reference sequences
443 fread(&numRefSeqs, 4, 1, indexStream);
445 // intialize BamIndex data structure
446 m_index = new BamIndex;
447 m_index->reserve(numRefSeqs);
449 // iterate over reference sequences
450 for (unsigned int i = 0; i < numRefSeqs; ++i) {
452 // get number of bins for this reference sequence
454 fread(&numBins, 4, 1, indexStream);
456 if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
458 // intialize BinVector
459 BinVector* bins = new BinVector;
460 bins->reserve(numBins);
462 // iterate over bins for that reference sequence
463 for (int j = 0; j < numBins; ++j) {
467 fread(&binID, 4, 1, indexStream);
469 // get number of regionChunks in this bin
471 fread(&numChunks, 4, 1, indexStream);
473 // intialize ChunkVector
474 ChunkVector* regionChunks = new ChunkVector;
475 regionChunks->reserve(numChunks);
477 // iterate over regionChunks in this bin
478 for (unsigned int k = 0; k < numChunks; ++k) {
480 // get chunk boundaries (left, right)
483 fread(&left, 8, 1, indexStream);
484 fread(&right, 8, 1, indexStream);
487 regionChunks->push_back( ChunkPair(left, right) );
490 // sort chunks for this bin
491 sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare<uint64_t, uint64_t>() );
493 // save binID, chunkVector for this bin
494 bins->push_back( BamBin(binID, regionChunks) );
497 // sort bins by binID
498 sort(bins->begin(), bins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
500 // load linear index for this reference sequence
502 // get number of linear offsets
503 int32_t numLinearOffsets;
504 fread(&numLinearOffsets, 4, 1, indexStream);
506 // intialize LinearOffsetVector
507 LinearOffsetVector* linearOffsets = new LinearOffsetVector;
508 linearOffsets->reserve(numLinearOffsets);
510 // iterate over linear offsets for this reference sequeence
511 for (int j = 0; j < numLinearOffsets; ++j) {
512 // get a linear offset
513 uint64_t linearOffset;
514 fread(&linearOffset, 8, 1, indexStream);
515 // store linear offset
516 linearOffsets->push_back(linearOffset);
519 // sort linear offsets
520 sort( linearOffsets->begin(), linearOffsets->end() );
522 // store index data for that reference sequence
523 m_index->push_back( new RefIndex(bins, linearOffsets) );
526 // close index file (.bai) and return
530 bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
532 // read in the 'block length' value, make sure it's not zero
535 const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer);
536 if ( blockLength == 0 ) { return false; }
538 // keep track of bytes read as method progresses
541 // read in core alignment data, make sure the right size of data was read
542 char x[BAM_CORE_SIZE];
543 if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
544 bytesRead += BAM_CORE_SIZE;
546 // set BamAlignment 'core' data and character data lengths
547 unsigned int tempValue;
548 unsigned int queryNameLength;
549 unsigned int numCigarOperations;
550 unsigned int querySequenceLength;
552 bAlignment.RefID = BgzfUnpackUnsignedInt(&x[0]);
553 bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]);
555 tempValue = BgzfUnpackUnsignedInt(&x[8]);
556 bAlignment.Bin = tempValue >> 16;
557 bAlignment.MapQuality = tempValue >> 8 & 0xff;
558 queryNameLength = tempValue & 0xff;
560 tempValue = BgzfUnpackUnsignedInt(&x[12]);
561 bAlignment.AlignmentFlag = tempValue >> 16;
562 numCigarOperations = tempValue & 0xffff;
564 querySequenceLength = BgzfUnpackUnsignedInt(&x[16]);
565 bAlignment.MateRefID = BgzfUnpackUnsignedInt(&x[20]);
566 bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]);
567 bAlignment.InsertSize = BgzfUnpackUnsignedInt(&x[28]);
569 // calculate lengths/offsets
570 const unsigned int dataLength = blockLength - BAM_CORE_SIZE;
571 const unsigned int cigarDataOffset = queryNameLength;
572 const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4);
573 const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2;
574 const unsigned int tagDataOffset = qualDataOffset + querySequenceLength;
575 const unsigned int tagDataLen = dataLength - tagDataOffset;
577 // set up destination buffers for character data
578 char* allCharData = (char*)calloc(sizeof(char), dataLength);
579 uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
580 char* seqData = ((char*)allCharData) + seqDataOffset;
581 char* qualData = ((char*)allCharData) + qualDataOffset;
582 char* tagData = ((char*)allCharData) + tagDataOffset;
584 // get character data - make sure proper data size was read
585 if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; }
588 bytesRead += dataLength;
590 // clear out any previous string data
591 bAlignment.Name.clear();
592 bAlignment.QueryBases.clear();
593 bAlignment.Qualities.clear();
594 bAlignment.AlignedBases.clear();
595 bAlignment.CigarData.clear();
596 bAlignment.TagData.clear();
599 bAlignment.Name = (string)((const char*)(allCharData));
601 // save query sequence
602 for (unsigned int i = 0; i < querySequenceLength; ++i) {
603 char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
604 bAlignment.QueryBases.append( 1, singleBase );
607 // save sequence length
608 bAlignment.Length = bAlignment.QueryBases.length();
611 for (unsigned int i = 0; i < querySequenceLength; ++i) {
612 char singleQuality = (char)(qualData[i]+33); // conversion from QV to FASTQ character
613 bAlignment.Qualities.append( 1, singleQuality );
616 // save CIGAR-related data;
618 for (unsigned int i = 0; i < numCigarOperations; ++i) {
620 // build CigarOp struct
622 op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
623 op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
626 bAlignment.CigarData.push_back(op);
628 // build AlignedBases string
632 case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases
633 case ('S') : k += op.Length; // for 'S' - skip over query bases
636 case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character
639 case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character;
642 case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
646 case ('H') : break; // for 'H' - do nothing, move to next op
648 default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here
653 // read in the tag data
654 bAlignment.TagData.resize(tagDataLen);
655 memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
662 void BamReader::LoadReferenceData(void) {
664 // get number of reference sequences
667 const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer);
668 if (numberRefSeqs == 0) { return; }
669 m_references.reserve((int)numberRefSeqs);
671 // iterate over all references in header
672 for (unsigned int i = 0; i != numberRefSeqs; ++i) {
674 // get length of reference name
676 const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer);
677 char* refName = (char*)calloc(refNameLength, 1);
679 // get reference name and reference sequence length
680 BgzfRead(refName, refNameLength);
682 const unsigned int refLength = BgzfUnpackUnsignedInt(buffer);
684 // store data for reference
686 aReference.RefName = (string)((const char*)refName);
687 aReference.RefLength = refLength;
688 m_references.push_back(aReference);
690 // clean up calloc-ed temp variable
695 // opens BAM file (and index)
696 void BamReader::Open(const string& filename, const string& indexFilename) {
698 // open the BGZF file for reading, retrieve header text & reference data
703 // store file offset of first alignment
704 m_alignmentsBeginOffset = BgzfTell();
706 // open index file & load index data (if exists)
707 OpenIndex(indexFilename);
710 void BamReader::OpenIndex(const string& indexFilename) {
712 // if index file exists
713 if (!indexFilename.empty()) {
716 FILE* indexStream = fopen(indexFilename.c_str(), "rb");
720 printf("ERROR: Unable to open the BAM index file %s for reading.\n", indexFilename.c_str() );
724 // build up index data structure
725 LoadIndexData(indexStream);
729 bool BamReader::Rewind(void) {
731 // find first reference that has alignments in the BAM file
733 int refCount = m_references.size();
734 for ( ; refID < refCount; ++refID ) {
735 if ( m_references.at(refID).RefHasAlignments ) { break; }
738 // store default bounds for first alignment
739 m_currentRefID = refID;
741 m_isRegionSpecified = false;
743 // return success/failure of seek
744 return BgzfSeek(m_alignmentsBeginOffset);