-// BamReader.cpp
-
-// Derek Barnett
-// Marth Lab, Boston College
-// Last modified: 23 April 2009
+// ***************************************************************************
+// BamReader (c) 2009 Derek Barnett
+// Marth Lab, Deptartment of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
#include "BamReader.h"
+//#include "STLUtilities.h"
+
+#include <cstring>
+
#include <iostream>
using std::cerr;
using std::endl;
const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN";
const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
-BamReader::BamReader(const char* filename, const char* indexFilename)
- : m_filename( (char*)filename )
- , m_indexFilename( (char*)indexFilename )
- , m_file(0)
- , m_index(NULL)
- , m_headerText("")
- , m_isOpen(false)
+// constructor
+BamReader::BamReader(void)
+ : m_index(NULL)
, m_isIndexLoaded(false)
+ , m_alignmentsBeginOffset(0)
, m_isRegionSpecified(false)
- , m_isCalculateAlignedBases(true)
, m_currentRefID(0)
, m_currentLeft(0)
- , m_alignmentsBeginOffset(0)
-{
- Open();
-}
+{ }
+// destructor
BamReader::~BamReader(void) {
-
- // close file
- if ( m_isOpen ) { Close(); }
+ m_headerText.clear();
+ ClearIndex();
+ Close();
}
-// open BAM file
-bool BamReader::Open(void) {
-
- if (!m_isOpen && m_filename != NULL ) {
-
- // open file
- m_file = bam_open(m_filename, "r");
-
- // get header info && index info
- if ( (m_file != NULL) && LoadHeader() ) {
-
- // save file offset where alignments start
- m_alignmentsBeginOffset = bam_tell(m_file);
-
- // set open flag
- m_isOpen = true;
- }
-
- // try to open (and load) index data, if index file given
- if ( m_indexFilename != NULL ) {
- OpenIndex();
- }
- }
-
- return m_isOpen;
+// checks BGZF block header
+bool BamReader::BgzfCheckBlockHeader(char* header) {
+
+ return (header[0] == GZIP_ID1 &&
+ header[1] == (char)GZIP_ID2 &&
+ header[2] == Z_DEFLATED &&
+ (header[3] & FLG_FEXTRA) != 0 &&
+ BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
+ header[12] == BGZF_ID1 &&
+ header[13] == BGZF_ID2 &&
+ BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN);
}
-bool BamReader::OpenIndex(void) {
-
- if ( m_indexFilename && !m_isIndexLoaded ) {
- m_isIndexLoaded = LoadIndex();
- }
- return m_isIndexLoaded;
+// closes the BAM file
+void BamReader::BgzfClose(void) {
+ m_BGZF.IsOpen = false;
+ fclose(m_BGZF.Stream);
}
-// close BAM file
-bool BamReader::Close(void) {
+// de-compresses the current block
+int BamReader::BgzfInflateBlock(int blockLength) {
- if (m_isOpen) {
+ // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
+ z_stream zs;
+ zs.zalloc = NULL;
+ zs.zfree = NULL;
+ zs.next_in = (Bytef*)m_BGZF.CompressedBlock + 18;
+ zs.avail_in = blockLength - 16;
+ zs.next_out = (Bytef*)m_BGZF.UncompressedBlock;
+ zs.avail_out = m_BGZF.UncompressedBlockSize;
+
+ int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+ if (status != Z_OK) {
+ printf("inflateInit failed\n");
+ exit(1);
+ }
- // close file
- int ret = bam_close(m_file);
-
- // delete index info
- if ( m_index != NULL) { delete m_index; }
+ status = inflate(&zs, Z_FINISH);
+ if (status != Z_STREAM_END) {
+ inflateEnd(&zs);
+ printf("inflate failed\n");
+ exit(1);
+ }
- // clear open flag
- m_isOpen = false;
+ status = inflateEnd(&zs);
+ if (status != Z_OK) {
+ printf("inflateEnd failed\n");
+ exit(1);
+ }
- // clear index flag
- m_isIndexLoaded = false;
+ return zs.total_out;
+}
- // clear region flag
- m_isRegionSpecified = false;
+// opens the BAM file for reading
+void BamReader::BgzfOpen(const string& filename) {
- // return success/fail of bam_close
- return (ret == 0);
- }
+ m_BGZF.Stream = fopen(filename.c_str(), "rb");
+ if(!m_BGZF.Stream) {
+ cerr << "ERROR: Unable to open the BAM file " << filename << "for reading." << endl;
+ exit(1);
+ }
- return true;
+ m_BGZF.IsOpen = true;
}
-// get BAM filename
-const char* BamReader::Filename(void) const {
- return (const char*)m_filename;
-}
+// reads BGZF data into buffer
+unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) {
-// set BAM filename
-void BamReader::SetFilename(const char* filename) {
- m_filename = (char*)filename;
-}
+ if (dataLength == 0) { return 0; }
-// get BAM Index filename
-const char* BamReader::IndexFilename(void) const {
- return (const char*)m_indexFilename;
-}
+ char* output = data;
+ unsigned int numBytesRead = 0;
+ while (numBytesRead < dataLength) {
-// set BAM Index filename
-void BamReader::SetIndexFilename(const char* indexFilename) {
- m_indexFilename = (char*)indexFilename;
-}
+ int bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
+ if (bytesAvailable <= 0) {
+ if ( BgzfReadBlock() != 0 ) { return -1; }
+ bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
+ if ( bytesAvailable <= 0 ) { break; }
+ }
-// return full header text
-const string BamReader::GetHeaderText(void) const {
- return m_headerText;
-}
+ char* buffer = m_BGZF.UncompressedBlock;
+ int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
+ memcpy(output, buffer + m_BGZF.BlockOffset, copyLength);
-// return number of reference sequences in BAM file
-const int BamReader::GetReferenceCount(void) const {
- return m_references.size();
-}
+ m_BGZF.BlockOffset += copyLength;
+ output += copyLength;
+ numBytesRead += copyLength;
+ }
-// get RefID from reference name
-const int BamReader::GetRefID(string refName) const {
-
- vector<string> refNames;
- RefVector::const_iterator refIter = m_references.begin();
- RefVector::const_iterator refEnd = m_references.end();
- for ( ; refIter != refEnd; ++refIter) {
- refNames.push_back( (*refIter).RefName );
+ if ( m_BGZF.BlockOffset == m_BGZF.BlockLength ) {
+ m_BGZF.BlockAddress = ftello64(m_BGZF.Stream);
+ m_BGZF.BlockOffset = 0;
+ m_BGZF.BlockLength = 0;
}
- // return 'index-of' refName (if not found, returns refNames.size())
- return Index( refNames.begin(), refNames.end(), refName );
+ return numBytesRead;
}
-const RefVector BamReader::GetReferenceData(void) const {
- return m_references;
-}
+int BamReader::BgzfReadBlock(void) {
-bool BamReader::Jump(int refID, unsigned int left) {
+ char header[BLOCK_HEADER_LENGTH];
+ int64_t blockAddress = ftello(m_BGZF.Stream);
- // if index available, and region is valid
- if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) {
- m_currentRefID = refID;
- m_currentLeft = left;
- m_isRegionSpecified = true;
-
- int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
- if ( offset == -1 ) { return false; }
- else { return ( bam_seek(m_file, offset, SEEK_SET) == 0 ); }
- }
- return false;
-}
+ int count = fread(header, 1, sizeof(header), m_BGZF.Stream);
+ if (count == 0) {
+ m_BGZF.BlockLength = 0;
+ return 0;
+ }
-bool BamReader::Rewind(void) {
+ if (count != sizeof(header)) {
+ printf("read block failed - count != sizeof(header)\n");
+ return -1;
+ }
- int refID = 0;
- int refCount = m_references.size();
- for ( ; refID < refCount; ++refID ) {
- if ( m_references.at(refID).RefHasAlignments ) { break; }
- }
+ if (!BgzfCheckBlockHeader(header)) {
+ printf("read block failed - CheckBgzfBlockHeader() returned false\n");
+ return -1;
+ }
- m_currentRefID = refID;
- m_currentLeft = 0;
- m_isRegionSpecified = false;
+ int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1;
+ char* compressedBlock = m_BGZF.CompressedBlock;
+ memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
+ int remaining = blockLength - BLOCK_HEADER_LENGTH;
- return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );
-}
+ count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF.Stream);
+ if (count != remaining) {
+ printf("read block failed - count != remaining\n");
+ return -1;
+ }
-// get next alignment from specified region
-bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
+ count = BgzfInflateBlock(blockLength);
+ if (count < 0) { return -1; }
- // try to load 'next' read
- if ( LoadNextAlignment(bAlignment) ) {
+ if (m_BGZF.BlockLength != 0) {
+ m_BGZF.BlockOffset = 0;
+ }
- // if specified region, check for region overlap
- if ( m_isRegionSpecified ) {
+ m_BGZF.BlockAddress = blockAddress;
+ m_BGZF.BlockLength = count;
+ return 0;
+}
- // if overlap, return true
- if ( IsOverlap(bAlignment) ) { return true; }
- // if not, try the next alignment
- else { return GetNextAlignment(bAlignment); }
- }
+// move file pointer to specified offset
+bool BamReader::BgzfSeek(int64_t position) {
- // not using region, valid read detected, return success
- else { return true; }
- }
+ int blockOffset = (position & 0xFFFF);
+ int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
+ if (fseeko(m_BGZF.Stream, blockAddress, SEEK_SET) != 0) {
+ printf("ERROR: Unable to seek in BAM file\n");
+ exit(1);
+ }
- // no valid alignment to load
- return false;
+ m_BGZF.BlockLength = 0;
+ m_BGZF.BlockAddress = blockAddress;
+ m_BGZF.BlockOffset = blockOffset;
+ return true;
}
-void BamReader::SetCalculateAlignedBases(bool flag) {
- m_isCalculateAlignedBases = flag;
+// get file position in BAM file
+int64_t BamReader::BgzfTell(void) {
+ return ( (m_BGZF.BlockAddress << 16) | (m_BGZF.BlockOffset & 0xFFFF) );
}
int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
return i;
}
-uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
+unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
// initialize alignment end to starting position
- uint32_t alignEnd = position;
+ unsigned int alignEnd = position;
// iterate over cigar operations
vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
for ( ; cigarIter != cigarEnd; ++cigarIter) {
- if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {
+ char cigarType = (*cigarIter).Type;
+ if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
alignEnd += (*cigarIter).Length;
}
}
return alignEnd;
}
-int64_t BamReader::GetOffset(int refID, unsigned int left) {
+void BamReader::ClearIndex(void) {
+
+ if ( m_index ) {
+ // iterate over references
+ vector<RefIndex*>::iterator refIter = m_index->begin();
+ vector<RefIndex*>::iterator refEnd = m_index->end();
+ for ( ; refIter != refEnd; ++refIter) {
+ RefIndex* aRef = (*refIter);
+ if ( aRef ) {
+ // clear out BAM bins
+ if ( aRef->first ) {
+ BinVector::iterator binIter = (aRef->first)->begin();
+ BinVector::iterator binEnd = (aRef->first)->end();
+ for ( ; binIter != binEnd; ++binIter ) {
+ ChunkVector* chunks = (*binIter).second;
+ if ( chunks ) { delete chunks; }
+ }
+ delete aRef->first;
+ }
+ // clear BAM linear offsets
+ if ( aRef->second ) { delete aRef->second; }
+ delete aRef;
+ }
+ }
+ delete m_index;
+ }
+}
- // make space for bins
- uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
-
- // returns number of bins overlapping (left, right)
- // stores indices of those bins in 'bins'
- int numBins = BinsFromRegion(refID, left, bins);
+// closes the BAM file
+void BamReader::Close(void) {
+ if(m_BGZF.IsOpen) { BgzfClose(); }
+ if(m_index) { delete m_index; m_index = NULL; }
+ m_isRegionSpecified = false;
+}
- // access index data for refID
- RefIndex* refIndex = m_index->at(refID);
+const string BamReader::GetHeaderText(void) const {
+ return m_headerText;
+}
- // get list of BAM bins for this reference sequence
- BinVector* refBins = refIndex->first;
+const int BamReader::GetReferenceCount(void) const {
+ return m_references.size();
+}
- sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
+const RefVector BamReader::GetReferenceData(void) const {
+ return m_references;
+}
- // declare ChunkVector
- ChunkVector regionChunks;
+const int BamReader::GetReferenceID(const string& refName) const {
- // declaure LinearOffsetVector
- LinearOffsetVector* linearOffsets = refIndex->second;
+ // retrieve names from reference data
+ vector<string> refNames;
+ RefVector::const_iterator refIter = m_references.begin();
+ RefVector::const_iterator refEnd = m_references.end();
+ for ( ; refIter != refEnd; ++refIter) {
+ refNames.push_back( (*refIter).RefName );
+ }
- // some sort of linear offset vs bin index voodoo... not completely sure what's going here
- uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
+ // return 'index-of' refName ( if not found, returns refNames.size() )
+ return Index( refNames.begin(), refNames.end(), refName );
+}
- BinVector::iterator binBegin = refBins->begin();
- BinVector::iterator binEnd = refBins->end();
+// get next alignment (from specified region, if given)
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
- // iterate over bins overlapping region, count chunks
- for (int i = 0; i < numBins; ++i) {
+ // if valid alignment available
+ if ( LoadNextAlignment(bAlignment) ) {
- // look for bin with ID=bin[i]
- BinVector::iterator binIter = binBegin;
-
- for ( ; binIter != binEnd; ++binIter ) {
+ // if region not specified, return success
+ if ( !m_isRegionSpecified ) { return true; }
- // if found, increment n_off by number of chunks for each bin
- if ( (*binIter).first == (uint32_t)bins[i] ) {
-
- // iterate over chunks in that bin
- ChunkVector* binChunks = (*binIter).second;
-
- ChunkVector::iterator chunkIter = binChunks->begin();
- ChunkVector::iterator chunkEnd = binChunks->end();
- for ( ; chunkIter != chunkEnd; ++chunkIter) {
-
- // if right bound of pair is greater than min_off (linear offset value), store pair
- if ( (*chunkIter).second > minOffset) {
- regionChunks.push_back( (*chunkIter) );
- }
- }
- }
+ // load next alignment until region overlap is found
+ while ( !IsOverlap(bAlignment) ) {
+ // if no valid alignment available (likely EOF) return failure
+ if ( !LoadNextAlignment(bAlignment) ) { return false; }
}
- }
+
+ // return success (alignment found that overlaps region)
+ return true;
+ }
+
+ // no valid alignment
+ else { return false; }
+}
- // clean up temp array
- free(bins);
+int64_t BamReader::GetOffset(int refID, unsigned int left) {
- // there should be at least 1
- if(regionChunks.size() == 0) { return -1; }
+ // calculate which bins overlap this region
+ uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
+ int numBins = BinsFromRegion(refID, left, bins);
- // sort chunks by start position
- sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );
+ // get bins for this reference
+ RefIndex* refIndex = m_index->at(refID);
+ BinVector* refBins = refIndex->first;
- // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
- int numOffsets = regionChunks.size();
- for (int i = 1; i < numOffsets; ++i) {
- if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {
- regionChunks.at(i-1).second = regionChunks.at(i).first;
- }
- }
+ // get minimum offset to consider
+ LinearOffsetVector* linearOffsets = refIndex->second;
+ uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
+
+ // store offsets to beginning of alignment 'chunks'
+ std::vector<int64_t> chunkStarts;
- // merge adjacent chunks
- int l = 0;
- for (int i = 1; i < numOffsets; ++i) {
- // if adjacent, expand boundaries of (merged) chunk
- if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {
- regionChunks.at(l).second = regionChunks.at(i).second;
+ // reference bin iterators
+ BinVector::const_iterator binIter;
+ BinVector::const_iterator binBegin = refBins->begin();
+ BinVector::const_iterator binEnd = refBins->end();
+
+ // store all alignment 'chunk' starts for bins in this region
+ for (int i = 0; i < numBins; ++i ) {
+ binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare<uint32_t, ChunkVector*>() );
+ if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) {
+ ChunkVector* binChunks = (*binIter).second;
+ ChunkVector::const_iterator chunkIter = binChunks->begin();
+ ChunkVector::const_iterator chunkEnd = binChunks->end();
+ for ( ; chunkIter != chunkEnd; ++chunkIter) {
+ if ( (*chunkIter).second > minOffset ) {
+ chunkStarts.push_back( (*chunkIter).first );
+ }
+ }
}
- // else, move on to next (merged) chunk index
- else { regionChunks.at(++l) = regionChunks.at(i); }
}
-
- // return beginning file offset of first chunk for region
- return (int64_t)regionChunks.at(0).first;
+
+ // clean up memory
+ free(bins);
+
+ // if no alignments found
+ if ( chunkStarts.empty() ) { return -1; }
+
+ // else return smallest offset for alignment starts
+ else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
}
bool BamReader::IsOverlap(BamAlignment& bAlignment) {
// read starts after left boundary
if ( bAlignment.Position >= m_currentLeft) { return true; }
- // get alignment end
- uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);
-
// return whether alignment end overlaps left boundary
- return ( alignEnd >= m_currentLeft );
+ return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
}
-bool BamReader::LoadHeader(void) {
+bool BamReader::Jump(int refID, unsigned int left) {
+
+ // if index available, and region is valid
+ if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) {
+ m_currentRefID = refID;
+ m_currentLeft = left;
+ m_isRegionSpecified = true;
+
+ int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
+ if ( offset == -1 ) { return false; }
+ else { return BgzfSeek(offset); }
+ }
+ return false;
+}
+void BamReader::LoadHeaderData(void) {
+
// check to see if proper BAM header
- char buf[4];
- if (bam_read(m_file, buf, 4) != 4) { return false; }
- if (strncmp(buf, "BAM\001", 4)) {
- fprintf(stderr, "wrong header type!\n");
- return false;
+ char buffer[4];
+ if (BgzfRead(buffer, 4) != 4) {
+ printf("Could not read header type\n");
+ exit(1);
+ }
+ if (strncmp(buffer, "BAM\001", 4)) {
+ printf("wrong header type!\n");
+ exit(1);
}
// get BAM header text length
- int32_t headerTextLength;
- bam_read(m_file, &headerTextLength, 4);
+ BgzfRead(buffer, 4);
+ const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer);
// get BAM header text
char* headerText = (char*)calloc(headerTextLength + 1, 1);
- bam_read(m_file, headerText, headerTextLength);
+ BgzfRead(headerText, headerTextLength);
m_headerText = (string)((const char*)headerText);
// clean up calloc-ed temp variable
free(headerText);
-
- // get number of reference sequences
- int32_t numberRefSeqs;
- bam_read(m_file, &numberRefSeqs, 4);
- if (numberRefSeqs == 0) { return false; }
-
- m_references.reserve((int)numberRefSeqs);
-
- // reference variables
- int32_t refNameLength;
- char* refName;
- uint32_t refLength;
-
- // iterate over all references in header
- for (int i = 0; i != numberRefSeqs; ++i) {
-
- // get length of reference name
- bam_read(m_file, &refNameLength, 4);
- refName = (char*)calloc(refNameLength, 1);
-
- // get reference name and reference sequence length
- bam_read(m_file, refName, refNameLength);
- bam_read(m_file, &refLength, 4);
-
- // store data for reference
- RefData aReference;
- aReference.RefName = (string)((const char*)refName);
- aReference.RefLength = refLength;
- m_references.push_back(aReference);
-
- // clean up calloc-ed temp variable
- free(refName);
- }
-
- return true;
}
-bool BamReader::LoadIndex(void) {
-
- // check to see if index file exists
- FILE* indexFile;
- if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {
- fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");
- return false;
- }
+void BamReader::LoadIndexData(FILE* indexStream) {
// see if index is valid BAM index
char magic[4];
- fread(magic, 1, 4, indexFile);
+ fread(magic, 1, 4, indexStream);
if (strncmp(magic, "BAI\1", 4)) {
- fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");
- fclose(indexFile);
- return false;
+ printf("Problem with index file - invalid format.\n");
+ fclose(indexStream);
+ exit(1);
}
// get number of reference sequences
uint32_t numRefSeqs;
- fread(&numRefSeqs, 4, 1, indexFile);
+ fread(&numRefSeqs, 4, 1, indexStream);
// intialize BamIndex data structure
m_index = new BamIndex;
// get number of bins for this reference sequence
int32_t numBins;
- fread(&numBins, 4, 1, indexFile);
+ fread(&numBins, 4, 1, indexStream);
if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
// get binID
uint32_t binID;
- fread(&binID, 4, 1, indexFile);
+ fread(&binID, 4, 1, indexStream);
// get number of regionChunks in this bin
uint32_t numChunks;
- fread(&numChunks, 4, 1, indexFile);
+ fread(&numChunks, 4, 1, indexStream);
// intialize ChunkVector
ChunkVector* regionChunks = new ChunkVector;
// get chunk boundaries (left, right)
uint64_t left;
uint64_t right;
- fread(&left, 8, 1, indexFile);
- fread(&right, 8, 1, indexFile);
+ fread(&left, 8, 1, indexStream);
+ fread(&right, 8, 1, indexStream);
// save ChunkPair
regionChunks->push_back( ChunkPair(left, right) );
}
+ // sort chunks for this bin
+ sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare<uint64_t, uint64_t>() );
+
// save binID, chunkVector for this bin
bins->push_back( BamBin(binID, regionChunks) );
}
+ // sort bins by binID
+ sort(bins->begin(), bins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
+
// load linear index for this reference sequence
// get number of linear offsets
int32_t numLinearOffsets;
- fread(&numLinearOffsets, 4, 1, indexFile);
+ fread(&numLinearOffsets, 4, 1, indexStream);
// intialize LinearOffsetVector
LinearOffsetVector* linearOffsets = new LinearOffsetVector;
for (int j = 0; j < numLinearOffsets; ++j) {
// get a linear offset
uint64_t linearOffset;
- fread(&linearOffset, 8, 1, indexFile);
+ fread(&linearOffset, 8, 1, indexStream);
// store linear offset
linearOffsets->push_back(linearOffset);
}
+ // sort linear offsets
+ sort( linearOffsets->begin(), linearOffsets->end() );
+
// store index data for that reference sequence
m_index->push_back( new RefIndex(bins, linearOffsets) );
}
// close index file (.bai) and return
- fclose(indexFile);
- return true;
+ fclose(indexStream);
}
bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
- // check valid alignment block header data
- int32_t block_len;
- int32_t ret;
- uint32_t x[8];
-
- int32_t bytesRead = 0;
-
// read in the 'block length' value, make sure it's not zero
- if ( (ret = bam_read(m_file, &block_len, 4)) == 0 ) { return false; }
- bytesRead += 4;
+ char buffer[4];
+ BgzfRead(buffer, 4);
+ const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer);
+ if ( blockLength == 0 ) { return false; }
+
+ // keep track of bytes read as method progresses
+ int bytesRead = 4;
- // read in core alignment data, make the right size of data was read
- if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
+ // read in core alignment data, make sure the right size of data was read
+ char x[BAM_CORE_SIZE];
+ if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
bytesRead += BAM_CORE_SIZE;
- // set BamAlignment 'core' data
- bAlignment.RefID = x[0];
- bAlignment.Position = x[1];
- bAlignment.Bin = x[2]>>16;
- bAlignment.MapQuality = x[2]>>8&0xff;
- bAlignment.AlignmentFlag = x[3]>>16;
- bAlignment.MateRefID = x[5];
- bAlignment.MatePosition = x[6];
- bAlignment.InsertSize = x[7];
-
- // fetch & store often-used lengths for character data parsing
- unsigned int queryNameLength = x[2]&0xff;
- unsigned int numCigarOperations = x[3]&0xffff;
- unsigned int querySequenceLength = x[4];
+ // set BamAlignment 'core' data and character data lengths
+ unsigned int tempValue;
+ unsigned int queryNameLength;
+ unsigned int numCigarOperations;
+ unsigned int querySequenceLength;
+
+ bAlignment.RefID = BgzfUnpackUnsignedInt(&x[0]);
+ bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]);
+
+ tempValue = BgzfUnpackUnsignedInt(&x[8]);
+ bAlignment.Bin = tempValue >> 16;
+ bAlignment.MapQuality = tempValue >> 8 & 0xff;
+ queryNameLength = tempValue & 0xff;
+
+ tempValue = BgzfUnpackUnsignedInt(&x[12]);
+ bAlignment.AlignmentFlag = tempValue >> 16;
+ numCigarOperations = tempValue & 0xffff;
+
+ querySequenceLength = BgzfUnpackUnsignedInt(&x[16]);
+ bAlignment.MateRefID = BgzfUnpackUnsignedInt(&x[20]);
+ bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]);
+ bAlignment.InsertSize = BgzfUnpackUnsignedInt(&x[28]);
+
+ // calculate lengths/offsets
+ const unsigned int dataLength = blockLength - BAM_CORE_SIZE;
+ const unsigned int cigarDataOffset = queryNameLength;
+ const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4);
+ const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2;
+ const unsigned int tagDataOffset = qualDataOffset + querySequenceLength;
+ const unsigned int tagDataLen = dataLength - tagDataOffset;
- // get length of character data
- int dataLength = block_len - BAM_CORE_SIZE;
-
// set up destination buffers for character data
- uint8_t* allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);
- uint32_t* cigarData = (uint32_t*)(allCharData+queryNameLength);
-
- const unsigned int tagDataOffset = (numCigarOperations * 4) + queryNameLength + (querySequenceLength + 1) / 2 + querySequenceLength;
- const unsigned int tagDataLen = dataLength - tagDataOffset;
- char* tagData = ((char*)allCharData) + tagDataOffset;
+ char* allCharData = (char*)calloc(sizeof(char), dataLength);
+ uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
+ char* seqData = ((char*)allCharData) + seqDataOffset;
+ char* qualData = ((char*)allCharData) + qualDataOffset;
+ char* tagData = ((char*)allCharData) + tagDataOffset;
// get character data - make sure proper data size was read
- if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }
+ if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; }
else {
-
+
bytesRead += dataLength;
-
- // clear out bases, qualities, aligned bases, CIGAR, and tag data
+
+ // clear out any previous string data
+ bAlignment.Name.clear();
bAlignment.QueryBases.clear();
bAlignment.Qualities.clear();
bAlignment.AlignedBases.clear();
bAlignment.CigarData.clear();
bAlignment.TagData.clear();
-
+
// save name
bAlignment.Name = (string)((const char*)(allCharData));
- // save bases
- char singleBase[2];
- uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);
- for (unsigned int i = 0; i < querySequenceLength; ++i) {
- // numeric to char conversion
- sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );
- // append character data to Bases
- bAlignment.QueryBases.append( (const char*)singleBase );
+ // save query sequence
+ for (unsigned int i = 0; i < querySequenceLength; ++i) {
+ char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
+ bAlignment.QueryBases.append( 1, singleBase );
}
// save sequence length
bAlignment.Length = bAlignment.QueryBases.length();
// save qualities
- char singleQuality[4];
- uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);
for (unsigned int i = 0; i < querySequenceLength; ++i) {
- // numeric to char conversion
- sprintf( singleQuality, "%c", ( t[i]+33 ) );
- // append character data to Qualities
- bAlignment.Qualities.append( (const char*)singleQuality );
+ char singleQuality = (char)(qualData[i]+33); // conversion from QV to FASTQ character
+ bAlignment.Qualities.append( 1, singleQuality );
}
// save CIGAR-related data;
int k = 0;
for (unsigned int i = 0; i < numCigarOperations; ++i) {
-
+
// build CigarOp struct
CigarOp op;
op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
-
+
// save CigarOp
bAlignment.CigarData.push_back(op);
-
- // can skip this step if user wants to ignore
- if (m_isCalculateAlignedBases) {
-
- // build AlignedBases string
- switch (op.Type) {
-
- case ('M') :
- case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases
- case ('S') : k += op.Length; // for 'S' - skip over query bases
- break;
-
- case ('D') :
- case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding;
- break;
-
- case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
- k += op.Length;
- break;
-
- case ('H') : break; // for 'H' - do nothing, move to next op
-
- default : assert(false); // shouldn't get here
- break;
- }
+
+ // build AlignedBases string
+ switch (op.Type) {
+
+ case ('M') :
+ case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases
+ case ('S') : k += op.Length; // for 'S' - skip over query bases
+ break;
+
+ case ('D') :
+ case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding;
+ break;
+
+ case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
+ k += op.Length;
+ break;
+
+ case ('H') : break; // for 'H' - do nothing, move to next op
+
+ default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here
+ exit(1);
}
}
-
+
// read in the tag data
bAlignment.TagData.resize(tagDataLen);
memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
}
+
free(allCharData);
+ return true;
+}
-/*
- // (optional) read tag parsing data
- string tag;
- char data;
- int i = 0;
+void BamReader::LoadReferenceData(void) {
- // still data to read (auxiliary tags)
- while ( bytesRead < block_len ) {
+ // get number of reference sequences
+ char buffer[4];
+ BgzfRead(buffer, 4);
+ const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer);
+ if (numberRefSeqs == 0) { return; }
+ m_references.reserve((int)numberRefSeqs);
+
+ // iterate over all references in header
+ for (unsigned int i = 0; i != numberRefSeqs; ++i) {
- if ( bam_read(m_file, &data, 1) == 1 ) {
-
- ++bytesRead;
+ // get length of reference name
+ BgzfRead(buffer, 4);
+ const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer);
+ char* refName = (char*)calloc(refNameLength, 1);
+
+ // get reference name and reference sequence length
+ BgzfRead(refName, refNameLength);
+ BgzfRead(buffer, 4);
+ const unsigned int refLength = BgzfUnpackUnsignedInt(buffer);
+
+ // store data for reference
+ RefData aReference;
+ aReference.RefName = (string)((const char*)refName);
+ aReference.RefLength = refLength;
+ m_references.push_back(aReference);
+
+ // clean up calloc-ed temp variable
+ free(refName);
+ }
+}
- if (bytesRead == block_len && data != '\0') {
- fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");
- exit(1);
- }
+// opens BAM file (and index)
+void BamReader::Open(const string& filename, const string& indexFilename) {
- tag.append(1, data);
- if ( data == '\0' ) {
- bAlignment.Tags.push_back(tag);
- tag = "";
- i = 0;
- } else {
- if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }
- ++i;
- }
- } else {
- fprintf(stderr, "ERROR: Could not read tag info.\n");
+ // open the BGZF file for reading, retrieve header text & reference data
+ BgzfOpen(filename);
+ LoadHeaderData();
+ LoadReferenceData();
+
+ // store file offset of first alignment
+ m_alignmentsBeginOffset = BgzfTell();
+
+ // open index file & load index data (if exists)
+ OpenIndex(indexFilename);
+}
+
+void BamReader::OpenIndex(const string& indexFilename) {
+
+ // if index file exists
+ if (!indexFilename.empty()) {
+
+ // open index
+ FILE* indexStream = fopen(indexFilename.c_str(), "rb");
+
+ // abort on error
+ if(!indexStream) {
+ cerr << "ERROR: Unable to open the BAM index file " << indexFilename << "for reading." << endl;
exit(1);
}
+
+ // build up index data structure
+ LoadIndexData(indexStream);
}
-*/
- return true;
}
+
+bool BamReader::Rewind(void) {
+
+ // find first reference that has alignments in the BAM file
+ int refID = 0;
+ int refCount = m_references.size();
+ for ( ; refID < refCount; ++refID ) {
+ if ( m_references.at(refID).RefHasAlignments ) { break; }
+ }
+
+ // store default bounds for first alignment
+ m_currentRefID = refID;
+ m_currentLeft = 0;
+ m_isRegionSpecified = false;
+
+ // return success/failure of seek
+ return BgzfSeek(m_alignmentsBeginOffset);
+}