From: Derek Date: Fri, 9 Jul 2010 16:06:27 +0000 (-0400) Subject: Separated indexing ops to new class. Implemented a fixed-partition-size index format... X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=9c0245facde1cd542ca59d66082b3525ad897af3;p=bamtools.git Separated indexing ops to new class. Implemented a fixed-partition-size index format (.bti) as an alternative to the BAM default (.bai) --- diff --git a/BamAux.h b/BamAux.h index c079e1d..8156596 100644 --- a/BamAux.h +++ b/BamAux.h @@ -228,46 +228,6 @@ struct BamRegion { { } }; -// ---------------------------------------------------------------- -// Indexing structs & typedefs - -struct Chunk { - - // data members - uint64_t Start; - uint64_t Stop; - - // constructor - Chunk(const uint64_t& start = 0, - const uint64_t& stop = 0) - : Start(start) - , Stop(stop) - { } -}; - -inline -bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) { - return lhs.Start < rhs.Start; -} - -typedef std::vector ChunkVector; -typedef std::map BamBinMap; -typedef std::vector LinearOffsetVector; - -struct ReferenceIndex { - // data members - BamBinMap Bins; - LinearOffsetVector Offsets; - // constructor - ReferenceIndex(const BamBinMap& binMap = BamBinMap(), - const LinearOffsetVector& offsets = LinearOffsetVector()) - : Bins(binMap) - , Offsets(offsets) - { } -}; - -typedef std::vector BamIndex; - // ---------------------------------------------------------------- // BamAlignment member methods diff --git a/BamIndex.cpp b/BamIndex.cpp new file mode 100644 index 0000000..282326f --- /dev/null +++ b/BamIndex.cpp @@ -0,0 +1,922 @@ +#include +#include +#include +#include +#include +#include "BamIndex.h" +#include "BamReader.h" +#include "BGZF.h" +using namespace std; +using namespace BamTools; + +// ------------------------------- +// BamIndex implementation + +BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian) + : m_BGZF(bgzf) + , m_reader(reader) + , m_isBigEndian(isBigEndian) +{ + if ( m_reader && m_reader->IsOpen() ) + m_references = m_reader->GetReferenceData(); +} + +bool BamIndex::HasAlignments(const int& referenceID) { + + // return false if invalid ID + if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) ) + return false; + + // else return status of reference (has alignments?) + else + return m_references.at(referenceID).RefHasAlignments; +} + +// ######################################################################################### +// ######################################################################################### + +// ------------------------------- +// BamDefaultIndex structs & typedefs + +namespace BamTools { + +struct Chunk { + + // data members + uint64_t Start; + uint64_t Stop; + + // constructor + Chunk(const uint64_t& start = 0, + const uint64_t& stop = 0) + : Start(start) + , Stop(stop) + { } +}; + +bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) { + return lhs.Start < rhs.Start; +} + +typedef vector ChunkVector; +typedef map BamBinMap; +typedef vector LinearOffsetVector; + +struct ReferenceIndex { + + // data members + BamBinMap Bins; + LinearOffsetVector Offsets; + + // constructor + ReferenceIndex(const BamBinMap& binMap = BamBinMap(), + const LinearOffsetVector& offsets = LinearOffsetVector()) + : Bins(binMap) + , Offsets(offsets) + { } +}; + +typedef vector BamDefaultIndexData; + +} // namespace BamTools + +// ------------------------------- +// BamDefaultIndex implementation + +struct BamDefaultIndex::BamDefaultIndexPrivate { + + // ------------------------- + // data members + + BamDefaultIndexData m_indexData; + BamDefaultIndex* m_parent; + + // ------------------------- + // ctor & dtor + + BamDefaultIndexPrivate(BamDefaultIndex* parent) : m_parent(parent) { } + ~BamDefaultIndexPrivate(void) { } + + // ------------------------- + // internal methods + + // calculate bins that overlap region + int BinsFromRegion(const BamTools::BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]); + // saves BAM bin entry for index + void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset); + // saves linear offset entry for index + void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset); + // simplifies index by merging 'chunks' + void MergeChunks(void); + +}; + +BamDefaultIndex::BamDefaultIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) + : BamIndex(bgzf, reader, isBigEndian) +{ + d = new BamDefaultIndexPrivate(this); +} + +BamDefaultIndex::~BamDefaultIndex(void) { + d->m_indexData.clear(); + delete d; + d = 0; +} + +// calculate bins that overlap region +int BamDefaultIndex::BamDefaultIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) { + + // get region boundaries + uint32_t begin = (unsigned int)region.LeftPosition; + uint32_t end; + + // if right bound specified AND left&right bounds are on same reference + // OK to use right bound position + if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) ) + end = (unsigned int)region.RightPosition; + + // otherwise, use end of left bound reference as cutoff + else + end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1; + + // initialize list, bin '0' always a valid bin + int i = 0; + bins[i++] = 0; + + // get rest of bins that contain this region + unsigned int k; + for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; } + for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; } + for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; } + for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; } + for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; } + + // return number of bins stored + return i; +} + +bool BamDefaultIndex::Build(void) { + + // be sure reader & BGZF file are valid & open for reading + if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + return false; + + // move file pointer to beginning of alignments + m_reader->Rewind(); + + // get reference count, reserve index space + int numReferences = (int)m_references.size(); + for ( int i = 0; i < numReferences; ++i ) { + d->m_indexData.push_back(ReferenceIndex()); + } + + // sets default constant for bin, ID, offset, coordinate variables + const uint32_t defaultValue = 0xffffffffu; + + // bin data + uint32_t saveBin(defaultValue); + uint32_t lastBin(defaultValue); + + // reference ID data + int32_t saveRefID(defaultValue); + int32_t lastRefID(defaultValue); + + // offset data + uint64_t saveOffset = m_BGZF->Tell(); + uint64_t lastOffset = saveOffset; + + // coordinate data + int32_t lastCoordinate = defaultValue; + + BamAlignment bAlignment; + while ( m_reader->GetNextAlignmentCore(bAlignment) ) { + + // change of chromosome, save ID, reset bin + if ( lastRefID != bAlignment.RefID ) { + lastRefID = bAlignment.RefID; + lastBin = defaultValue; + } + + // if lastCoordinate greater than BAM position - file not sorted properly + else if ( lastCoordinate > bAlignment.Position ) { + printf("BAM file not properly sorted:\n"); + printf("Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID); + exit(1); + } + + // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) + if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { + + // save linear offset entry (matched to BAM entry refID) + ReferenceIndex& refIndex = d->m_indexData.at(bAlignment.RefID); + LinearOffsetVector& offsets = refIndex.Offsets; + d->InsertLinearOffset(offsets, bAlignment, lastOffset); + } + + // if current BamAlignment bin != lastBin, "then possibly write the binning index" + if ( bAlignment.Bin != lastBin ) { + + // if not first time through + if ( saveBin != defaultValue ) { + + // save Bam bin entry + ReferenceIndex& refIndex = d->m_indexData.at(saveRefID); + BamBinMap& binMap = refIndex.Bins; + d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); + } + + // update saveOffset + saveOffset = lastOffset; + + // update bin values + saveBin = bAlignment.Bin; + lastBin = bAlignment.Bin; + + // update saveRefID + saveRefID = bAlignment.RefID; + + // if invalid RefID, break out (why?) + if ( saveRefID < 0 ) { break; } + } + + // make sure that current file pointer is beyond lastOffset + if ( m_BGZF->Tell() <= (int64_t)lastOffset ) { + printf("Error in BGZF offsets.\n"); + exit(1); + } + + // update lastOffset + lastOffset = m_BGZF->Tell(); + + // update lastCoordinate + lastCoordinate = bAlignment.Position; + } + + // save any leftover BAM data (as long as refID is valid) + if ( saveRefID >= 0 ) { + // save Bam bin entry + ReferenceIndex& refIndex = d->m_indexData.at(saveRefID); + BamBinMap& binMap = refIndex.Bins; + d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); + } + + // simplify index by merging chunks + d->MergeChunks(); + + // iterate through references in index + // store whether reference has data & + // sort offsets in linear offset vector + BamDefaultIndexData::iterator indexIter = d->m_indexData.begin(); + BamDefaultIndexData::iterator indexEnd = d->m_indexData.end(); + for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { + + // get reference index data + ReferenceIndex& refIndex = (*indexIter); + BamBinMap& binMap = refIndex.Bins; + LinearOffsetVector& offsets = refIndex.Offsets; + + // store whether reference has alignments or no + m_references[i].RefHasAlignments = ( binMap.size() > 0 ); + + // sort linear offsets + sort(offsets.begin(), offsets.end()); + } + + // rewind file pointer to beginning of alignments, return success/fail + return m_reader->Rewind(); +} + +bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets) { + + // calculate which bins overlap this region + uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); + int numBins = d->BinsFromRegion(region, isRightBoundSpecified, bins); + + // get bins for this reference + const ReferenceIndex& refIndex = d->m_indexData.at(region.LeftRefID); + const BamBinMap& binMap = refIndex.Bins; + + // get minimum offset to consider + const LinearOffsetVector& linearOffsets = refIndex.Offsets; + uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); + + // store all alignment 'chunk' starts (file offsets) for bins in this region + for ( int i = 0; i < numBins; ++i ) { + + const uint16_t binKey = bins[i]; + map::const_iterator binIter = binMap.find(binKey); + if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { + + const ChunkVector& chunks = (*binIter).second; + std::vector::const_iterator chunksIter = chunks.begin(); + std::vector::const_iterator chunksEnd = chunks.end(); + for ( ; chunksIter != chunksEnd; ++chunksIter) { + + // if valid chunk found, store its file offset + const Chunk& chunk = (*chunksIter); + if ( chunk.Stop > minOffset ) + offsets.push_back( chunk.Start ); + } + } + } + + // clean up memory + free(bins); + + // sort the offsets before returning + sort(offsets.begin(), offsets.end()); + + // return whether any offsets were found + return ( offsets.size() != 0 ); +} + +// saves BAM bin entry for index +void BamDefaultIndex::BamDefaultIndexPrivate::InsertBinEntry(BamBinMap& binMap, + const uint32_t& saveBin, + const uint64_t& saveOffset, + const uint64_t& lastOffset) +{ + // look up saveBin + BamBinMap::iterator binIter = binMap.find(saveBin); + + // create new chunk + Chunk newChunk(saveOffset, lastOffset); + + // if entry doesn't exist + if ( binIter == binMap.end() ) { + ChunkVector newChunks; + newChunks.push_back(newChunk); + binMap.insert( pair(saveBin, newChunks)); + } + + // otherwise + else { + ChunkVector& binChunks = (*binIter).second; + binChunks.push_back( newChunk ); + } +} + +// saves linear offset entry for index +void BamDefaultIndex::BamDefaultIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets, + const BamAlignment& bAlignment, + const uint64_t& lastOffset) +{ + // get converted offsets + int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; + int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; + + // resize vector if necessary + int oldSize = offsets.size(); + int newSize = endOffset + 1; + if ( oldSize < newSize ) + offsets.resize(newSize, 0); + + // store offset + for( int i = beginOffset + 1; i <= endOffset; ++i ) { + if ( offsets[i] == 0 ) + offsets[i] = lastOffset; + } +} + +bool BamDefaultIndex::Load(const string& filename) { + + // open index file, abort on error + FILE* indexStream = fopen(filename.c_str(), "rb"); + if( !indexStream ) { + printf("ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); + return false; + } + + // set placeholder to receive input byte count (suppresses compiler warnings) + size_t elementsRead = 0; + + // see if index is valid BAM index + char magic[4]; + elementsRead = fread(magic, 1, 4, indexStream); + if ( strncmp(magic, "BAI\1", 4) ) { + printf("Problem with index file - invalid format.\n"); + fclose(indexStream); + return false; + } + + // get number of reference sequences + uint32_t numRefSeqs; + elementsRead = fread(&numRefSeqs, 4, 1, indexStream); + if ( m_isBigEndian ) { SwapEndian_32(numRefSeqs); } + + // intialize space for BamDefaultIndexData data structure + d->m_indexData.reserve(numRefSeqs); + + // iterate over reference sequences + for ( unsigned int i = 0; i < numRefSeqs; ++i ) { + + // get number of bins for this reference sequence + int32_t numBins; + elementsRead = fread(&numBins, 4, 1, indexStream); + if ( m_isBigEndian ) { SwapEndian_32(numBins); } + + if ( numBins > 0 ) { + RefData& refEntry = m_references[i]; + refEntry.RefHasAlignments = true; + } + + // intialize BinVector + BamBinMap binMap; + + // iterate over bins for that reference sequence + for ( int j = 0; j < numBins; ++j ) { + + // get binID + uint32_t binID; + elementsRead = fread(&binID, 4, 1, indexStream); + + // get number of regionChunks in this bin + uint32_t numChunks; + elementsRead = fread(&numChunks, 4, 1, indexStream); + + if ( m_isBigEndian ) { + SwapEndian_32(binID); + SwapEndian_32(numChunks); + } + + // intialize ChunkVector + ChunkVector regionChunks; + regionChunks.reserve(numChunks); + + // iterate over regionChunks in this bin + for ( unsigned int k = 0; k < numChunks; ++k ) { + + // get chunk boundaries (left, right) + uint64_t left; + uint64_t right; + elementsRead = fread(&left, 8, 1, indexStream); + elementsRead = fread(&right, 8, 1, indexStream); + + if ( m_isBigEndian ) { + SwapEndian_64(left); + SwapEndian_64(right); + } + + // save ChunkPair + regionChunks.push_back( Chunk(left, right) ); + } + + // sort chunks for this bin + sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan ); + + // save binID, chunkVector for this bin + binMap.insert( pair(binID, regionChunks) ); + } + + // load linear index for this reference sequence + + // get number of linear offsets + int32_t numLinearOffsets; + elementsRead = fread(&numLinearOffsets, 4, 1, indexStream); + if ( m_isBigEndian ) { SwapEndian_32(numLinearOffsets); } + + // intialize LinearOffsetVector + LinearOffsetVector offsets; + offsets.reserve(numLinearOffsets); + + // iterate over linear offsets for this reference sequeence + uint64_t linearOffset; + for ( int j = 0; j < numLinearOffsets; ++j ) { + // read a linear offset & store + elementsRead = fread(&linearOffset, 8, 1, indexStream); + if ( m_isBigEndian ) { SwapEndian_64(linearOffset); } + offsets.push_back(linearOffset); + } + + // sort linear offsets + sort( offsets.begin(), offsets.end() ); + + // store index data for that reference sequence + d->m_indexData.push_back( ReferenceIndex(binMap, offsets) ); + } + + // close index file (.bai) and return + fclose(indexStream); + return true; +} + +// merges 'alignment chunks' in BAM bin (used for index building) +void BamDefaultIndex::BamDefaultIndexPrivate::MergeChunks(void) { + + // iterate over reference enties + BamDefaultIndexData::iterator indexIter = m_indexData.begin(); + BamDefaultIndexData::iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + + // get BAM bin map for this reference + ReferenceIndex& refIndex = (*indexIter); + BamBinMap& bamBinMap = refIndex.Bins; + + // iterate over BAM bins + BamBinMap::iterator binIter = bamBinMap.begin(); + BamBinMap::iterator binEnd = bamBinMap.end(); + for ( ; binIter != binEnd; ++binIter ) { + + // get chunk vector for this bin + ChunkVector& binChunks = (*binIter).second; + if ( binChunks.size() == 0 ) { continue; } + + ChunkVector mergedChunks; + mergedChunks.push_back( binChunks[0] ); + + // iterate over chunks + int i = 0; + ChunkVector::iterator chunkIter = binChunks.begin(); + ChunkVector::iterator chunkEnd = binChunks.end(); + for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { + + // get 'currentChunk' based on numeric index + Chunk& currentChunk = mergedChunks[i]; + + // get iteratorChunk based on vector iterator + Chunk& iteratorChunk = (*chunkIter); + + // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted) + if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) { + + // set currentChunk.Stop to iteratorChunk.Stop + currentChunk.Stop = iteratorChunk.Stop; + } + + // otherwise + else { + // set currentChunk + 1 to iteratorChunk + mergedChunks.push_back(iteratorChunk); + ++i; + } + } + + // saved merged chunk vector + (*binIter).second = mergedChunks; + } + } +} + +// writes in-memory index data out to file +// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) +bool BamDefaultIndex::Write(const std::string& bamFilename) { + + string indexFilename = bamFilename + ".bai"; + FILE* indexStream = fopen(indexFilename.c_str(), "wb"); + if ( indexStream == 0 ) { + printf("ERROR: Could not open file to save index.\n"); + return false; + } + + // write BAM index header + fwrite("BAI\1", 1, 4, indexStream); + + // write number of reference sequences + int32_t numReferenceSeqs = d->m_indexData.size(); + if ( m_isBigEndian ) { SwapEndian_32(numReferenceSeqs); } + fwrite(&numReferenceSeqs, 4, 1, indexStream); + + // iterate over reference sequences + BamDefaultIndexData::const_iterator indexIter = d->m_indexData.begin(); + BamDefaultIndexData::const_iterator indexEnd = d->m_indexData.end(); + for ( ; indexIter != indexEnd; ++ indexIter ) { + + // get reference index data + const ReferenceIndex& refIndex = (*indexIter); + const BamBinMap& binMap = refIndex.Bins; + const LinearOffsetVector& offsets = refIndex.Offsets; + + // write number of bins + int32_t binCount = binMap.size(); + if ( m_isBigEndian ) { SwapEndian_32(binCount); } + fwrite(&binCount, 4, 1, indexStream); + + // iterate over bins + BamBinMap::const_iterator binIter = binMap.begin(); + BamBinMap::const_iterator binEnd = binMap.end(); + for ( ; binIter != binEnd; ++binIter ) { + + // get bin data (key and chunk vector) + uint32_t binKey = (*binIter).first; + const ChunkVector& binChunks = (*binIter).second; + + // save BAM bin key + if ( m_isBigEndian ) { SwapEndian_32(binKey); } + fwrite(&binKey, 4, 1, indexStream); + + // save chunk count + int32_t chunkCount = binChunks.size(); + if ( m_isBigEndian ) { SwapEndian_32(chunkCount); } + fwrite(&chunkCount, 4, 1, indexStream); + + // iterate over chunks + ChunkVector::const_iterator chunkIter = binChunks.begin(); + ChunkVector::const_iterator chunkEnd = binChunks.end(); + for ( ; chunkIter != chunkEnd; ++chunkIter ) { + + // get current chunk data + const Chunk& chunk = (*chunkIter); + uint64_t start = chunk.Start; + uint64_t stop = chunk.Stop; + + if ( m_isBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + + // save chunk offsets + fwrite(&start, 8, 1, indexStream); + fwrite(&stop, 8, 1, indexStream); + } + } + + // write linear offsets size + int32_t offsetSize = offsets.size(); + if ( m_isBigEndian ) { SwapEndian_32(offsetSize); } + fwrite(&offsetSize, 4, 1, indexStream); + + // iterate over linear offsets + LinearOffsetVector::const_iterator offsetIter = offsets.begin(); + LinearOffsetVector::const_iterator offsetEnd = offsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) { + + // write linear offset value + uint64_t linearOffset = (*offsetIter); + if ( m_isBigEndian ) { SwapEndian_64(linearOffset); } + fwrite(&linearOffset, 8, 1, indexStream); + } + } + + // flush buffer, close file, and return success + fflush(indexStream); + fclose(indexStream); + return true; +} + +// ######################################################################################### +// ######################################################################################### + +// ------------------------------------- +// BamToolsIndex implementation + +namespace BamTools { + +struct BamToolsIndexEntry { + + // data members + int64_t Offset; + int RefID; + int Position; + + // ctor + BamToolsIndexEntry(const uint64_t& offset = 0, + const int& id = -1, + const int& position = -1) + : Offset(offset) + , RefID(id) + , Position(position) + { } +}; + +typedef vector BamToolsIndexData; + +} // namespace BamTools + +struct BamToolsIndex::BamToolsIndexPrivate { + + // ------------------------- + // data members + BamToolsIndexData m_indexData; + BamToolsIndex* m_parent; + int32_t m_blockSize; + + // ------------------------- + // ctor & dtor + + BamToolsIndexPrivate(BamToolsIndex* parent) + : m_parent(parent) + , m_blockSize(1000) + { } + + ~BamToolsIndexPrivate(void) { } + + // ------------------------- + // internal methods +}; + +BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) + : BamIndex(bgzf, reader, isBigEndian) +{ + d = new BamToolsIndexPrivate(this); +} + +BamToolsIndex::~BamToolsIndex(void) { + delete d; + d = 0; +} + +bool BamToolsIndex::Build(void) { + + // be sure reader & BGZF file are valid & open for reading + if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + return false; + + // move file pointer to beginning of alignments + m_reader->Rewind(); + + // plow through alignments, store block offsets + int32_t currentBlockCount = 0; + int64_t blockStartOffset = m_BGZF->Tell(); + int blockStartId = -1; + int blockStartPosition = -1; + BamAlignment al; + while ( m_reader->GetNextAlignmentCore(al) ) { + + // set reference flag + m_references[al.RefID].RefHasAlignments = true; + + // if beginning of block, save first alignment's refID & position + if ( currentBlockCount == 0 ) { + blockStartId = al.RefID; + blockStartPosition = al.Position; + } + + // increment block counter + ++currentBlockCount; + + // if block is full, get offset for next block, reset currentBlockCount + if ( currentBlockCount == d->m_blockSize ) { + +// cerr << "-------------------------------" << endl; +// cerr << "BlockCount = " << currentBlockCount << endl; +// cerr << endl; +// cerr << "Storing entry: " << endl; +// cerr << "\trefID : " << blockStartId << endl; +// cerr << "\tpos : " << blockStartPosition << endl; +// cerr << "\toffset : " << blockStartOffset << endl; +// + + d->m_indexData.push_back( BamToolsIndexEntry(blockStartOffset, blockStartId, blockStartPosition) ); + blockStartOffset = m_BGZF->Tell(); + currentBlockCount = 0; + } + } + + return m_reader->Rewind(); +} + +// N.B. - ignores isRightBoundSpecified +bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets) { + + // return false if no index data present + if ( d->m_indexData.empty() ) return false; + + // clear any prior data + offsets.clear(); + + // calculate nearest index to jump to + int64_t previousOffset = -1; + BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin(); + BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + + const BamToolsIndexEntry& entry = (*indexIter); + + // check if we are 'past' beginning of desired region + // if so, we will break out & use previously stored offset + if ( entry.RefID > region.LeftRefID ) break; + if ( (entry.RefID == region.LeftRefID) && (entry.Position > region.LeftPosition) ) break; + + // not past desired region, so store current entry offset in previousOffset + previousOffset = entry.Offset; + } + + // no index was found + if ( previousOffset == -1 ) + return false; + + // store offset & return success +/* cerr << "BTI::GetOffsets() : calculated offset = " << previousOffset << endl;*/ + offsets.push_back(previousOffset); + return true; +} + +bool BamToolsIndex::Load(const string& filename) { + + // open index file, abort on error + FILE* indexStream = fopen(filename.c_str(), "rb"); + if( !indexStream ) { + printf("ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); + return false; + } + + // set placeholder to receive input byte count (suppresses compiler warnings) + size_t elementsRead = 0; + + // see if index is valid BAM index + char magic[4]; + elementsRead = fread(magic, 1, 4, indexStream); + if ( strncmp(magic, "BTI\1", 4) ) { + printf("Problem with index file - invalid format.\n"); + fclose(indexStream); + return false; + } + + // read in block size + elementsRead = fread(&d->m_blockSize, sizeof(d->m_blockSize), 1, indexStream); + if ( m_isBigEndian ) { SwapEndian_32(d->m_blockSize); } + + // read in number of offsets + uint32_t numOffsets; + elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream); + if ( m_isBigEndian ) { SwapEndian_32(numOffsets); } + + // reserve space for index data + d->m_indexData.reserve(numOffsets); + + // iterate over index entries + for ( unsigned int i = 0; i < numOffsets; ++i ) { + + uint64_t offset; + int id; + int position; + + // read in data + elementsRead = fread(&offset, sizeof(offset), 1, indexStream); + elementsRead = fread(&id, sizeof(id), 1, indexStream); + elementsRead = fread(&position, sizeof(position), 1, indexStream); + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(offset); + SwapEndian_32(id); + SwapEndian_32(position); + } + + // save reference index entry + d->m_indexData.push_back( BamToolsIndexEntry(offset, id, position) ); + + // set reference flag + m_references[id].RefHasAlignments = true; // what about sparse references? wont be able to set flag? + } + + // close index file and return + fclose(indexStream); + return true; +} + +// writes in-memory index data out to file +// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) +bool BamToolsIndex::Write(const std::string& bamFilename) { + + string indexFilename = bamFilename + ".bti"; + FILE* indexStream = fopen(indexFilename.c_str(), "wb"); + if ( indexStream == 0 ) { + printf("ERROR: Could not open file to save index.\n"); + return false; + } + + // write BAM index header + fwrite("BTI\1", 1, 4, indexStream); + + // write block size + int32_t blockSize = d->m_blockSize; + if ( m_isBigEndian ) { SwapEndian_32(blockSize); } + fwrite(&blockSize, sizeof(blockSize), 1, indexStream); + + // write number of offset entries + uint32_t numOffsets = d->m_indexData.size(); + if ( m_isBigEndian ) { SwapEndian_32(numOffsets); } + fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream); + + // iterate over offset entries + BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin(); + BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end(); + for ( ; indexIter != indexEnd; ++ indexIter ) { + + // get reference index data + const BamToolsIndexEntry& entry = (*indexIter); + + // copy entry data + uint64_t offset = entry.Offset; + int id = entry.RefID; + int position = entry.Position; + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(offset); + SwapEndian_32(id); + SwapEndian_32(position); + } + + // write the reference index entry + fwrite(&offset, sizeof(offset), 1, indexStream); + fwrite(&id, sizeof(id), 1, indexStream); + fwrite(&position, sizeof(position), 1, indexStream); + } + + // flush file buffer, close file, and return success + fflush(indexStream); + fclose(indexStream); + return true; +} diff --git a/BamIndex.h b/BamIndex.h new file mode 100644 index 0000000..99dd095 --- /dev/null +++ b/BamIndex.h @@ -0,0 +1,105 @@ +#ifndef BAM_INDEX_H +#define BAM_INDEX_H + +#include +#include +#include "BamAux.h" + +namespace BamTools { + +class BamReader; +class BgzfData; + +// BamIndex base class +class BamIndex { + + public: + BamIndex(BamTools::BgzfData* bgzf, + BamTools::BamReader* reader, + bool isBigEndian); + virtual ~BamIndex(void) { } + + public: + // creates index data (in-memory) from current reader data + virtual bool Build(void) =0; + // calculates offset(s) for a given region + virtual bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets) =0; + // loads existing data from file into memory + virtual bool Load(const std::string& filename) =0; + // returns whether reference has alignments or no + virtual bool HasAlignments(const int& referenceID); + // writes in-memory index data out to file + // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) + virtual bool Write(const std::string& bamFilename) =0; + + protected: + BamTools::BgzfData* m_BGZF; + BamTools::BamReader* m_reader; + BamTools::RefVector m_references; + bool m_isBigEndian; +}; + +// BamDefaultIndex class +// +// implements default (per SAM/BAM spec) index file ops +class BamDefaultIndex : public BamIndex { + + + // ctor & dtor + public: + BamDefaultIndex(BamTools::BgzfData* bgzf, + BamTools::BamReader* reader, + bool isBigEndian); + ~BamDefaultIndex(void); + + // interface (implements BamIndex virtual methods) + public: + // creates index data (in-memory) from current reader data + bool Build(void); + // calculates offset(s) for a given region + bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets); + // loads existing data from file into memory + bool Load(const std::string& filename); + // writes in-memory index data out to file + // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) + bool Write(const std::string& bamFilename); + + // internal implementation + private: + struct BamDefaultIndexPrivate; + BamDefaultIndexPrivate* d; +}; + +// BamToolsIndex class +// +// implements BamTools-specific index file ops +class BamToolsIndex : public BamIndex { + + // ctor & dtor + public: + BamToolsIndex(BamTools::BgzfData* bgzf, + BamTools::BamReader* reader, + bool isBigEndian); + ~BamToolsIndex(void); + + // interface (implements BamIndex virtual methods) + public: + // creates index data (in-memory) from current reader data + bool Build(void); + // calculates offset(s) for a given region + bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets); + // loads existing data from file into memory + bool Load(const std::string& filename); + // writes in-memory index data out to file + // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) + bool Write(const std::string& bamFilename); + + // internal implementation + private: + struct BamToolsIndexPrivate; + BamToolsIndexPrivate* d; +}; + +} // namespace BamTools + +#endif // BAM_INDEX_H \ No newline at end of file diff --git a/BamMultiReader.cpp b/BamMultiReader.cpp index 6d80323..870040d 100644 --- a/BamMultiReader.cpp +++ b/BamMultiReader.cpp @@ -211,36 +211,52 @@ void BamMultiReader::UpdateAlignments(void) { } // opens BAM files -void BamMultiReader::Open(const vector filenames, bool openIndexes, bool coreMode) { +void BamMultiReader::Open(const vector filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) { + // for filename in filenames fileNames = filenames; // save filenames in our multireader for (vector::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { string filename = *it; BamReader* reader = new BamReader; - // TODO; change Open to return success/failure so we can report errors here + bool openedOK = true; if (openIndexes) { - reader->Open(filename, filename + ".bai"); + if (useDefaultIndex) + openedOK = reader->Open(filename, filename + ".bai"); + else + openedOK = reader->Open(filename, filename + ".bti"); } else { - reader->Open(filename); // for merging, jumping is disallowed + openedOK = reader->Open(filename); // for merging, jumping is disallowed } - - bool fileOK = true; - BamAlignment* alignment = new BamAlignment; - if (coreMode) { - fileOK &= reader->GetNextAlignmentCore(*alignment); - } else { - fileOK &= reader->GetNextAlignment(*alignment); - } - if (fileOK) { - readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { - cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl; + + // if file opened ok, check that it can be read + if ( openedOK ) { + + bool fileOK = true; + BamAlignment* alignment = new BamAlignment; + if (coreMode) { + fileOK &= reader->GetNextAlignmentCore(*alignment); + } else { + fileOK &= reader->GetNextAlignment(*alignment); + } + + if (fileOK) { + readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup + alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), + make_pair(reader, alignment))); + } else { + cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl; + } + + } + + // TODO; error handling on openedOK == false + else { + + } - } + ValidateReaders(); } @@ -269,11 +285,11 @@ bool BamMultiReader::Rewind(void) { } // saves index data to BAM index files (".bai") where necessary, returns success/fail -bool BamMultiReader::CreateIndexes(void) { +bool BamMultiReader::CreateIndexes(bool useDefaultIndex) { bool result = true; for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { BamReader* reader = it->first; - result &= reader->CreateIndex(); + result &= reader->CreateIndex(useDefaultIndex); } return result; } diff --git a/BamMultiReader.h b/BamMultiReader.h index e5960df..37ad765 100644 --- a/BamMultiReader.h +++ b/BamMultiReader.h @@ -59,7 +59,7 @@ class BamMultiReader { // indexes. // @coreMode - setup our first alignments using GetNextAlignmentCore(); // also useful for merging - void Open(const vector filenames, bool openIndexes = true, bool coreMode = false); + void Open(const vector filenames, bool openIndexes = true, bool coreMode = false, bool useDefaultIndex = true); // performs random-access jump to reference, position bool Jump(int refID, int position = 0); @@ -106,7 +106,7 @@ class BamMultiReader { // ---------------------- // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai") - bool CreateIndexes(void); + bool CreateIndexes(bool useDefaultIndex = true); //const int GetReferenceID(const string& refName) const; diff --git a/BamReader.cpp b/BamReader.cpp index 6ebc488..c618b41 100644 --- a/BamReader.cpp +++ b/BamReader.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 22 June 2010 (DB) +// Last modified: 30 June 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -21,6 +21,7 @@ // BamTools includes #include "BGZF.h" #include "BamReader.h" +#include "BamIndex.h" using namespace BamTools; using namespace std; @@ -30,9 +31,9 @@ struct BamReader::BamReaderPrivate { // structs, enums, typedefs // ------------------------------- enum RegionState { BEFORE_REGION = 0 - , WITHIN_REGION - , AFTER_REGION - }; + , WITHIN_REGION + , AFTER_REGION + }; // ------------------------------- // data members @@ -41,7 +42,8 @@ struct BamReader::BamReaderPrivate { // general file data BgzfData mBGZF; string HeaderText; - BamIndex Index; + //BamIndex Index; + BamIndex* NewIndex; RefVector References; bool IsIndexLoaded; int64_t AlignmentsBeginOffset; @@ -60,6 +62,9 @@ struct BamReader::BamReaderPrivate { int CurrentRefID; int CurrentLeft; + // parent BamReader + BamReader* Parent; + // BAM character constants const char* DNA_LOOKUP; const char* CIGAR_LOOKUP; @@ -67,7 +72,7 @@ struct BamReader::BamReaderPrivate { // ------------------------------- // constructor & destructor // ------------------------------- - BamReaderPrivate(void); + BamReaderPrivate(BamReader* parent); ~BamReaderPrivate(void); // ------------------------------- @@ -89,7 +94,7 @@ struct BamReader::BamReaderPrivate { int GetReferenceID(const string& refName) const; // index operations - bool CreateIndex(void); + bool CreateIndex(bool useDefaultIndex); // ------------------------------- // internal methods @@ -97,12 +102,8 @@ struct BamReader::BamReaderPrivate { // *** reading alignments and auxiliary data *** // - // calculate bins that overlap region - int BinsFromRegion(uint16_t bins[MAX_BIN]); // fills out character data for BamAlignment data bool BuildCharData(BamAlignment& bAlignment); - // calculate file offset for first alignment chunk overlapping specified region - int64_t GetOffset(std::vector& chunkStarts); // checks to see if alignment overlaps current region RegionState IsOverlap(BamAlignment& bAlignment); // retrieves header text from BAM file @@ -114,29 +115,18 @@ struct BamReader::BamReaderPrivate { // *** index file handling *** // - // calculates index for BAM file - bool BuildIndex(void); // clear out inernal index data structure void ClearIndex(void); - // saves BAM bin entry for index - void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset); - // saves linear offset entry for index - void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset); // loads index from BAM index file bool LoadIndex(void); - // simplifies index by merging 'chunks' - void MergeChunks(void); - // saves index to BAM index file - bool WriteIndex(void); }; // ----------------------------------------------------- // BamReader implementation (wrapper around BRPrivate) // ----------------------------------------------------- - // constructor BamReader::BamReader(void) { - d = new BamReaderPrivate; + d = new BamReaderPrivate(this); } // destructor @@ -147,6 +137,7 @@ BamReader::~BamReader(void) { // file operations void BamReader::Close(void) { d->Close(); } +bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; } bool BamReader::Jump(int refID, int position) { d->Region.LeftRefID = refID; d->Region.LeftPosition = position; @@ -168,26 +159,28 @@ bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNe // access auxiliary data const string BamReader::GetHeaderText(void) const { return d->HeaderText; } int BamReader::GetReferenceCount(void) const { return d->References.size(); } -const RefVector BamReader::GetReferenceData(void) const { return d->References; } +const RefVector& BamReader::GetReferenceData(void) const { return d->References; } int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); } const std::string BamReader::GetFilename(void) const { return d->Filename; } // index operations -bool BamReader::CreateIndex(void) { return d->CreateIndex(); } +bool BamReader::CreateIndex(bool useDefaultIndex) { return d->CreateIndex(useDefaultIndex); } // ----------------------------------------------------- // BamReaderPrivate implementation // ----------------------------------------------------- // constructor -BamReader::BamReaderPrivate::BamReaderPrivate(void) - : IsIndexLoaded(false) +BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent) + : NewIndex(0) + , IsIndexLoaded(false) , AlignmentsBeginOffset(0) , IsLeftBoundSpecified(false) , IsRightBoundSpecified(false) , IsRegionSpecified(false) , CurrentRefID(0) , CurrentLeft(0) + , Parent(parent) , DNA_LOOKUP("=ACMGRSVTWYHKDBN") , CIGAR_LOOKUP("MIDNSHP") { @@ -199,53 +192,44 @@ BamReader::BamReaderPrivate::~BamReaderPrivate(void) { Close(); } -// calculate bins that overlap region -int BamReader::BamReaderPrivate::BinsFromRegion(uint16_t list[MAX_BIN]) { - - // get region boundaries - uint32_t begin = (unsigned int)Region.LeftPosition; - uint32_t end; - - // if right bound specified AND left&right bounds are on same reference - // OK to use right bound position - if ( IsRightBoundSpecified && ( Region.LeftRefID == Region.RightRefID ) ) - end = (unsigned int)Region.RightPosition; // -1 ?? - - // otherwise, use end of left bound reference as cutoff - else - end = (unsigned int)References.at(Region.LeftRefID).RefLength - 1; - - // initialize list, bin '0' always a valid bin - int i = 0; - list[i++] = 0; - - // get rest of bins that contain this region - unsigned int k; - for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; } - for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; } - for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; } - for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; } - for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; } - - // return number of bins stored - return i; -} - bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { // calculate character lengths/offsets - const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; - const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4); - const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; - const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; - const unsigned int tagDataLength = dataLength - tagDataOffset; + const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; + const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength; + const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4); + const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; + const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; + const unsigned int tagDataLength = dataLength - tagDataOffset; // set up char buffers - const char* allCharData = bAlignment.SupportData.AllCharData.data(); - const char* seqData = ((const char*)allCharData) + seqDataOffset; - const char* qualData = ((const char*)allCharData) + qualDataOffset; - char* tagData = ((char*)allCharData) + tagDataOffset; + const char* allCharData = bAlignment.SupportData.AllCharData.data(); + uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); + const char* seqData = ((const char*)allCharData) + seqDataOffset; + const char* qualData = ((const char*)allCharData) + qualDataOffset; + char* tagData = ((char*)allCharData) + tagDataOffset; + // store alignment name (depends on null char as terminator) + bAlignment.Name.assign((const char*)(allCharData)); + + // save CigarOps + CigarOp op; + bAlignment.CigarData.clear(); + bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations); + for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) { + + // swap if necessary + if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); } + + // build CigarOp structure + op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); + op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; + + // save CigarOp + bAlignment.CigarData.push_back(op); + } + + // save query sequence bAlignment.QueryBases.clear(); bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength); @@ -365,163 +349,47 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { return true; } -// populates BAM index data structure from BAM file data -bool BamReader::BamReaderPrivate::BuildIndex(void) { - - // check to be sure file is open - if (!mBGZF.IsOpen) { return false; } - - // move file pointer to beginning of alignments - Rewind(); - - // get reference count, reserve index space - int numReferences = References.size(); - for ( int i = 0; i < numReferences; ++i ) { - Index.push_back(ReferenceIndex()); - } - - // sets default constant for bin, ID, offset, coordinate variables - const uint32_t defaultValue = 0xffffffffu; - - // bin data - uint32_t saveBin(defaultValue); - uint32_t lastBin(defaultValue); - - // reference ID data - int32_t saveRefID(defaultValue); - int32_t lastRefID(defaultValue); - - // offset data - uint64_t saveOffset = mBGZF.Tell(); - uint64_t lastOffset = saveOffset; - - // coordinate data - int32_t lastCoordinate = defaultValue; - - BamAlignment bAlignment; - while( GetNextAlignment(bAlignment) ) { - - // change of chromosome, save ID, reset bin - if ( lastRefID != bAlignment.RefID ) { - lastRefID = bAlignment.RefID; - lastBin = defaultValue; - } - - // if lastCoordinate greater than BAM position - file not sorted properly - else if ( lastCoordinate > bAlignment.Position ) { - printf("BAM file not properly sorted:\n"); - printf("Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID); - exit(1); - } - - // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) - if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { - - // save linear offset entry (matched to BAM entry refID) - ReferenceIndex& refIndex = Index.at(bAlignment.RefID); - LinearOffsetVector& offsets = refIndex.Offsets; - InsertLinearOffset(offsets, bAlignment, lastOffset); - } - - // if current BamAlignment bin != lastBin, "then possibly write the binning index" - if ( bAlignment.Bin != lastBin ) { - - // if not first time through - if ( saveBin != defaultValue ) { - - // save Bam bin entry - ReferenceIndex& refIndex = Index.at(saveRefID); - BamBinMap& binMap = refIndex.Bins; - InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); - } - - // update saveOffset - saveOffset = lastOffset; - - // update bin values - saveBin = bAlignment.Bin; - lastBin = bAlignment.Bin; - - // update saveRefID - saveRefID = bAlignment.RefID; - - // if invalid RefID, break out (why?) - if ( saveRefID < 0 ) { break; } - } - - // make sure that current file pointer is beyond lastOffset - if ( mBGZF.Tell() <= (int64_t)lastOffset ) { - printf("Error in BGZF offsets.\n"); - exit(1); - } - - // update lastOffset - lastOffset = mBGZF.Tell(); - - // update lastCoordinate - lastCoordinate = bAlignment.Position; - } - - // save any leftover BAM data (as long as refID is valid) - if ( saveRefID >= 0 ) { - // save Bam bin entry - ReferenceIndex& refIndex = Index.at(saveRefID); - BamBinMap& binMap = refIndex.Bins; - InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); - } - - // simplify index by merging chunks - MergeChunks(); - - // iterate over references - BamIndex::iterator indexIter = Index.begin(); - BamIndex::iterator indexEnd = Index.end(); - for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { - - // get reference index data - ReferenceIndex& refIndex = (*indexIter); - BamBinMap& binMap = refIndex.Bins; - LinearOffsetVector& offsets = refIndex.Offsets; - - // store whether reference has alignments or no - References[i].RefHasAlignments = ( binMap.size() > 0 ); - - // sort linear offsets - sort(offsets.begin(), offsets.end()); - } - - - // rewind file pointer to beginning of alignments, return success/fail - return Rewind(); -} - - // clear index data structure void BamReader::BamReaderPrivate::ClearIndex(void) { - Index.clear(); // sufficient ?? + delete NewIndex; + NewIndex = 0; } // closes the BAM file void BamReader::BamReaderPrivate::Close(void) { + + // close BGZF file stream mBGZF.Close(); + + // clear out index data ClearIndex(); + + // clear out header data HeaderText.clear(); - IsLeftBoundSpecified = false; + + // clear out region flags + IsLeftBoundSpecified = false; IsRightBoundSpecified = false; - IsRegionSpecified = false; + IsRegionSpecified = false; } // create BAM index from BAM file (keep structure in memory) and write to default index output file -bool BamReader::BamReaderPrivate::CreateIndex(void) { +bool BamReader::BamReaderPrivate::CreateIndex(bool useDefaultIndex) { - // clear out index + // clear out prior index data ClearIndex(); - - // build (& save) index from BAM file + + // create default index + if ( useDefaultIndex ) + NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian); + // create BamTools 'custom' index + else + NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian); + bool ok = true; - ok &= BuildIndex(); - ok &= WriteIndex(); - + ok &= NewIndex->Build(); + ok &= NewIndex->Write(Filename); + // return success/fail return ok; } @@ -575,51 +443,6 @@ bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) return false; } -// calculate file offset for first alignment chunk overlapping specified region -int64_t BamReader::BamReaderPrivate::GetOffset(std::vector& chunkStarts) { - - // calculate which bins overlap this region - uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); - int numBins = BinsFromRegion(bins); - - // get bins for this reference - const ReferenceIndex& refIndex = Index.at(Region.LeftRefID); - const BamBinMap& binMap = refIndex.Bins; - - // get minimum offset to consider - const LinearOffsetVector& offsets = refIndex.Offsets; - uint64_t minOffset = ( (unsigned int)(Region.LeftPosition>>BAM_LIDX_SHIFT) >= offsets.size() ) ? 0 : offsets.at(Region.LeftPosition>>BAM_LIDX_SHIFT); - - // store offsets to beginning of alignment 'chunks' - //std::vector chunkStarts; - - // store all alignment 'chunk' starts for bins in this region - for (int i = 0; i < numBins; ++i ) { - - const uint16_t binKey = bins[i]; - map::const_iterator binIter = binMap.find(binKey); - if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { - - const ChunkVector& chunks = (*binIter).second; - std::vector::const_iterator chunksIter = chunks.begin(); - std::vector::const_iterator chunksEnd = chunks.end(); - for ( ; chunksIter != chunksEnd; ++chunksIter) { - const Chunk& chunk = (*chunksIter); - if ( chunk.Stop > minOffset ) { - chunkStarts.push_back( chunk.Start ); - } - } - } - } - - // clean up memory - free(bins); - - // if no alignments found, else return smallest offset for alignment starts - if ( chunkStarts.size() == 0 ) { return -1; } - else { return *min_element(chunkStarts.begin(), chunkStarts.end()); } -} - // returns RefID for given RefName (returns References.size() if not found) int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const { @@ -635,54 +458,6 @@ int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const { return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); } -// saves BAM bin entry for index -void BamReader::BamReaderPrivate::InsertBinEntry(BamBinMap& binMap, - const uint32_t& saveBin, - const uint64_t& saveOffset, - const uint64_t& lastOffset) -{ - // look up saveBin - BamBinMap::iterator binIter = binMap.find(saveBin); - - // create new chunk - Chunk newChunk(saveOffset, lastOffset); - - // if entry doesn't exist - if ( binIter == binMap.end() ) { - ChunkVector newChunks; - newChunks.push_back(newChunk); - binMap.insert( pair(saveBin, newChunks)); - } - - // otherwise - else { - ChunkVector& binChunks = (*binIter).second; - binChunks.push_back( newChunk ); - } -} - -// saves linear offset entry for index -void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets, - const BamAlignment& bAlignment, - const uint64_t& lastOffset) -{ - // get converted offsets - int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; - int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; - - // resize vector if necessary - int oldSize = offsets.size(); - int newSize = endOffset + 1; - if ( oldSize < newSize ) { offsets.resize(newSize, 0); } - - // store offset - for(int i = beginOffset + 1; i <= endOffset ; ++i) { - if ( offsets[i] == 0) { - offsets[i] = lastOffset; - } - } -} - // returns region state - whether alignment ends before, overlaps, or starts after currently specified region // this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { @@ -728,44 +503,37 @@ BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap( // jumps to specified region(refID, leftBound) in BAM file, returns success/fail bool BamReader::BamReaderPrivate::Jump(int refID, int position) { - // if data exists for this reference and position is valid - if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { - - // calculate offset - std::vector chunkStarts; - int64_t offset = GetOffset(chunkStarts); - sort(chunkStarts.begin(), chunkStarts.end()); - - // if in valid offset, return failure - // otherwise return success of seek operation - if ( offset == -1 ) { - return false; - } else { - //return mBGZF.Seek(offset); - BamAlignment bAlignment; - bool result = true; - for (std::vector::const_iterator o = chunkStarts.begin(); o != chunkStarts.end(); ++o) { - // std::cerr << *o << endl; - // std::cerr << "Seeking to offset: " << *o << endl; - result &= mBGZF.Seek(*o); - LoadNextAlignment(bAlignment); - // std::cerr << "alignment: " << bAlignment.RefID - // << ":" << bAlignment.Position << ".." << bAlignment.Position + bAlignment.Length << endl; - if ((bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || bAlignment.RefID > refID) { - // std::cerr << "here i am" << endl; - // std::cerr << "seeking to offset: " << (*(o-1)) << endl; - // std::cerr << "*** Finished jumping ***" << endl; - return mBGZF.Seek(*o); - } - } - - //std::cerr << "*** Finished jumping ***" << endl; - return result; - } + // ----------------------------------------------------------------------- + // check for existing index + if ( NewIndex == 0 ) return false; + // see if reference has alignments + if ( !NewIndex->HasAlignments(refID) ) return false; + // make sure position is valid + if ( position > References.at(refID).RefLength ) return false; + + // determine possible offsets + vector offsets; + if ( !NewIndex->GetOffsets(Region, IsRightBoundSpecified, offsets) ) { + printf("ERROR: Could not jump: unable to calculate offset for specified region.\n"); + return false; } - - // invalid jump request parameters, return failure - return false; + + // iterate through offsets + BamAlignment bAlignment; + bool result = true; + for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { + + // attempt seek & load first available alignment + result &= mBGZF.Seek(*o); + LoadNextAlignment(bAlignment); + + // if this alignment corresponds to desired position + // return success of seeking back to 'current offset' + if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) ) + return mBGZF.Seek(*o); + } + + return result; } // load BAM header data @@ -800,129 +568,33 @@ void BamReader::BamReaderPrivate::LoadHeaderData(void) { // load existing index data from BAM index file (".bai"), return success/fail bool BamReader::BamReaderPrivate::LoadIndex(void) { - // clear out index data + // clear out any existing index data ClearIndex(); // skip if index file empty - if ( IndexFilename.empty() ) { return false; } - - // open index file, abort on error - FILE* indexStream = fopen(IndexFilename.c_str(), "rb"); - if(!indexStream) { - printf("ERROR: Unable to open the BAM index file %s for reading.\n", IndexFilename.c_str() ); + if ( IndexFilename.empty() ) return false; - } - size_t elementsRead = 0; - - // see if index is valid BAM index - char magic[4]; - elementsRead = fread(magic, 1, 4, indexStream); - if (strncmp(magic, "BAI\1", 4)) { - printf("Problem with index file - invalid format.\n"); - fclose(indexStream); + // check supplied filename for index type + size_t defaultExtensionFound = IndexFilename.find(".bai"); + size_t customExtensionFound = IndexFilename.find(".bti"); + + // if SAM/BAM default (".bai") + if ( defaultExtensionFound != string::npos ) + NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian); + + // if BamTools custom index (".bti") + else if ( customExtensionFound != string::npos ) + NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian); + + // else unknown + else { + printf("ERROR: Unknown index file extension.\n"); return false; } - - // get number of reference sequences - uint32_t numRefSeqs; - elementsRead = fread(&numRefSeqs, 4, 1, indexStream); - if ( IsBigEndian ) { SwapEndian_32(numRefSeqs); } - // intialize space for BamIndex data structure - Index.reserve(numRefSeqs); - - // iterate over reference sequences - for (unsigned int i = 0; i < numRefSeqs; ++i) { - - // get number of bins for this reference sequence - int32_t numBins; - elementsRead = fread(&numBins, 4, 1, indexStream); - if ( IsBigEndian ) { SwapEndian_32(numBins); } - - if (numBins > 0) { - RefData& refEntry = References[i]; - refEntry.RefHasAlignments = true; - } - - // intialize BinVector - BamBinMap binMap; - - // iterate over bins for that reference sequence - for (int j = 0; j < numBins; ++j) { - - // get binID - uint32_t binID; - elementsRead = fread(&binID, 4, 1, indexStream); - - // get number of regionChunks in this bin - uint32_t numChunks; - elementsRead = fread(&numChunks, 4, 1, indexStream); - - if ( IsBigEndian ) { - SwapEndian_32(binID); - SwapEndian_32(numChunks); - } - - // intialize ChunkVector - ChunkVector regionChunks; - regionChunks.reserve(numChunks); - - // iterate over regionChunks in this bin - for (unsigned int k = 0; k < numChunks; ++k) { - - // get chunk boundaries (left, right) - uint64_t left; - uint64_t right; - elementsRead = fread(&left, 8, 1, indexStream); - elementsRead = fread(&right, 8, 1, indexStream); - - if ( IsBigEndian ) { - SwapEndian_64(left); - SwapEndian_64(right); - } - - // save ChunkPair - regionChunks.push_back( Chunk(left, right) ); - } - - // sort chunks for this bin - sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan ); - - // save binID, chunkVector for this bin - binMap.insert( pair(binID, regionChunks) ); - } - - // load linear index for this reference sequence - - // get number of linear offsets - int32_t numLinearOffsets; - elementsRead = fread(&numLinearOffsets, 4, 1, indexStream); - if ( IsBigEndian ) { SwapEndian_32(numLinearOffsets); } - - // intialize LinearOffsetVector - LinearOffsetVector offsets; - offsets.reserve(numLinearOffsets); - - // iterate over linear offsets for this reference sequeence - uint64_t linearOffset; - for (int j = 0; j < numLinearOffsets; ++j) { - // read a linear offset & store - elementsRead = fread(&linearOffset, 8, 1, indexStream); - if ( IsBigEndian ) { SwapEndian_64(linearOffset); } - offsets.push_back(linearOffset); - } - - // sort linear offsets - sort( offsets.begin(), offsets.end() ); - - // store index data for that reference sequence - Index.push_back( ReferenceIndex(binMap, offsets) ); - } - - // close index file (.bai) and return - fclose(indexStream); - return true; + // return success of loading index data + return NewIndex->Load(IndexFilename); } // populates BamAlignment with alignment data under file pointer, returns success/fail @@ -963,44 +635,25 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); - // store 'all char data' and cigar ops - const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; - const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength; - - char* allCharData = (char*)calloc(sizeof(char), dataLength); - uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); + // set BamAlignment length + bAlignment.Length = bAlignment.SupportData.QuerySequenceLength; // read in character data - make sure proper data size was read - if ( mBGZF.Read(allCharData, dataLength) != (signed int)dataLength) { return false; } - else { - - // store alignment name and length - bAlignment.Name.assign((const char*)(allCharData)); - bAlignment.Length = bAlignment.SupportData.QuerySequenceLength; + bool readCharDataOK = false; + const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; + char* allCharData = (char*)calloc(sizeof(char), dataLength); + + if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { - // store remaining 'allCharData' in supportData structure + // store 'allCharData' in supportData structure bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); - // save CigarOps for BamAlignment - CigarOp op; - bAlignment.CigarData.clear(); - bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations); - for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) { - - // swap if necessary - if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); } - - // build CigarOp structure - op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); - op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; - - // save CigarOp - bAlignment.CigarData.push_back(op); - } + // set success flag + readCharDataOK = true; } free(allCharData); - return true; + return readCharDataOK; } // loads reference data from BAM file @@ -1040,63 +693,6 @@ void BamReader::BamReaderPrivate::LoadReferenceData(void) { } } -// merges 'alignment chunks' in BAM bin (used for index building) -void BamReader::BamReaderPrivate::MergeChunks(void) { - - // iterate over reference enties - BamIndex::iterator indexIter = Index.begin(); - BamIndex::iterator indexEnd = Index.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - - // get BAM bin map for this reference - ReferenceIndex& refIndex = (*indexIter); - BamBinMap& bamBinMap = refIndex.Bins; - - // iterate over BAM bins - BamBinMap::iterator binIter = bamBinMap.begin(); - BamBinMap::iterator binEnd = bamBinMap.end(); - for ( ; binIter != binEnd; ++binIter ) { - - // get chunk vector for this bin - ChunkVector& binChunks = (*binIter).second; - if ( binChunks.size() == 0 ) { continue; } - - ChunkVector mergedChunks; - mergedChunks.push_back( binChunks[0] ); - - // iterate over chunks - int i = 0; - ChunkVector::iterator chunkIter = binChunks.begin(); - ChunkVector::iterator chunkEnd = binChunks.end(); - for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { - - // get 'currentChunk' based on numeric index - Chunk& currentChunk = mergedChunks[i]; - - // get iteratorChunk based on vector iterator - Chunk& iteratorChunk = (*chunkIter); - - // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted) - if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) { - - // set currentChunk.Stop to iteratorChunk.Stop - currentChunk.Stop = iteratorChunk.Stop; - } - - // otherwise - else { - // set currentChunk + 1 to iteratorChunk - mergedChunks.push_back(iteratorChunk); - ++i; - } - } - - // saved merged chunk vector - (*binIter).second = mergedChunks; - } - } -} - // opens BAM file (and index) bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) { @@ -1115,9 +711,8 @@ bool BamReader::BamReaderPrivate::Open(const string& filename, const string& ind AlignmentsBeginOffset = mBGZF.Tell(); // open index file & load index data (if exists) - if ( !IndexFilename.empty() ) { + if ( !IndexFilename.empty() ) LoadIndex(); - } // return success return true; @@ -1125,12 +720,13 @@ bool BamReader::BamReaderPrivate::Open(const string& filename, const string& ind // returns BAM file pointer to beginning of alignment data bool BamReader::BamReaderPrivate::Rewind(void) { - + // find first reference that has alignments in the BAM file int refID = 0; int refCount = References.size(); for ( ; refID < refCount; ++refID ) { - if ( References.at(refID).RefHasAlignments ) { break; } + if ( References.at(refID).RefHasAlignments ) + break; } // reset default region info @@ -1162,98 +758,3 @@ bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) { // attempt jump to beginning of region, return success/fail of Jump() return Jump( Region.LeftRefID, Region.LeftPosition ); } - -// saves index data to BAM index file (".bai"), returns success/fail -bool BamReader::BamReaderPrivate::WriteIndex(void) { - - IndexFilename = Filename + ".bai"; - FILE* indexStream = fopen(IndexFilename.c_str(), "wb"); - if ( indexStream == 0 ) { - printf("ERROR: Could not open file to save index\n"); - return false; - } - - // write BAM index header - fwrite("BAI\1", 1, 4, indexStream); - - // write number of reference sequences - int32_t numReferenceSeqs = Index.size(); - if ( IsBigEndian ) { SwapEndian_32(numReferenceSeqs); } - fwrite(&numReferenceSeqs, 4, 1, indexStream); - - // iterate over reference sequences - BamIndex::const_iterator indexIter = Index.begin(); - BamIndex::const_iterator indexEnd = Index.end(); - for ( ; indexIter != indexEnd; ++ indexIter ) { - - // get reference index data - const ReferenceIndex& refIndex = (*indexIter); - const BamBinMap& binMap = refIndex.Bins; - const LinearOffsetVector& offsets = refIndex.Offsets; - - // write number of bins - int32_t binCount = binMap.size(); - if ( IsBigEndian ) { SwapEndian_32(binCount); } - fwrite(&binCount, 4, 1, indexStream); - - // iterate over bins - BamBinMap::const_iterator binIter = binMap.begin(); - BamBinMap::const_iterator binEnd = binMap.end(); - for ( ; binIter != binEnd; ++binIter ) { - - // get bin data (key and chunk vector) - uint32_t binKey = (*binIter).first; - const ChunkVector& binChunks = (*binIter).second; - - // save BAM bin key - if ( IsBigEndian ) { SwapEndian_32(binKey); } - fwrite(&binKey, 4, 1, indexStream); - - // save chunk count - int32_t chunkCount = binChunks.size(); - if ( IsBigEndian ) { SwapEndian_32(chunkCount); } - fwrite(&chunkCount, 4, 1, indexStream); - - // iterate over chunks - ChunkVector::const_iterator chunkIter = binChunks.begin(); - ChunkVector::const_iterator chunkEnd = binChunks.end(); - for ( ; chunkIter != chunkEnd; ++chunkIter ) { - - // get current chunk data - const Chunk& chunk = (*chunkIter); - uint64_t start = chunk.Start; - uint64_t stop = chunk.Stop; - - if ( IsBigEndian ) { - SwapEndian_64(start); - SwapEndian_64(stop); - } - - // save chunk offsets - fwrite(&start, 8, 1, indexStream); - fwrite(&stop, 8, 1, indexStream); - } - } - - // write linear offsets size - int32_t offsetSize = offsets.size(); - if ( IsBigEndian ) { SwapEndian_32(offsetSize); } - fwrite(&offsetSize, 4, 1, indexStream); - - // iterate over linear offsets - LinearOffsetVector::const_iterator offsetIter = offsets.begin(); - LinearOffsetVector::const_iterator offsetEnd = offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - - // write linear offset value - uint64_t linearOffset = (*offsetIter); - if ( IsBigEndian ) { SwapEndian_64(linearOffset); } - fwrite(&linearOffset, 8, 1, indexStream); - } - } - - // flush buffer, close file, and return success - fflush(indexStream); - fclose(indexStream); - return true; -} diff --git a/BamReader.h b/BamReader.h index 92de17e..33ee9b4 100644 --- a/BamReader.h +++ b/BamReader.h @@ -38,6 +38,8 @@ class BamReader { // close BAM file void Close(void); + // returns whether reader is open for reading or not + bool IsOpen(void) const; // performs random-access jump to reference, position bool Jump(int refID, int position = 0); // opens BAM file (and optional BAM index file, if provided) @@ -72,7 +74,7 @@ class BamReader { // returns number of reference sequences int GetReferenceCount(void) const; // returns vector of reference objects - const BamTools::RefVector GetReferenceData(void) const; + const BamTools::RefVector& GetReferenceData(void) const; // returns reference id (used for BamReader::Jump()) for the given reference name int GetReferenceID(const std::string& refName) const; // returns the name of the file associated with this BamReader @@ -83,8 +85,8 @@ class BamReader { // ---------------------- // creates index for BAM file, saves to file (default = bamFilename + ".bai") - bool CreateIndex(void); - + bool CreateIndex(bool useDefaultIndex = true); + // private implementation private: struct BamReaderPrivate; diff --git a/BamWriter.cpp b/BamWriter.cpp index 12a13e0..5ca9b7d 100644 --- a/BamWriter.cpp +++ b/BamWriter.cpp @@ -23,7 +23,6 @@ struct BamWriter::BamWriterPrivate { BgzfData mBGZF; bool IsBigEndian; - // constructor / destructor BamWriterPrivate(void) { IsBigEndian = SystemIsBigEndian();