{ }\r
};\r
\r
-// ----------------------------------------------------------------\r
-// Indexing structs & typedefs\r
-\r
-struct Chunk {\r
-\r
- // data members\r
- uint64_t Start;\r
- uint64_t Stop;\r
-\r
- // constructor\r
- Chunk(const uint64_t& start = 0, \r
- const uint64_t& stop = 0)\r
- : Start(start)\r
- , Stop(stop)\r
- { }\r
-};\r
-\r
-inline\r
-bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {\r
- return lhs.Start < rhs.Start;\r
-}\r
-\r
-typedef std::vector<Chunk> ChunkVector;\r
-typedef std::map<uint32_t, ChunkVector> BamBinMap;\r
-typedef std::vector<uint64_t> LinearOffsetVector;\r
-\r
-struct ReferenceIndex {\r
- // data members\r
- BamBinMap Bins;\r
- LinearOffsetVector Offsets;\r
- // constructor\r
- ReferenceIndex(const BamBinMap& binMap = BamBinMap(),\r
- const LinearOffsetVector& offsets = LinearOffsetVector())\r
- : Bins(binMap)\r
- , Offsets(offsets)\r
- { }\r
-};\r
-\r
-typedef std::vector<ReferenceIndex> BamIndex;\r
-\r
// ----------------------------------------------------------------\r
// BamAlignment member methods\r
\r
--- /dev/null
+#include <cstdio>
+#include <cstdlib>
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include "BamIndex.h"
+#include "BamReader.h"
+#include "BGZF.h"
+using namespace std;
+using namespace BamTools;
+
+// -------------------------------
+// BamIndex implementation
+
+BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian)
+ : m_BGZF(bgzf)
+ , m_reader(reader)
+ , m_isBigEndian(isBigEndian)
+{
+ if ( m_reader && m_reader->IsOpen() )
+ m_references = m_reader->GetReferenceData();
+}
+
+bool BamIndex::HasAlignments(const int& referenceID) {
+
+ // return false if invalid ID
+ if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) )
+ return false;
+
+ // else return status of reference (has alignments?)
+ else
+ return m_references.at(referenceID).RefHasAlignments;
+}
+
+// #########################################################################################
+// #########################################################################################
+
+// -------------------------------
+// BamDefaultIndex structs & typedefs
+
+namespace BamTools {
+
+struct Chunk {
+
+ // data members
+ uint64_t Start;
+ uint64_t Stop;
+
+ // constructor
+ Chunk(const uint64_t& start = 0,
+ const uint64_t& stop = 0)
+ : Start(start)
+ , Stop(stop)
+ { }
+};
+
+bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
+ return lhs.Start < rhs.Start;
+}
+
+typedef vector<Chunk> ChunkVector;
+typedef map<uint32_t, ChunkVector> BamBinMap;
+typedef vector<uint64_t> LinearOffsetVector;
+
+struct ReferenceIndex {
+
+ // data members
+ BamBinMap Bins;
+ LinearOffsetVector Offsets;
+
+ // constructor
+ ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
+ const LinearOffsetVector& offsets = LinearOffsetVector())
+ : Bins(binMap)
+ , Offsets(offsets)
+ { }
+};
+
+typedef vector<ReferenceIndex> BamDefaultIndexData;
+
+} // namespace BamTools
+
+// -------------------------------
+// BamDefaultIndex implementation
+
+struct BamDefaultIndex::BamDefaultIndexPrivate {
+
+ // -------------------------
+ // data members
+
+ BamDefaultIndexData m_indexData;
+ BamDefaultIndex* m_parent;
+
+ // -------------------------
+ // ctor & dtor
+
+ BamDefaultIndexPrivate(BamDefaultIndex* parent) : m_parent(parent) { }
+ ~BamDefaultIndexPrivate(void) { }
+
+ // -------------------------
+ // internal methods
+
+ // calculate bins that overlap region
+ int BinsFromRegion(const BamTools::BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]);
+ // saves BAM bin entry for index
+ void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);
+ // saves linear offset entry for index
+ void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);
+ // simplifies index by merging 'chunks'
+ void MergeChunks(void);
+
+};
+
+BamDefaultIndex::BamDefaultIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
+ : BamIndex(bgzf, reader, isBigEndian)
+{
+ d = new BamDefaultIndexPrivate(this);
+}
+
+BamDefaultIndex::~BamDefaultIndex(void) {
+ d->m_indexData.clear();
+ delete d;
+ d = 0;
+}
+
+// calculate bins that overlap region
+int BamDefaultIndex::BamDefaultIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) {
+
+ // get region boundaries
+ uint32_t begin = (unsigned int)region.LeftPosition;
+ uint32_t end;
+
+ // if right bound specified AND left&right bounds are on same reference
+ // OK to use right bound position
+ if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
+ end = (unsigned int)region.RightPosition;
+
+ // otherwise, use end of left bound reference as cutoff
+ else
+ end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
+
+ // initialize list, bin '0' always a valid bin
+ int i = 0;
+ bins[i++] = 0;
+
+ // get rest of bins that contain this region
+ unsigned int k;
+ for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; }
+ for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; }
+ for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; }
+ for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; }
+ for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
+
+ // return number of bins stored
+ return i;
+}
+
+bool BamDefaultIndex::Build(void) {
+
+ // be sure reader & BGZF file are valid & open for reading
+ if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
+ return false;
+
+ // move file pointer to beginning of alignments
+ m_reader->Rewind();
+
+ // get reference count, reserve index space
+ int numReferences = (int)m_references.size();
+ for ( int i = 0; i < numReferences; ++i ) {
+ d->m_indexData.push_back(ReferenceIndex());
+ }
+
+ // sets default constant for bin, ID, offset, coordinate variables
+ const uint32_t defaultValue = 0xffffffffu;
+
+ // bin data
+ uint32_t saveBin(defaultValue);
+ uint32_t lastBin(defaultValue);
+
+ // reference ID data
+ int32_t saveRefID(defaultValue);
+ int32_t lastRefID(defaultValue);
+
+ // offset data
+ uint64_t saveOffset = m_BGZF->Tell();
+ uint64_t lastOffset = saveOffset;
+
+ // coordinate data
+ int32_t lastCoordinate = defaultValue;
+
+ BamAlignment bAlignment;
+ while ( m_reader->GetNextAlignmentCore(bAlignment) ) {
+
+ // change of chromosome, save ID, reset bin
+ if ( lastRefID != bAlignment.RefID ) {
+ lastRefID = bAlignment.RefID;
+ lastBin = defaultValue;
+ }
+
+ // if lastCoordinate greater than BAM position - file not sorted properly
+ else if ( lastCoordinate > bAlignment.Position ) {
+ printf("BAM file not properly sorted:\n");
+ printf("Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);
+ exit(1);
+ }
+
+ // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
+ if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
+
+ // save linear offset entry (matched to BAM entry refID)
+ ReferenceIndex& refIndex = d->m_indexData.at(bAlignment.RefID);
+ LinearOffsetVector& offsets = refIndex.Offsets;
+ d->InsertLinearOffset(offsets, bAlignment, lastOffset);
+ }
+
+ // if current BamAlignment bin != lastBin, "then possibly write the binning index"
+ if ( bAlignment.Bin != lastBin ) {
+
+ // if not first time through
+ if ( saveBin != defaultValue ) {
+
+ // save Bam bin entry
+ ReferenceIndex& refIndex = d->m_indexData.at(saveRefID);
+ BamBinMap& binMap = refIndex.Bins;
+ d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
+ }
+
+ // update saveOffset
+ saveOffset = lastOffset;
+
+ // update bin values
+ saveBin = bAlignment.Bin;
+ lastBin = bAlignment.Bin;
+
+ // update saveRefID
+ saveRefID = bAlignment.RefID;
+
+ // if invalid RefID, break out (why?)
+ if ( saveRefID < 0 ) { break; }
+ }
+
+ // make sure that current file pointer is beyond lastOffset
+ if ( m_BGZF->Tell() <= (int64_t)lastOffset ) {
+ printf("Error in BGZF offsets.\n");
+ exit(1);
+ }
+
+ // update lastOffset
+ lastOffset = m_BGZF->Tell();
+
+ // update lastCoordinate
+ lastCoordinate = bAlignment.Position;
+ }
+
+ // save any leftover BAM data (as long as refID is valid)
+ if ( saveRefID >= 0 ) {
+ // save Bam bin entry
+ ReferenceIndex& refIndex = d->m_indexData.at(saveRefID);
+ BamBinMap& binMap = refIndex.Bins;
+ d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
+ }
+
+ // simplify index by merging chunks
+ d->MergeChunks();
+
+ // iterate through references in index
+ // store whether reference has data &
+ // sort offsets in linear offset vector
+ BamDefaultIndexData::iterator indexIter = d->m_indexData.begin();
+ BamDefaultIndexData::iterator indexEnd = d->m_indexData.end();
+ for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
+
+ // get reference index data
+ ReferenceIndex& refIndex = (*indexIter);
+ BamBinMap& binMap = refIndex.Bins;
+ LinearOffsetVector& offsets = refIndex.Offsets;
+
+ // store whether reference has alignments or no
+ m_references[i].RefHasAlignments = ( binMap.size() > 0 );
+
+ // sort linear offsets
+ sort(offsets.begin(), offsets.end());
+ }
+
+ // rewind file pointer to beginning of alignments, return success/fail
+ return m_reader->Rewind();
+}
+
+bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
+
+ // calculate which bins overlap this region
+ uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
+ int numBins = d->BinsFromRegion(region, isRightBoundSpecified, bins);
+
+ // get bins for this reference
+ const ReferenceIndex& refIndex = d->m_indexData.at(region.LeftRefID);
+ const BamBinMap& binMap = refIndex.Bins;
+
+ // get minimum offset to consider
+ const LinearOffsetVector& linearOffsets = refIndex.Offsets;
+ uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
+
+ // store all alignment 'chunk' starts (file offsets) for bins in this region
+ for ( int i = 0; i < numBins; ++i ) {
+
+ const uint16_t binKey = bins[i];
+ map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
+ if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
+
+ const ChunkVector& chunks = (*binIter).second;
+ std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
+ std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
+ for ( ; chunksIter != chunksEnd; ++chunksIter) {
+
+ // if valid chunk found, store its file offset
+ const Chunk& chunk = (*chunksIter);
+ if ( chunk.Stop > minOffset )
+ offsets.push_back( chunk.Start );
+ }
+ }
+ }
+
+ // clean up memory
+ free(bins);
+
+ // sort the offsets before returning
+ sort(offsets.begin(), offsets.end());
+
+ // return whether any offsets were found
+ return ( offsets.size() != 0 );
+}
+
+// saves BAM bin entry for index
+void BamDefaultIndex::BamDefaultIndexPrivate::InsertBinEntry(BamBinMap& binMap,
+ const uint32_t& saveBin,
+ const uint64_t& saveOffset,
+ const uint64_t& lastOffset)
+{
+ // look up saveBin
+ BamBinMap::iterator binIter = binMap.find(saveBin);
+
+ // create new chunk
+ Chunk newChunk(saveOffset, lastOffset);
+
+ // if entry doesn't exist
+ if ( binIter == binMap.end() ) {
+ ChunkVector newChunks;
+ newChunks.push_back(newChunk);
+ binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
+ }
+
+ // otherwise
+ else {
+ ChunkVector& binChunks = (*binIter).second;
+ binChunks.push_back( newChunk );
+ }
+}
+
+// saves linear offset entry for index
+void BamDefaultIndex::BamDefaultIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets,
+ const BamAlignment& bAlignment,
+ const uint64_t& lastOffset)
+{
+ // get converted offsets
+ int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
+ int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
+
+ // resize vector if necessary
+ int oldSize = offsets.size();
+ int newSize = endOffset + 1;
+ if ( oldSize < newSize )
+ offsets.resize(newSize, 0);
+
+ // store offset
+ for( int i = beginOffset + 1; i <= endOffset; ++i ) {
+ if ( offsets[i] == 0 )
+ offsets[i] = lastOffset;
+ }
+}
+
+bool BamDefaultIndex::Load(const string& filename) {
+
+ // open index file, abort on error
+ FILE* indexStream = fopen(filename.c_str(), "rb");
+ if( !indexStream ) {
+ printf("ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
+ return false;
+ }
+
+ // set placeholder to receive input byte count (suppresses compiler warnings)
+ size_t elementsRead = 0;
+
+ // see if index is valid BAM index
+ char magic[4];
+ elementsRead = fread(magic, 1, 4, indexStream);
+ if ( strncmp(magic, "BAI\1", 4) ) {
+ printf("Problem with index file - invalid format.\n");
+ fclose(indexStream);
+ return false;
+ }
+
+ // get number of reference sequences
+ uint32_t numRefSeqs;
+ elementsRead = fread(&numRefSeqs, 4, 1, indexStream);
+ if ( m_isBigEndian ) { SwapEndian_32(numRefSeqs); }
+
+ // intialize space for BamDefaultIndexData data structure
+ d->m_indexData.reserve(numRefSeqs);
+
+ // iterate over reference sequences
+ for ( unsigned int i = 0; i < numRefSeqs; ++i ) {
+
+ // get number of bins for this reference sequence
+ int32_t numBins;
+ elementsRead = fread(&numBins, 4, 1, indexStream);
+ if ( m_isBigEndian ) { SwapEndian_32(numBins); }
+
+ if ( numBins > 0 ) {
+ RefData& refEntry = m_references[i];
+ refEntry.RefHasAlignments = true;
+ }
+
+ // intialize BinVector
+ BamBinMap binMap;
+
+ // iterate over bins for that reference sequence
+ for ( int j = 0; j < numBins; ++j ) {
+
+ // get binID
+ uint32_t binID;
+ elementsRead = fread(&binID, 4, 1, indexStream);
+
+ // get number of regionChunks in this bin
+ uint32_t numChunks;
+ elementsRead = fread(&numChunks, 4, 1, indexStream);
+
+ if ( m_isBigEndian ) {
+ SwapEndian_32(binID);
+ SwapEndian_32(numChunks);
+ }
+
+ // intialize ChunkVector
+ ChunkVector regionChunks;
+ regionChunks.reserve(numChunks);
+
+ // iterate over regionChunks in this bin
+ for ( unsigned int k = 0; k < numChunks; ++k ) {
+
+ // get chunk boundaries (left, right)
+ uint64_t left;
+ uint64_t right;
+ elementsRead = fread(&left, 8, 1, indexStream);
+ elementsRead = fread(&right, 8, 1, indexStream);
+
+ if ( m_isBigEndian ) {
+ SwapEndian_64(left);
+ SwapEndian_64(right);
+ }
+
+ // save ChunkPair
+ regionChunks.push_back( Chunk(left, right) );
+ }
+
+ // sort chunks for this bin
+ sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );
+
+ // save binID, chunkVector for this bin
+ binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );
+ }
+
+ // load linear index for this reference sequence
+
+ // get number of linear offsets
+ int32_t numLinearOffsets;
+ elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);
+ if ( m_isBigEndian ) { SwapEndian_32(numLinearOffsets); }
+
+ // intialize LinearOffsetVector
+ LinearOffsetVector offsets;
+ offsets.reserve(numLinearOffsets);
+
+ // iterate over linear offsets for this reference sequeence
+ uint64_t linearOffset;
+ for ( int j = 0; j < numLinearOffsets; ++j ) {
+ // read a linear offset & store
+ elementsRead = fread(&linearOffset, 8, 1, indexStream);
+ if ( m_isBigEndian ) { SwapEndian_64(linearOffset); }
+ offsets.push_back(linearOffset);
+ }
+
+ // sort linear offsets
+ sort( offsets.begin(), offsets.end() );
+
+ // store index data for that reference sequence
+ d->m_indexData.push_back( ReferenceIndex(binMap, offsets) );
+ }
+
+ // close index file (.bai) and return
+ fclose(indexStream);
+ return true;
+}
+
+// merges 'alignment chunks' in BAM bin (used for index building)
+void BamDefaultIndex::BamDefaultIndexPrivate::MergeChunks(void) {
+
+ // iterate over reference enties
+ BamDefaultIndexData::iterator indexIter = m_indexData.begin();
+ BamDefaultIndexData::iterator indexEnd = m_indexData.end();
+ for ( ; indexIter != indexEnd; ++indexIter ) {
+
+ // get BAM bin map for this reference
+ ReferenceIndex& refIndex = (*indexIter);
+ BamBinMap& bamBinMap = refIndex.Bins;
+
+ // iterate over BAM bins
+ BamBinMap::iterator binIter = bamBinMap.begin();
+ BamBinMap::iterator binEnd = bamBinMap.end();
+ for ( ; binIter != binEnd; ++binIter ) {
+
+ // get chunk vector for this bin
+ ChunkVector& binChunks = (*binIter).second;
+ if ( binChunks.size() == 0 ) { continue; }
+
+ ChunkVector mergedChunks;
+ mergedChunks.push_back( binChunks[0] );
+
+ // iterate over chunks
+ int i = 0;
+ ChunkVector::iterator chunkIter = binChunks.begin();
+ ChunkVector::iterator chunkEnd = binChunks.end();
+ for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
+
+ // get 'currentChunk' based on numeric index
+ Chunk& currentChunk = mergedChunks[i];
+
+ // get iteratorChunk based on vector iterator
+ Chunk& iteratorChunk = (*chunkIter);
+
+ // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)
+ if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {
+
+ // set currentChunk.Stop to iteratorChunk.Stop
+ currentChunk.Stop = iteratorChunk.Stop;
+ }
+
+ // otherwise
+ else {
+ // set currentChunk + 1 to iteratorChunk
+ mergedChunks.push_back(iteratorChunk);
+ ++i;
+ }
+ }
+
+ // saved merged chunk vector
+ (*binIter).second = mergedChunks;
+ }
+ }
+}
+
+// writes in-memory index data out to file
+// N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
+bool BamDefaultIndex::Write(const std::string& bamFilename) {
+
+ string indexFilename = bamFilename + ".bai";
+ FILE* indexStream = fopen(indexFilename.c_str(), "wb");
+ if ( indexStream == 0 ) {
+ printf("ERROR: Could not open file to save index.\n");
+ return false;
+ }
+
+ // write BAM index header
+ fwrite("BAI\1", 1, 4, indexStream);
+
+ // write number of reference sequences
+ int32_t numReferenceSeqs = d->m_indexData.size();
+ if ( m_isBigEndian ) { SwapEndian_32(numReferenceSeqs); }
+ fwrite(&numReferenceSeqs, 4, 1, indexStream);
+
+ // iterate over reference sequences
+ BamDefaultIndexData::const_iterator indexIter = d->m_indexData.begin();
+ BamDefaultIndexData::const_iterator indexEnd = d->m_indexData.end();
+ for ( ; indexIter != indexEnd; ++ indexIter ) {
+
+ // get reference index data
+ const ReferenceIndex& refIndex = (*indexIter);
+ const BamBinMap& binMap = refIndex.Bins;
+ const LinearOffsetVector& offsets = refIndex.Offsets;
+
+ // write number of bins
+ int32_t binCount = binMap.size();
+ if ( m_isBigEndian ) { SwapEndian_32(binCount); }
+ fwrite(&binCount, 4, 1, indexStream);
+
+ // iterate over bins
+ BamBinMap::const_iterator binIter = binMap.begin();
+ BamBinMap::const_iterator binEnd = binMap.end();
+ for ( ; binIter != binEnd; ++binIter ) {
+
+ // get bin data (key and chunk vector)
+ uint32_t binKey = (*binIter).first;
+ const ChunkVector& binChunks = (*binIter).second;
+
+ // save BAM bin key
+ if ( m_isBigEndian ) { SwapEndian_32(binKey); }
+ fwrite(&binKey, 4, 1, indexStream);
+
+ // save chunk count
+ int32_t chunkCount = binChunks.size();
+ if ( m_isBigEndian ) { SwapEndian_32(chunkCount); }
+ fwrite(&chunkCount, 4, 1, indexStream);
+
+ // iterate over chunks
+ ChunkVector::const_iterator chunkIter = binChunks.begin();
+ ChunkVector::const_iterator chunkEnd = binChunks.end();
+ for ( ; chunkIter != chunkEnd; ++chunkIter ) {
+
+ // get current chunk data
+ const Chunk& chunk = (*chunkIter);
+ uint64_t start = chunk.Start;
+ uint64_t stop = chunk.Stop;
+
+ if ( m_isBigEndian ) {
+ SwapEndian_64(start);
+ SwapEndian_64(stop);
+ }
+
+ // save chunk offsets
+ fwrite(&start, 8, 1, indexStream);
+ fwrite(&stop, 8, 1, indexStream);
+ }
+ }
+
+ // write linear offsets size
+ int32_t offsetSize = offsets.size();
+ if ( m_isBigEndian ) { SwapEndian_32(offsetSize); }
+ fwrite(&offsetSize, 4, 1, indexStream);
+
+ // iterate over linear offsets
+ LinearOffsetVector::const_iterator offsetIter = offsets.begin();
+ LinearOffsetVector::const_iterator offsetEnd = offsets.end();
+ for ( ; offsetIter != offsetEnd; ++offsetIter ) {
+
+ // write linear offset value
+ uint64_t linearOffset = (*offsetIter);
+ if ( m_isBigEndian ) { SwapEndian_64(linearOffset); }
+ fwrite(&linearOffset, 8, 1, indexStream);
+ }
+ }
+
+ // flush buffer, close file, and return success
+ fflush(indexStream);
+ fclose(indexStream);
+ return true;
+}
+
+// #########################################################################################
+// #########################################################################################
+
+// -------------------------------------
+// BamToolsIndex implementation
+
+namespace BamTools {
+
+struct BamToolsIndexEntry {
+
+ // data members
+ int64_t Offset;
+ int RefID;
+ int Position;
+
+ // ctor
+ BamToolsIndexEntry(const uint64_t& offset = 0,
+ const int& id = -1,
+ const int& position = -1)
+ : Offset(offset)
+ , RefID(id)
+ , Position(position)
+ { }
+};
+
+typedef vector<BamToolsIndexEntry> BamToolsIndexData;
+
+} // namespace BamTools
+
+struct BamToolsIndex::BamToolsIndexPrivate {
+
+ // -------------------------
+ // data members
+ BamToolsIndexData m_indexData;
+ BamToolsIndex* m_parent;
+ int32_t m_blockSize;
+
+ // -------------------------
+ // ctor & dtor
+
+ BamToolsIndexPrivate(BamToolsIndex* parent)
+ : m_parent(parent)
+ , m_blockSize(1000)
+ { }
+
+ ~BamToolsIndexPrivate(void) { }
+
+ // -------------------------
+ // internal methods
+};
+
+BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
+ : BamIndex(bgzf, reader, isBigEndian)
+{
+ d = new BamToolsIndexPrivate(this);
+}
+
+BamToolsIndex::~BamToolsIndex(void) {
+ delete d;
+ d = 0;
+}
+
+bool BamToolsIndex::Build(void) {
+
+ // be sure reader & BGZF file are valid & open for reading
+ if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
+ return false;
+
+ // move file pointer to beginning of alignments
+ m_reader->Rewind();
+
+ // plow through alignments, store block offsets
+ int32_t currentBlockCount = 0;
+ int64_t blockStartOffset = m_BGZF->Tell();
+ int blockStartId = -1;
+ int blockStartPosition = -1;
+ BamAlignment al;
+ while ( m_reader->GetNextAlignmentCore(al) ) {
+
+ // set reference flag
+ m_references[al.RefID].RefHasAlignments = true;
+
+ // if beginning of block, save first alignment's refID & position
+ if ( currentBlockCount == 0 ) {
+ blockStartId = al.RefID;
+ blockStartPosition = al.Position;
+ }
+
+ // increment block counter
+ ++currentBlockCount;
+
+ // if block is full, get offset for next block, reset currentBlockCount
+ if ( currentBlockCount == d->m_blockSize ) {
+
+// cerr << "-------------------------------" << endl;
+// cerr << "BlockCount = " << currentBlockCount << endl;
+// cerr << endl;
+// cerr << "Storing entry: " << endl;
+// cerr << "\trefID : " << blockStartId << endl;
+// cerr << "\tpos : " << blockStartPosition << endl;
+// cerr << "\toffset : " << blockStartOffset << endl;
+//
+
+ d->m_indexData.push_back( BamToolsIndexEntry(blockStartOffset, blockStartId, blockStartPosition) );
+ blockStartOffset = m_BGZF->Tell();
+ currentBlockCount = 0;
+ }
+ }
+
+ return m_reader->Rewind();
+}
+
+// N.B. - ignores isRightBoundSpecified
+bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
+
+ // return false if no index data present
+ if ( d->m_indexData.empty() ) return false;
+
+ // clear any prior data
+ offsets.clear();
+
+ // calculate nearest index to jump to
+ int64_t previousOffset = -1;
+ BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin();
+ BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end();
+ for ( ; indexIter != indexEnd; ++indexIter ) {
+
+ const BamToolsIndexEntry& entry = (*indexIter);
+
+ // check if we are 'past' beginning of desired region
+ // if so, we will break out & use previously stored offset
+ if ( entry.RefID > region.LeftRefID ) break;
+ if ( (entry.RefID == region.LeftRefID) && (entry.Position > region.LeftPosition) ) break;
+
+ // not past desired region, so store current entry offset in previousOffset
+ previousOffset = entry.Offset;
+ }
+
+ // no index was found
+ if ( previousOffset == -1 )
+ return false;
+
+ // store offset & return success
+/* cerr << "BTI::GetOffsets() : calculated offset = " << previousOffset << endl;*/
+ offsets.push_back(previousOffset);
+ return true;
+}
+
+bool BamToolsIndex::Load(const string& filename) {
+
+ // open index file, abort on error
+ FILE* indexStream = fopen(filename.c_str(), "rb");
+ if( !indexStream ) {
+ printf("ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
+ return false;
+ }
+
+ // set placeholder to receive input byte count (suppresses compiler warnings)
+ size_t elementsRead = 0;
+
+ // see if index is valid BAM index
+ char magic[4];
+ elementsRead = fread(magic, 1, 4, indexStream);
+ if ( strncmp(magic, "BTI\1", 4) ) {
+ printf("Problem with index file - invalid format.\n");
+ fclose(indexStream);
+ return false;
+ }
+
+ // read in block size
+ elementsRead = fread(&d->m_blockSize, sizeof(d->m_blockSize), 1, indexStream);
+ if ( m_isBigEndian ) { SwapEndian_32(d->m_blockSize); }
+
+ // read in number of offsets
+ uint32_t numOffsets;
+ elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream);
+ if ( m_isBigEndian ) { SwapEndian_32(numOffsets); }
+
+ // reserve space for index data
+ d->m_indexData.reserve(numOffsets);
+
+ // iterate over index entries
+ for ( unsigned int i = 0; i < numOffsets; ++i ) {
+
+ uint64_t offset;
+ int id;
+ int position;
+
+ // read in data
+ elementsRead = fread(&offset, sizeof(offset), 1, indexStream);
+ elementsRead = fread(&id, sizeof(id), 1, indexStream);
+ elementsRead = fread(&position, sizeof(position), 1, indexStream);
+
+ // swap endian-ness if necessary
+ if ( m_isBigEndian ) {
+ SwapEndian_64(offset);
+ SwapEndian_32(id);
+ SwapEndian_32(position);
+ }
+
+ // save reference index entry
+ d->m_indexData.push_back( BamToolsIndexEntry(offset, id, position) );
+
+ // set reference flag
+ m_references[id].RefHasAlignments = true; // what about sparse references? wont be able to set flag?
+ }
+
+ // close index file and return
+ fclose(indexStream);
+ return true;
+}
+
+// writes in-memory index data out to file
+// N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
+bool BamToolsIndex::Write(const std::string& bamFilename) {
+
+ string indexFilename = bamFilename + ".bti";
+ FILE* indexStream = fopen(indexFilename.c_str(), "wb");
+ if ( indexStream == 0 ) {
+ printf("ERROR: Could not open file to save index.\n");
+ return false;
+ }
+
+ // write BAM index header
+ fwrite("BTI\1", 1, 4, indexStream);
+
+ // write block size
+ int32_t blockSize = d->m_blockSize;
+ if ( m_isBigEndian ) { SwapEndian_32(blockSize); }
+ fwrite(&blockSize, sizeof(blockSize), 1, indexStream);
+
+ // write number of offset entries
+ uint32_t numOffsets = d->m_indexData.size();
+ if ( m_isBigEndian ) { SwapEndian_32(numOffsets); }
+ fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream);
+
+ // iterate over offset entries
+ BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin();
+ BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end();
+ for ( ; indexIter != indexEnd; ++ indexIter ) {
+
+ // get reference index data
+ const BamToolsIndexEntry& entry = (*indexIter);
+
+ // copy entry data
+ uint64_t offset = entry.Offset;
+ int id = entry.RefID;
+ int position = entry.Position;
+
+ // swap endian-ness if necessary
+ if ( m_isBigEndian ) {
+ SwapEndian_64(offset);
+ SwapEndian_32(id);
+ SwapEndian_32(position);
+ }
+
+ // write the reference index entry
+ fwrite(&offset, sizeof(offset), 1, indexStream);
+ fwrite(&id, sizeof(id), 1, indexStream);
+ fwrite(&position, sizeof(position), 1, indexStream);
+ }
+
+ // flush file buffer, close file, and return success
+ fflush(indexStream);
+ fclose(indexStream);
+ return true;
+}
--- /dev/null
+#ifndef BAM_INDEX_H
+#define BAM_INDEX_H
+
+#include <string>
+#include <vector>
+#include "BamAux.h"
+
+namespace BamTools {
+
+class BamReader;
+class BgzfData;
+
+// BamIndex base class
+class BamIndex {
+
+ public:
+ BamIndex(BamTools::BgzfData* bgzf,
+ BamTools::BamReader* reader,
+ bool isBigEndian);
+ virtual ~BamIndex(void) { }
+
+ public:
+ // creates index data (in-memory) from current reader data
+ virtual bool Build(void) =0;
+ // calculates offset(s) for a given region
+ virtual bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets) =0;
+ // loads existing data from file into memory
+ virtual bool Load(const std::string& filename) =0;
+ // returns whether reference has alignments or no
+ virtual bool HasAlignments(const int& referenceID);
+ // writes in-memory index data out to file
+ // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
+ virtual bool Write(const std::string& bamFilename) =0;
+
+ protected:
+ BamTools::BgzfData* m_BGZF;
+ BamTools::BamReader* m_reader;
+ BamTools::RefVector m_references;
+ bool m_isBigEndian;
+};
+
+// BamDefaultIndex class
+//
+// implements default (per SAM/BAM spec) index file ops
+class BamDefaultIndex : public BamIndex {
+
+
+ // ctor & dtor
+ public:
+ BamDefaultIndex(BamTools::BgzfData* bgzf,
+ BamTools::BamReader* reader,
+ bool isBigEndian);
+ ~BamDefaultIndex(void);
+
+ // interface (implements BamIndex virtual methods)
+ public:
+ // creates index data (in-memory) from current reader data
+ bool Build(void);
+ // calculates offset(s) for a given region
+ bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets);
+ // loads existing data from file into memory
+ bool Load(const std::string& filename);
+ // writes in-memory index data out to file
+ // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
+ bool Write(const std::string& bamFilename);
+
+ // internal implementation
+ private:
+ struct BamDefaultIndexPrivate;
+ BamDefaultIndexPrivate* d;
+};
+
+// BamToolsIndex class
+//
+// implements BamTools-specific index file ops
+class BamToolsIndex : public BamIndex {
+
+ // ctor & dtor
+ public:
+ BamToolsIndex(BamTools::BgzfData* bgzf,
+ BamTools::BamReader* reader,
+ bool isBigEndian);
+ ~BamToolsIndex(void);
+
+ // interface (implements BamIndex virtual methods)
+ public:
+ // creates index data (in-memory) from current reader data
+ bool Build(void);
+ // calculates offset(s) for a given region
+ bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets);
+ // loads existing data from file into memory
+ bool Load(const std::string& filename);
+ // writes in-memory index data out to file
+ // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
+ bool Write(const std::string& bamFilename);
+
+ // internal implementation
+ private:
+ struct BamToolsIndexPrivate;
+ BamToolsIndexPrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAM_INDEX_H
\ No newline at end of file
}
// opens BAM files
-void BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool coreMode) {
+void BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) {
+
// for filename in filenames
fileNames = filenames; // save filenames in our multireader
for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
string filename = *it;
BamReader* reader = new BamReader;
- // TODO; change Open to return success/failure so we can report errors here
+ bool openedOK = true;
if (openIndexes) {
- reader->Open(filename, filename + ".bai");
+ if (useDefaultIndex)
+ openedOK = reader->Open(filename, filename + ".bai");
+ else
+ openedOK = reader->Open(filename, filename + ".bti");
} else {
- reader->Open(filename); // for merging, jumping is disallowed
+ openedOK = reader->Open(filename); // for merging, jumping is disallowed
}
-
- bool fileOK = true;
- BamAlignment* alignment = new BamAlignment;
- if (coreMode) {
- fileOK &= reader->GetNextAlignmentCore(*alignment);
- } else {
- fileOK &= reader->GetNextAlignment(*alignment);
- }
- if (fileOK) {
- readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup
- alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
- make_pair(reader, alignment)));
- } else {
- cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl;
+
+ // if file opened ok, check that it can be read
+ if ( openedOK ) {
+
+ bool fileOK = true;
+ BamAlignment* alignment = new BamAlignment;
+ if (coreMode) {
+ fileOK &= reader->GetNextAlignmentCore(*alignment);
+ } else {
+ fileOK &= reader->GetNextAlignment(*alignment);
+ }
+
+ if (fileOK) {
+ readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup
+ alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+ make_pair(reader, alignment)));
+ } else {
+ cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl;
+ }
+
+ }
+
+ // TODO; error handling on openedOK == false
+ else {
+
+
}
-
}
+
ValidateReaders();
}
}
// saves index data to BAM index files (".bai") where necessary, returns success/fail
-bool BamMultiReader::CreateIndexes(void) {
+bool BamMultiReader::CreateIndexes(bool useDefaultIndex) {
bool result = true;
for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
BamReader* reader = it->first;
- result &= reader->CreateIndex();
+ result &= reader->CreateIndex(useDefaultIndex);
}
return result;
}
// indexes.\r
// @coreMode - setup our first alignments using GetNextAlignmentCore();\r
// also useful for merging\r
- void Open(const vector<string> filenames, bool openIndexes = true, bool coreMode = false);\r
+ void Open(const vector<string> filenames, bool openIndexes = true, bool coreMode = false, bool useDefaultIndex = true);\r
\r
// performs random-access jump to reference, position\r
bool Jump(int refID, int position = 0);\r
// ----------------------\r
\r
// creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai")\r
- bool CreateIndexes(void);\r
+ bool CreateIndexes(bool useDefaultIndex = true);\r
\r
//const int GetReferenceID(const string& refName) const;\r
\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 22 June 2010 (DB)\r
+// Last modified: 30 June 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
// BamTools includes\r
#include "BGZF.h"\r
#include "BamReader.h"\r
+#include "BamIndex.h"\r
using namespace BamTools;\r
using namespace std;\r
\r
// structs, enums, typedefs\r
// -------------------------------\r
enum RegionState { BEFORE_REGION = 0\r
- , WITHIN_REGION\r
- , AFTER_REGION\r
- };\r
+ , WITHIN_REGION\r
+ , AFTER_REGION\r
+ };\r
\r
// -------------------------------\r
// data members\r
// general file data\r
BgzfData mBGZF;\r
string HeaderText;\r
- BamIndex Index;\r
+ //BamIndex Index;\r
+ BamIndex* NewIndex;\r
RefVector References;\r
bool IsIndexLoaded;\r
int64_t AlignmentsBeginOffset;\r
int CurrentRefID;\r
int CurrentLeft;\r
\r
+ // parent BamReader\r
+ BamReader* Parent;\r
+ \r
// BAM character constants\r
const char* DNA_LOOKUP;\r
const char* CIGAR_LOOKUP;\r
// -------------------------------\r
// constructor & destructor\r
// -------------------------------\r
- BamReaderPrivate(void);\r
+ BamReaderPrivate(BamReader* parent);\r
~BamReaderPrivate(void);\r
\r
// -------------------------------\r
int GetReferenceID(const string& refName) const;\r
\r
// index operations\r
- bool CreateIndex(void);\r
+ bool CreateIndex(bool useDefaultIndex);\r
\r
// -------------------------------\r
// internal methods\r
\r
// *** reading alignments and auxiliary data *** //\r
\r
- // calculate bins that overlap region\r
- int BinsFromRegion(uint16_t bins[MAX_BIN]);\r
// fills out character data for BamAlignment data\r
bool BuildCharData(BamAlignment& bAlignment);\r
- // calculate file offset for first alignment chunk overlapping specified region\r
- int64_t GetOffset(std::vector<int64_t>& chunkStarts);\r
// checks to see if alignment overlaps current region\r
RegionState IsOverlap(BamAlignment& bAlignment);\r
// retrieves header text from BAM file\r
\r
// *** index file handling *** //\r
\r
- // calculates index for BAM file\r
- bool BuildIndex(void);\r
// clear out inernal index data structure\r
void ClearIndex(void);\r
- // saves BAM bin entry for index\r
- void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);\r
- // saves linear offset entry for index\r
- void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);\r
// loads index from BAM index file\r
bool LoadIndex(void);\r
- // simplifies index by merging 'chunks'\r
- void MergeChunks(void);\r
- // saves index to BAM index file\r
- bool WriteIndex(void);\r
};\r
\r
// -----------------------------------------------------\r
// BamReader implementation (wrapper around BRPrivate)\r
// -----------------------------------------------------\r
-\r
// constructor\r
BamReader::BamReader(void) {\r
- d = new BamReaderPrivate;\r
+ d = new BamReaderPrivate(this);\r
}\r
\r
// destructor\r
\r
// file operations\r
void BamReader::Close(void) { d->Close(); }\r
+bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; }\r
bool BamReader::Jump(int refID, int position) { \r
d->Region.LeftRefID = refID;\r
d->Region.LeftPosition = position;\r
// access auxiliary data\r
const string BamReader::GetHeaderText(void) const { return d->HeaderText; }\r
int BamReader::GetReferenceCount(void) const { return d->References.size(); }\r
-const RefVector BamReader::GetReferenceData(void) const { return d->References; }\r
+const RefVector& BamReader::GetReferenceData(void) const { return d->References; }\r
int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }\r
const std::string BamReader::GetFilename(void) const { return d->Filename; }\r
\r
// index operations\r
-bool BamReader::CreateIndex(void) { return d->CreateIndex(); }\r
+bool BamReader::CreateIndex(bool useDefaultIndex) { return d->CreateIndex(useDefaultIndex); }\r
\r
// -----------------------------------------------------\r
// BamReaderPrivate implementation\r
// -----------------------------------------------------\r
\r
// constructor\r
-BamReader::BamReaderPrivate::BamReaderPrivate(void)\r
- : IsIndexLoaded(false)\r
+BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent)\r
+ : NewIndex(0)\r
+ , IsIndexLoaded(false)\r
, AlignmentsBeginOffset(0)\r
, IsLeftBoundSpecified(false)\r
, IsRightBoundSpecified(false)\r
, IsRegionSpecified(false)\r
, CurrentRefID(0)\r
, CurrentLeft(0)\r
+ , Parent(parent)\r
, DNA_LOOKUP("=ACMGRSVTWYHKDBN")\r
, CIGAR_LOOKUP("MIDNSHP")\r
{ \r
Close();\r
}\r
\r
-// calculate bins that overlap region\r
-int BamReader::BamReaderPrivate::BinsFromRegion(uint16_t list[MAX_BIN]) {\r
-\r
- // get region boundaries\r
- uint32_t begin = (unsigned int)Region.LeftPosition;\r
- uint32_t end;\r
- \r
- // if right bound specified AND left&right bounds are on same reference\r
- // OK to use right bound position\r
- if ( IsRightBoundSpecified && ( Region.LeftRefID == Region.RightRefID ) )\r
- end = (unsigned int)Region.RightPosition; // -1 ??\r
- \r
- // otherwise, use end of left bound reference as cutoff\r
- else\r
- end = (unsigned int)References.at(Region.LeftRefID).RefLength - 1;\r
- \r
- // initialize list, bin '0' always a valid bin\r
- int i = 0;\r
- list[i++] = 0;\r
-\r
- // get rest of bins that contain this region\r
- unsigned int k;\r
- for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }\r
- for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }\r
- for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }\r
- for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }\r
- for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }\r
-\r
- // return number of bins stored\r
- return i;\r
-}\r
-\r
bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {\r
\r
// calculate character lengths/offsets\r
- const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
- const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);\r
- const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;\r
- const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;\r
- const unsigned int tagDataLength = dataLength - tagDataOffset;\r
+ const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
+ const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;\r
+ const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);\r
+ const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;\r
+ const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;\r
+ const unsigned int tagDataLength = dataLength - tagDataOffset;\r
\r
// set up char buffers\r
- const char* allCharData = bAlignment.SupportData.AllCharData.data();\r
- const char* seqData = ((const char*)allCharData) + seqDataOffset;\r
- const char* qualData = ((const char*)allCharData) + qualDataOffset;\r
- char* tagData = ((char*)allCharData) + tagDataOffset;\r
+ const char* allCharData = bAlignment.SupportData.AllCharData.data();\r
+ uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
+ const char* seqData = ((const char*)allCharData) + seqDataOffset;\r
+ const char* qualData = ((const char*)allCharData) + qualDataOffset;\r
+ char* tagData = ((char*)allCharData) + tagDataOffset;\r
\r
+ // store alignment name (depends on null char as terminator)\r
+ bAlignment.Name.assign((const char*)(allCharData)); \r
+ \r
+ // save CigarOps \r
+ CigarOp op;\r
+ bAlignment.CigarData.clear();\r
+ bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);\r
+ for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {\r
+\r
+ // swap if necessary\r
+ if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
+ \r
+ // build CigarOp structure\r
+ op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
+ op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
+\r
+ // save CigarOp\r
+ bAlignment.CigarData.push_back(op);\r
+ }\r
+ \r
+ \r
// save query sequence\r
bAlignment.QueryBases.clear();\r
bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
return true;\r
}\r
\r
-// populates BAM index data structure from BAM file data\r
-bool BamReader::BamReaderPrivate::BuildIndex(void) {\r
-\r
- // check to be sure file is open\r
- if (!mBGZF.IsOpen) { return false; }\r
-\r
- // move file pointer to beginning of alignments\r
- Rewind();\r
-\r
- // get reference count, reserve index space\r
- int numReferences = References.size();\r
- for ( int i = 0; i < numReferences; ++i ) {\r
- Index.push_back(ReferenceIndex());\r
- }\r
-\r
- // sets default constant for bin, ID, offset, coordinate variables\r
- const uint32_t defaultValue = 0xffffffffu;\r
-\r
- // bin data\r
- uint32_t saveBin(defaultValue);\r
- uint32_t lastBin(defaultValue);\r
-\r
- // reference ID data\r
- int32_t saveRefID(defaultValue);\r
- int32_t lastRefID(defaultValue);\r
-\r
- // offset data\r
- uint64_t saveOffset = mBGZF.Tell();\r
- uint64_t lastOffset = saveOffset;\r
-\r
- // coordinate data\r
- int32_t lastCoordinate = defaultValue;\r
-\r
- BamAlignment bAlignment;\r
- while( GetNextAlignment(bAlignment) ) {\r
-\r
- // change of chromosome, save ID, reset bin\r
- if ( lastRefID != bAlignment.RefID ) {\r
- lastRefID = bAlignment.RefID;\r
- lastBin = defaultValue;\r
- }\r
-\r
- // if lastCoordinate greater than BAM position - file not sorted properly\r
- else if ( lastCoordinate > bAlignment.Position ) {\r
- printf("BAM file not properly sorted:\n");\r
- printf("Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);\r
- exit(1);\r
- }\r
-\r
- // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)\r
- if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {\r
-\r
- // save linear offset entry (matched to BAM entry refID)\r
- ReferenceIndex& refIndex = Index.at(bAlignment.RefID);\r
- LinearOffsetVector& offsets = refIndex.Offsets;\r
- InsertLinearOffset(offsets, bAlignment, lastOffset);\r
- }\r
-\r
- // if current BamAlignment bin != lastBin, "then possibly write the binning index"\r
- if ( bAlignment.Bin != lastBin ) {\r
-\r
- // if not first time through\r
- if ( saveBin != defaultValue ) {\r
-\r
- // save Bam bin entry\r
- ReferenceIndex& refIndex = Index.at(saveRefID);\r
- BamBinMap& binMap = refIndex.Bins;\r
- InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);\r
- }\r
-\r
- // update saveOffset\r
- saveOffset = lastOffset;\r
-\r
- // update bin values\r
- saveBin = bAlignment.Bin;\r
- lastBin = bAlignment.Bin;\r
-\r
- // update saveRefID\r
- saveRefID = bAlignment.RefID;\r
-\r
- // if invalid RefID, break out (why?)\r
- if ( saveRefID < 0 ) { break; }\r
- }\r
-\r
- // make sure that current file pointer is beyond lastOffset\r
- if ( mBGZF.Tell() <= (int64_t)lastOffset ) {\r
- printf("Error in BGZF offsets.\n");\r
- exit(1);\r
- }\r
-\r
- // update lastOffset\r
- lastOffset = mBGZF.Tell();\r
-\r
- // update lastCoordinate\r
- lastCoordinate = bAlignment.Position;\r
- }\r
-\r
- // save any leftover BAM data (as long as refID is valid)\r
- if ( saveRefID >= 0 ) {\r
- // save Bam bin entry\r
- ReferenceIndex& refIndex = Index.at(saveRefID);\r
- BamBinMap& binMap = refIndex.Bins;\r
- InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);\r
- }\r
-\r
- // simplify index by merging chunks\r
- MergeChunks();\r
-\r
- // iterate over references\r
- BamIndex::iterator indexIter = Index.begin();\r
- BamIndex::iterator indexEnd = Index.end();\r
- for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {\r
-\r
- // get reference index data\r
- ReferenceIndex& refIndex = (*indexIter);\r
- BamBinMap& binMap = refIndex.Bins;\r
- LinearOffsetVector& offsets = refIndex.Offsets;\r
-\r
- // store whether reference has alignments or no\r
- References[i].RefHasAlignments = ( binMap.size() > 0 );\r
-\r
- // sort linear offsets\r
- sort(offsets.begin(), offsets.end());\r
- }\r
-\r
-\r
- // rewind file pointer to beginning of alignments, return success/fail\r
- return Rewind();\r
-}\r
-\r
-\r
// clear index data structure\r
void BamReader::BamReaderPrivate::ClearIndex(void) {\r
- Index.clear(); // sufficient ??\r
+ delete NewIndex;\r
+ NewIndex = 0;\r
}\r
\r
// closes the BAM file\r
void BamReader::BamReaderPrivate::Close(void) {\r
+ \r
+ // close BGZF file stream\r
mBGZF.Close();\r
+ \r
+ // clear out index data\r
ClearIndex();\r
+ \r
+ // clear out header data\r
HeaderText.clear();\r
- IsLeftBoundSpecified = false;\r
+ \r
+ // clear out region flags\r
+ IsLeftBoundSpecified = false;\r
IsRightBoundSpecified = false;\r
- IsRegionSpecified = false;\r
+ IsRegionSpecified = false;\r
}\r
\r
// create BAM index from BAM file (keep structure in memory) and write to default index output file\r
-bool BamReader::BamReaderPrivate::CreateIndex(void) {\r
+bool BamReader::BamReaderPrivate::CreateIndex(bool useDefaultIndex) {\r
\r
- // clear out index\r
+ // clear out prior index data\r
ClearIndex();\r
-\r
- // build (& save) index from BAM file\r
+ \r
+ // create default index\r
+ if ( useDefaultIndex )\r
+ NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
+ // create BamTools 'custom' index\r
+ else\r
+ NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
+ \r
bool ok = true;\r
- ok &= BuildIndex();\r
- ok &= WriteIndex();\r
-\r
+ ok &= NewIndex->Build();\r
+ ok &= NewIndex->Write(Filename); \r
+ \r
// return success/fail\r
return ok;\r
}\r
return false;\r
}\r
\r
-// calculate file offset for first alignment chunk overlapping specified region\r
-int64_t BamReader::BamReaderPrivate::GetOffset(std::vector<int64_t>& chunkStarts) {\r
-\r
- // calculate which bins overlap this region\r
- uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);\r
- int numBins = BinsFromRegion(bins);\r
-\r
- // get bins for this reference\r
- const ReferenceIndex& refIndex = Index.at(Region.LeftRefID);\r
- const BamBinMap& binMap = refIndex.Bins;\r
-\r
- // get minimum offset to consider\r
- const LinearOffsetVector& offsets = refIndex.Offsets;\r
- uint64_t minOffset = ( (unsigned int)(Region.LeftPosition>>BAM_LIDX_SHIFT) >= offsets.size() ) ? 0 : offsets.at(Region.LeftPosition>>BAM_LIDX_SHIFT);\r
-\r
- // store offsets to beginning of alignment 'chunks'\r
- //std::vector<int64_t> chunkStarts;\r
-\r
- // store all alignment 'chunk' starts for bins in this region\r
- for (int i = 0; i < numBins; ++i ) {\r
- \r
- const uint16_t binKey = bins[i];\r
- map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);\r
- if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {\r
-\r
- const ChunkVector& chunks = (*binIter).second;\r
- std::vector<Chunk>::const_iterator chunksIter = chunks.begin();\r
- std::vector<Chunk>::const_iterator chunksEnd = chunks.end();\r
- for ( ; chunksIter != chunksEnd; ++chunksIter) {\r
- const Chunk& chunk = (*chunksIter);\r
- if ( chunk.Stop > minOffset ) {\r
- chunkStarts.push_back( chunk.Start );\r
- }\r
- }\r
- }\r
- }\r
-\r
- // clean up memory\r
- free(bins);\r
-\r
- // if no alignments found, else return smallest offset for alignment starts\r
- if ( chunkStarts.size() == 0 ) { return -1; }\r
- else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }\r
-}\r
-\r
// returns RefID for given RefName (returns References.size() if not found)\r
int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {\r
\r
return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));\r
}\r
\r
-// saves BAM bin entry for index\r
-void BamReader::BamReaderPrivate::InsertBinEntry(BamBinMap& binMap,\r
- const uint32_t& saveBin,\r
- const uint64_t& saveOffset,\r
- const uint64_t& lastOffset)\r
-{\r
- // look up saveBin\r
- BamBinMap::iterator binIter = binMap.find(saveBin);\r
-\r
- // create new chunk\r
- Chunk newChunk(saveOffset, lastOffset);\r
-\r
- // if entry doesn't exist\r
- if ( binIter == binMap.end() ) {\r
- ChunkVector newChunks;\r
- newChunks.push_back(newChunk);\r
- binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));\r
- }\r
-\r
- // otherwise\r
- else {\r
- ChunkVector& binChunks = (*binIter).second;\r
- binChunks.push_back( newChunk );\r
- }\r
-}\r
-\r
-// saves linear offset entry for index\r
-void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets,\r
- const BamAlignment& bAlignment,\r
- const uint64_t& lastOffset)\r
-{\r
- // get converted offsets\r
- int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;\r
- int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;\r
-\r
- // resize vector if necessary\r
- int oldSize = offsets.size();\r
- int newSize = endOffset + 1;\r
- if ( oldSize < newSize ) { offsets.resize(newSize, 0); }\r
-\r
- // store offset\r
- for(int i = beginOffset + 1; i <= endOffset ; ++i) {\r
- if ( offsets[i] == 0) {\r
- offsets[i] = lastOffset;\r
- }\r
- }\r
-}\r
-\r
// returns region state - whether alignment ends before, overlaps, or starts after currently specified region\r
// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true\r
BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {\r
// jumps to specified region(refID, leftBound) in BAM file, returns success/fail\r
bool BamReader::BamReaderPrivate::Jump(int refID, int position) {\r
\r
- // if data exists for this reference and position is valid \r
- if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {\r
-\r
- // calculate offset\r
- std::vector<int64_t> chunkStarts;\r
- int64_t offset = GetOffset(chunkStarts);\r
- sort(chunkStarts.begin(), chunkStarts.end());\r
-\r
- // if in valid offset, return failure\r
- // otherwise return success of seek operation\r
- if ( offset == -1 ) {\r
- return false;\r
- } else {\r
- //return mBGZF.Seek(offset);\r
- BamAlignment bAlignment;\r
- bool result = true;\r
- for (std::vector<int64_t>::const_iterator o = chunkStarts.begin(); o != chunkStarts.end(); ++o) {\r
- // std::cerr << *o << endl;\r
- // std::cerr << "Seeking to offset: " << *o << endl;\r
- result &= mBGZF.Seek(*o);\r
- LoadNextAlignment(bAlignment);\r
- // std::cerr << "alignment: " << bAlignment.RefID \r
- // << ":" << bAlignment.Position << ".." << bAlignment.Position + bAlignment.Length << endl;\r
- if ((bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || bAlignment.RefID > refID) {\r
- // std::cerr << "here i am" << endl;\r
- // std::cerr << "seeking to offset: " << (*(o-1)) << endl;\r
- // std::cerr << "*** Finished jumping ***" << endl;\r
- return mBGZF.Seek(*o);\r
- }\r
- }\r
-\r
- //std::cerr << "*** Finished jumping ***" << endl;\r
- return result;\r
- }\r
+ // -----------------------------------------------------------------------\r
+ // check for existing index \r
+ if ( NewIndex == 0 ) return false; \r
+ // see if reference has alignments\r
+ if ( !NewIndex->HasAlignments(refID) ) return false; \r
+ // make sure position is valid\r
+ if ( position > References.at(refID).RefLength ) return false;\r
+ \r
+ // determine possible offsets\r
+ vector<int64_t> offsets;\r
+ if ( !NewIndex->GetOffsets(Region, IsRightBoundSpecified, offsets) ) {\r
+ printf("ERROR: Could not jump: unable to calculate offset for specified region.\n");\r
+ return false;\r
}\r
-\r
- // invalid jump request parameters, return failure\r
- return false;\r
+ \r
+ // iterate through offsets\r
+ BamAlignment bAlignment;\r
+ bool result = true;\r
+ for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {\r
+ \r
+ // attempt seek & load first available alignment\r
+ result &= mBGZF.Seek(*o);\r
+ LoadNextAlignment(bAlignment);\r
+ \r
+ // if this alignment corresponds to desired position\r
+ // return success of seeking back to 'current offset'\r
+ if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) )\r
+ return mBGZF.Seek(*o);\r
+ }\r
+ \r
+ return result;\r
}\r
\r
// load BAM header data\r
// load existing index data from BAM index file (".bai"), return success/fail\r
bool BamReader::BamReaderPrivate::LoadIndex(void) {\r
\r
- // clear out index data\r
+ // clear out any existing index data\r
ClearIndex();\r
\r
// skip if index file empty\r
- if ( IndexFilename.empty() ) { return false; }\r
-\r
- // open index file, abort on error\r
- FILE* indexStream = fopen(IndexFilename.c_str(), "rb");\r
- if(!indexStream) {\r
- printf("ERROR: Unable to open the BAM index file %s for reading.\n", IndexFilename.c_str() );\r
+ if ( IndexFilename.empty() )\r
return false;\r
- }\r
\r
- size_t elementsRead = 0;\r
- \r
- // see if index is valid BAM index\r
- char magic[4];\r
- elementsRead = fread(magic, 1, 4, indexStream);\r
- if (strncmp(magic, "BAI\1", 4)) {\r
- printf("Problem with index file - invalid format.\n");\r
- fclose(indexStream);\r
+ // check supplied filename for index type\r
+ size_t defaultExtensionFound = IndexFilename.find(".bai");\r
+ size_t customExtensionFound = IndexFilename.find(".bti");\r
+ \r
+ // if SAM/BAM default (".bai")\r
+ if ( defaultExtensionFound != string::npos )\r
+ NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
+ \r
+ // if BamTools custom index (".bti")\r
+ else if ( customExtensionFound != string::npos )\r
+ NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
+ \r
+ // else unknown\r
+ else {\r
+ printf("ERROR: Unknown index file extension.\n");\r
return false;\r
}\r
-\r
- // get number of reference sequences\r
- uint32_t numRefSeqs;\r
- elementsRead = fread(&numRefSeqs, 4, 1, indexStream);\r
- if ( IsBigEndian ) { SwapEndian_32(numRefSeqs); }\r
\r
- // intialize space for BamIndex data structure\r
- Index.reserve(numRefSeqs);\r
-\r
- // iterate over reference sequences\r
- for (unsigned int i = 0; i < numRefSeqs; ++i) {\r
-\r
- // get number of bins for this reference sequence\r
- int32_t numBins;\r
- elementsRead = fread(&numBins, 4, 1, indexStream);\r
- if ( IsBigEndian ) { SwapEndian_32(numBins); }\r
-\r
- if (numBins > 0) {\r
- RefData& refEntry = References[i];\r
- refEntry.RefHasAlignments = true;\r
- }\r
-\r
- // intialize BinVector\r
- BamBinMap binMap;\r
-\r
- // iterate over bins for that reference sequence\r
- for (int j = 0; j < numBins; ++j) {\r
-\r
- // get binID\r
- uint32_t binID;\r
- elementsRead = fread(&binID, 4, 1, indexStream);\r
-\r
- // get number of regionChunks in this bin\r
- uint32_t numChunks;\r
- elementsRead = fread(&numChunks, 4, 1, indexStream);\r
-\r
- if ( IsBigEndian ) { \r
- SwapEndian_32(binID);\r
- SwapEndian_32(numChunks);\r
- }\r
- \r
- // intialize ChunkVector\r
- ChunkVector regionChunks;\r
- regionChunks.reserve(numChunks);\r
-\r
- // iterate over regionChunks in this bin\r
- for (unsigned int k = 0; k < numChunks; ++k) {\r
-\r
- // get chunk boundaries (left, right)\r
- uint64_t left;\r
- uint64_t right;\r
- elementsRead = fread(&left, 8, 1, indexStream);\r
- elementsRead = fread(&right, 8, 1, indexStream);\r
-\r
- if ( IsBigEndian ) {\r
- SwapEndian_64(left);\r
- SwapEndian_64(right);\r
- }\r
- \r
- // save ChunkPair\r
- regionChunks.push_back( Chunk(left, right) );\r
- }\r
-\r
- // sort chunks for this bin\r
- sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );\r
-\r
- // save binID, chunkVector for this bin\r
- binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );\r
- }\r
-\r
- // load linear index for this reference sequence\r
-\r
- // get number of linear offsets\r
- int32_t numLinearOffsets;\r
- elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);\r
- if ( IsBigEndian ) { SwapEndian_32(numLinearOffsets); }\r
-\r
- // intialize LinearOffsetVector\r
- LinearOffsetVector offsets;\r
- offsets.reserve(numLinearOffsets);\r
-\r
- // iterate over linear offsets for this reference sequeence\r
- uint64_t linearOffset;\r
- for (int j = 0; j < numLinearOffsets; ++j) {\r
- // read a linear offset & store\r
- elementsRead = fread(&linearOffset, 8, 1, indexStream);\r
- if ( IsBigEndian ) { SwapEndian_64(linearOffset); }\r
- offsets.push_back(linearOffset);\r
- }\r
-\r
- // sort linear offsets\r
- sort( offsets.begin(), offsets.end() );\r
-\r
- // store index data for that reference sequence\r
- Index.push_back( ReferenceIndex(binMap, offsets) );\r
- }\r
-\r
- // close index file (.bai) and return\r
- fclose(indexStream);\r
- return true;\r
+ // return success of loading index data\r
+ return NewIndex->Load(IndexFilename);\r
}\r
\r
// populates BamAlignment with alignment data under file pointer, returns success/fail\r
bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);\r
\r
- // store 'all char data' and cigar ops\r
- const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
- const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;\r
- \r
- char* allCharData = (char*)calloc(sizeof(char), dataLength);\r
- uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
+ // set BamAlignment length\r
+ bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;\r
\r
// read in character data - make sure proper data size was read\r
- if ( mBGZF.Read(allCharData, dataLength) != (signed int)dataLength) { return false; }\r
- else {\r
- \r
- // store alignment name and length\r
- bAlignment.Name.assign((const char*)(allCharData));\r
- bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;\r
+ bool readCharDataOK = false;\r
+ const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
+ char* allCharData = (char*)calloc(sizeof(char), dataLength);\r
+ \r
+ if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { \r
\r
- // store remaining 'allCharData' in supportData structure\r
+ // store 'allCharData' in supportData structure\r
bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);\r
\r
- // save CigarOps for BamAlignment\r
- CigarOp op;\r
- bAlignment.CigarData.clear();\r
- bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);\r
- for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {\r
-\r
- // swap if necessary\r
- if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
- \r
- // build CigarOp structure\r
- op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
- op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
-\r
- // save CigarOp\r
- bAlignment.CigarData.push_back(op);\r
- }\r
+ // set success flag\r
+ readCharDataOK = true;\r
}\r
\r
free(allCharData);\r
- return true;\r
+ return readCharDataOK;\r
}\r
\r
// loads reference data from BAM file\r
}\r
}\r
\r
-// merges 'alignment chunks' in BAM bin (used for index building)\r
-void BamReader::BamReaderPrivate::MergeChunks(void) {\r
-\r
- // iterate over reference enties\r
- BamIndex::iterator indexIter = Index.begin();\r
- BamIndex::iterator indexEnd = Index.end();\r
- for ( ; indexIter != indexEnd; ++indexIter ) {\r
-\r
- // get BAM bin map for this reference\r
- ReferenceIndex& refIndex = (*indexIter);\r
- BamBinMap& bamBinMap = refIndex.Bins;\r
-\r
- // iterate over BAM bins\r
- BamBinMap::iterator binIter = bamBinMap.begin();\r
- BamBinMap::iterator binEnd = bamBinMap.end();\r
- for ( ; binIter != binEnd; ++binIter ) {\r
-\r
- // get chunk vector for this bin\r
- ChunkVector& binChunks = (*binIter).second;\r
- if ( binChunks.size() == 0 ) { continue; }\r
-\r
- ChunkVector mergedChunks;\r
- mergedChunks.push_back( binChunks[0] );\r
-\r
- // iterate over chunks\r
- int i = 0;\r
- ChunkVector::iterator chunkIter = binChunks.begin();\r
- ChunkVector::iterator chunkEnd = binChunks.end();\r
- for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {\r
-\r
- // get 'currentChunk' based on numeric index\r
- Chunk& currentChunk = mergedChunks[i];\r
-\r
- // get iteratorChunk based on vector iterator\r
- Chunk& iteratorChunk = (*chunkIter);\r
-\r
- // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)\r
- if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {\r
-\r
- // set currentChunk.Stop to iteratorChunk.Stop\r
- currentChunk.Stop = iteratorChunk.Stop;\r
- }\r
-\r
- // otherwise\r
- else {\r
- // set currentChunk + 1 to iteratorChunk\r
- mergedChunks.push_back(iteratorChunk);\r
- ++i;\r
- }\r
- }\r
-\r
- // saved merged chunk vector\r
- (*binIter).second = mergedChunks;\r
- }\r
- }\r
-}\r
-\r
// opens BAM file (and index)\r
bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) {\r
\r
AlignmentsBeginOffset = mBGZF.Tell();\r
\r
// open index file & load index data (if exists)\r
- if ( !IndexFilename.empty() ) {\r
+ if ( !IndexFilename.empty() )\r
LoadIndex();\r
- }\r
\r
// return success\r
return true;\r
\r
// returns BAM file pointer to beginning of alignment data\r
bool BamReader::BamReaderPrivate::Rewind(void) {\r
-\r
+ \r
// find first reference that has alignments in the BAM file\r
int refID = 0;\r
int refCount = References.size();\r
for ( ; refID < refCount; ++refID ) {\r
- if ( References.at(refID).RefHasAlignments ) { break; }\r
+ if ( References.at(refID).RefHasAlignments ) \r
+ break;\r
}\r
\r
// reset default region info\r
// attempt jump to beginning of region, return success/fail of Jump()\r
return Jump( Region.LeftRefID, Region.LeftPosition );\r
}\r
-\r
-// saves index data to BAM index file (".bai"), returns success/fail\r
-bool BamReader::BamReaderPrivate::WriteIndex(void) {\r
-\r
- IndexFilename = Filename + ".bai";\r
- FILE* indexStream = fopen(IndexFilename.c_str(), "wb");\r
- if ( indexStream == 0 ) {\r
- printf("ERROR: Could not open file to save index\n");\r
- return false;\r
- }\r
-\r
- // write BAM index header\r
- fwrite("BAI\1", 1, 4, indexStream);\r
-\r
- // write number of reference sequences\r
- int32_t numReferenceSeqs = Index.size();\r
- if ( IsBigEndian ) { SwapEndian_32(numReferenceSeqs); }\r
- fwrite(&numReferenceSeqs, 4, 1, indexStream);\r
-\r
- // iterate over reference sequences\r
- BamIndex::const_iterator indexIter = Index.begin();\r
- BamIndex::const_iterator indexEnd = Index.end();\r
- for ( ; indexIter != indexEnd; ++ indexIter ) {\r
-\r
- // get reference index data\r
- const ReferenceIndex& refIndex = (*indexIter);\r
- const BamBinMap& binMap = refIndex.Bins;\r
- const LinearOffsetVector& offsets = refIndex.Offsets;\r
-\r
- // write number of bins\r
- int32_t binCount = binMap.size();\r
- if ( IsBigEndian ) { SwapEndian_32(binCount); }\r
- fwrite(&binCount, 4, 1, indexStream);\r
-\r
- // iterate over bins\r
- BamBinMap::const_iterator binIter = binMap.begin();\r
- BamBinMap::const_iterator binEnd = binMap.end();\r
- for ( ; binIter != binEnd; ++binIter ) {\r
-\r
- // get bin data (key and chunk vector)\r
- uint32_t binKey = (*binIter).first;\r
- const ChunkVector& binChunks = (*binIter).second;\r
-\r
- // save BAM bin key\r
- if ( IsBigEndian ) { SwapEndian_32(binKey); }\r
- fwrite(&binKey, 4, 1, indexStream);\r
-\r
- // save chunk count\r
- int32_t chunkCount = binChunks.size();\r
- if ( IsBigEndian ) { SwapEndian_32(chunkCount); }\r
- fwrite(&chunkCount, 4, 1, indexStream);\r
-\r
- // iterate over chunks\r
- ChunkVector::const_iterator chunkIter = binChunks.begin();\r
- ChunkVector::const_iterator chunkEnd = binChunks.end();\r
- for ( ; chunkIter != chunkEnd; ++chunkIter ) {\r
-\r
- // get current chunk data\r
- const Chunk& chunk = (*chunkIter);\r
- uint64_t start = chunk.Start;\r
- uint64_t stop = chunk.Stop;\r
-\r
- if ( IsBigEndian ) {\r
- SwapEndian_64(start);\r
- SwapEndian_64(stop);\r
- }\r
- \r
- // save chunk offsets\r
- fwrite(&start, 8, 1, indexStream);\r
- fwrite(&stop, 8, 1, indexStream);\r
- }\r
- }\r
-\r
- // write linear offsets size\r
- int32_t offsetSize = offsets.size();\r
- if ( IsBigEndian ) { SwapEndian_32(offsetSize); }\r
- fwrite(&offsetSize, 4, 1, indexStream);\r
-\r
- // iterate over linear offsets\r
- LinearOffsetVector::const_iterator offsetIter = offsets.begin();\r
- LinearOffsetVector::const_iterator offsetEnd = offsets.end();\r
- for ( ; offsetIter != offsetEnd; ++offsetIter ) {\r
-\r
- // write linear offset value\r
- uint64_t linearOffset = (*offsetIter);\r
- if ( IsBigEndian ) { SwapEndian_64(linearOffset); }\r
- fwrite(&linearOffset, 8, 1, indexStream);\r
- }\r
- }\r
-\r
- // flush buffer, close file, and return success\r
- fflush(indexStream);\r
- fclose(indexStream);\r
- return true;\r
-}\r
\r
// close BAM file\r
void Close(void);\r
+ // returns whether reader is open for reading or not\r
+ bool IsOpen(void) const;\r
// performs random-access jump to reference, position\r
bool Jump(int refID, int position = 0);\r
// opens BAM file (and optional BAM index file, if provided)\r
// returns number of reference sequences\r
int GetReferenceCount(void) const;\r
// returns vector of reference objects\r
- const BamTools::RefVector GetReferenceData(void) const;\r
+ const BamTools::RefVector& GetReferenceData(void) const;\r
// returns reference id (used for BamReader::Jump()) for the given reference name\r
int GetReferenceID(const std::string& refName) const;\r
// returns the name of the file associated with this BamReader\r
// ----------------------\r
\r
// creates index for BAM file, saves to file (default = bamFilename + ".bai")\r
- bool CreateIndex(void);\r
-\r
+ bool CreateIndex(bool useDefaultIndex = true);\r
+ \r
// private implementation\r
private:\r
struct BamReaderPrivate;\r
BgzfData mBGZF;\r
bool IsBigEndian;\r
\r
- \r
// constructor / destructor\r
BamWriterPrivate(void) { \r
IsBigEndian = SystemIsBigEndian(); \r