1 // ***************************************************************************
2 // BamIndex.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 18 September 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index functionality - both for the default (standardized) BAM
9 // index format (.bai) as well as a BamTools-specific (nonstandard) index
11 // ***************************************************************************
19 #include "BamReader.h"
22 using namespace BamTools;
24 // -------------------------------
25 // BamIndex implementation
27 BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian)
30 , m_isBigEndian(isBigEndian)
32 if ( m_reader && m_reader->IsOpen() )
33 m_references = m_reader->GetReferenceData();
36 bool BamIndex::HasAlignments(const int& referenceID) {
38 // return false if invalid ID
39 if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) )
42 // else return status of reference (has alignments?)
44 return m_references.at(referenceID).RefHasAlignments;
47 // #########################################################################################
48 // #########################################################################################
50 // -------------------------------
51 // BamStandardIndex structs & typedefs
55 // BAM index constants
56 const int MAX_BIN = 37450; // =(8^6-1)/7+1
57 const int BAM_LIDX_SHIFT = 14;
59 // --------------------------------------------------
60 // BamStandardIndex data structures & typedefs
68 Chunk(const uint64_t& start = 0,
69 const uint64_t& stop = 0)
75 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
76 return lhs.Start < rhs.Start;
79 typedef vector<Chunk> ChunkVector;
80 typedef map<uint32_t, ChunkVector> BamBinMap;
81 typedef vector<uint64_t> LinearOffsetVector;
83 struct ReferenceIndex {
87 LinearOffsetVector Offsets;
90 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
91 const LinearOffsetVector& offsets = LinearOffsetVector())
97 typedef vector<ReferenceIndex> BamStandardIndexData;
99 } // namespace BamTools
101 // -------------------------------
102 // BamStandardIndexPrivate implementation
104 struct BamStandardIndex::BamStandardIndexPrivate {
106 // -------------------------
109 BamStandardIndexData m_indexData;
110 BamStandardIndex* m_parent;
112 // -------------------------
115 BamStandardIndexPrivate(BamStandardIndex* parent) : m_parent(parent) { }
116 ~BamStandardIndexPrivate(void) { m_indexData.clear(); }
118 // -------------------------
121 // creates index data (in-memory) from current reader data
123 // attempts to use index to jump to region; returns success/fail
124 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
125 // loads existing data from file into memory
126 bool Load(const std::string& filename);
127 // writes in-memory index data out to file
128 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
129 bool Write(const std::string& bamFilename);
131 // -------------------------
134 // calculate bins that overlap region
135 int BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]);
136 // calculates offset(s) for a given region
137 bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets);
138 // saves BAM bin entry for index
139 void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);
140 // saves linear offset entry for index
141 void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);
142 // simplifies index by merging 'chunks'
143 void MergeChunks(void);
146 // calculate bins that overlap region
147 int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) {
149 // get region boundaries
150 uint32_t begin = (unsigned int)region.LeftPosition;
153 // if right bound specified AND left&right bounds are on same reference
154 // OK to use right bound position
155 if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
156 end = (unsigned int)region.RightPosition;
158 // otherwise, use end of left bound reference as cutoff
160 end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
162 // initialize list, bin '0' always a valid bin
166 // get rest of bins that contain this region
168 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; }
169 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; }
170 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; }
171 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; }
172 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
174 // return number of bins stored
178 bool BamStandardIndex::BamStandardIndexPrivate::Build(void) {
180 // localize parent data
181 if ( m_parent == 0 ) return false;
182 BamReader* reader = m_parent->m_reader;
183 BgzfData* mBGZF = m_parent->m_BGZF;
184 RefVector& references = m_parent->m_references;
186 // be sure reader & BGZF file are valid & open for reading
187 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
190 // move file pointer to beginning of alignments
193 // get reference count, reserve index space
194 int numReferences = (int)references.size();
195 for ( int i = 0; i < numReferences; ++i )
196 m_indexData.push_back(ReferenceIndex());
198 // sets default constant for bin, ID, offset, coordinate variables
199 const uint32_t defaultValue = 0xffffffffu;
202 uint32_t saveBin(defaultValue);
203 uint32_t lastBin(defaultValue);
206 int32_t saveRefID(defaultValue);
207 int32_t lastRefID(defaultValue);
210 uint64_t saveOffset = mBGZF->Tell();
211 uint64_t lastOffset = saveOffset;
214 int32_t lastCoordinate = defaultValue;
216 BamAlignment bAlignment;
217 while ( reader->GetNextAlignmentCore(bAlignment) ) {
219 // change of chromosome, save ID, reset bin
220 if ( lastRefID != bAlignment.RefID ) {
221 lastRefID = bAlignment.RefID;
222 lastBin = defaultValue;
225 // if lastCoordinate greater than BAM position - file not sorted properly
226 else if ( lastCoordinate > bAlignment.Position ) {
227 fprintf(stderr, "BAM file not properly sorted:\n");
228 fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);
232 // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
233 if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
235 // save linear offset entry (matched to BAM entry refID)
236 ReferenceIndex& refIndex = m_indexData.at(bAlignment.RefID);
237 LinearOffsetVector& offsets = refIndex.Offsets;
238 InsertLinearOffset(offsets, bAlignment, lastOffset);
241 // if current BamAlignment bin != lastBin, "then possibly write the binning index"
242 if ( bAlignment.Bin != lastBin ) {
244 // if not first time through
245 if ( saveBin != defaultValue ) {
247 // save Bam bin entry
248 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
249 BamBinMap& binMap = refIndex.Bins;
250 InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
254 saveOffset = lastOffset;
257 saveBin = bAlignment.Bin;
258 lastBin = bAlignment.Bin;
261 saveRefID = bAlignment.RefID;
263 // if invalid RefID, break out
264 if ( saveRefID < 0 ) break;
267 // make sure that current file pointer is beyond lastOffset
268 if ( mBGZF->Tell() <= (int64_t)lastOffset ) {
269 fprintf(stderr, "Error in BGZF offsets.\n");
274 lastOffset = mBGZF->Tell();
276 // update lastCoordinate
277 lastCoordinate = bAlignment.Position;
280 // save any leftover BAM data (as long as refID is valid)
281 if ( saveRefID >= 0 ) {
282 // save Bam bin entry
283 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
284 BamBinMap& binMap = refIndex.Bins;
285 InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
288 // simplify index by merging chunks
291 // iterate through references in index
292 // store whether reference has data &
293 // sort offsets in linear offset vector
294 BamStandardIndexData::iterator indexIter = m_indexData.begin();
295 BamStandardIndexData::iterator indexEnd = m_indexData.end();
296 for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
298 // get reference index data
299 ReferenceIndex& refIndex = (*indexIter);
300 BamBinMap& binMap = refIndex.Bins;
301 LinearOffsetVector& offsets = refIndex.Offsets;
303 // store whether reference has alignments or no
304 references[i].RefHasAlignments = ( binMap.size() > 0 );
306 // sort linear offsets
307 sort(offsets.begin(), offsets.end());
310 // rewind file pointer to beginning of alignments, return success/fail
311 return reader->Rewind();
314 bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
316 // calculate which bins overlap this region
317 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
318 int numBins = BinsFromRegion(region, isRightBoundSpecified, bins);
320 // get bins for this reference
321 const ReferenceIndex& refIndex = m_indexData.at(region.LeftRefID);
322 const BamBinMap& binMap = refIndex.Bins;
324 // get minimum offset to consider
325 const LinearOffsetVector& linearOffsets = refIndex.Offsets;
326 uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
328 // store all alignment 'chunk' starts (file offsets) for bins in this region
329 for ( int i = 0; i < numBins; ++i ) {
331 const uint16_t binKey = bins[i];
332 map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
333 if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
335 // iterate over chunks
336 const ChunkVector& chunks = (*binIter).second;
337 std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
338 std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
339 for ( ; chunksIter != chunksEnd; ++chunksIter) {
341 // if valid chunk found, store its file offset
342 const Chunk& chunk = (*chunksIter);
343 if ( chunk.Stop > minOffset )
344 offsets.push_back( chunk.Start );
352 // sort the offsets before returning
353 sort(offsets.begin(), offsets.end());
355 // return whether any offsets were found
356 return ( offsets.size() != 0 );
359 // saves BAM bin entry for index
360 void BamStandardIndex::BamStandardIndexPrivate::InsertBinEntry(BamBinMap& binMap,
361 const uint32_t& saveBin,
362 const uint64_t& saveOffset,
363 const uint64_t& lastOffset)
366 BamBinMap::iterator binIter = binMap.find(saveBin);
369 Chunk newChunk(saveOffset, lastOffset);
371 // if entry doesn't exist
372 if ( binIter == binMap.end() ) {
373 ChunkVector newChunks;
374 newChunks.push_back(newChunk);
375 binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
380 ChunkVector& binChunks = (*binIter).second;
381 binChunks.push_back( newChunk );
385 // saves linear offset entry for index
386 void BamStandardIndex::BamStandardIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets,
387 const BamAlignment& bAlignment,
388 const uint64_t& lastOffset)
390 // get converted offsets
391 int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
392 int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
394 // resize vector if necessary
395 int oldSize = offsets.size();
396 int newSize = endOffset + 1;
397 if ( oldSize < newSize )
398 offsets.resize(newSize, 0);
401 for( int i = beginOffset + 1; i <= endOffset; ++i ) {
402 if ( offsets[i] == 0 )
403 offsets[i] = lastOffset;
407 bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
409 // localize parent data
410 if ( m_parent == 0 ) return false;
411 BamReader* reader = m_parent->m_reader;
412 BgzfData* mBGZF = m_parent->m_BGZF;
413 RefVector& references = m_parent->m_references;
415 // be sure reader & BGZF file are valid & open for reading
416 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
419 // see if left-bound reference of region has alignments
420 if ( !m_parent->HasAlignments(region.LeftRefID) ) return false;
422 // make sure left-bound position is valid
423 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false;
425 // calculate offsets for this region
426 // if failed, print message, set flag, and return failure
427 vector<int64_t> offsets;
428 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) {
429 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
430 *hasAlignmentsInRegion = false;
434 // iterate through offsets
435 BamAlignment bAlignment;
437 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
439 // attempt seek & load first available alignment
440 // set flag to true if data exists
441 result &= mBGZF->Seek(*o);
442 *hasAlignmentsInRegion = reader->GetNextAlignmentCore(bAlignment);
444 // if this alignment corresponds to desired position
445 // return success of seeking back to the offset before the 'current offset' (to cover overlaps)
446 if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) {
447 if ( o != offsets.begin() ) --o;
448 return mBGZF->Seek(*o);
452 // if error in jumping, print message & set flag
454 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
455 *hasAlignmentsInRegion = false;
458 // return success/failure
462 bool BamStandardIndex::BamStandardIndexPrivate::Load(const string& filename) {
464 // localize parent data
465 if ( m_parent == 0 ) return false;
466 const bool isBigEndian = m_parent->m_isBigEndian;
467 RefVector& references = m_parent->m_references;
469 // open index file, abort on error
470 FILE* indexStream = fopen(filename.c_str(), "rb");
472 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
476 // set placeholder to receive input byte count (suppresses compiler warnings)
477 size_t elementsRead = 0;
479 // see if index is valid BAM index
481 elementsRead = fread(magic, 1, 4, indexStream);
482 if ( strncmp(magic, "BAI\1", 4) ) {
483 fprintf(stderr, "Problem with index file - invalid format.\n");
488 // get number of reference sequences
490 elementsRead = fread(&numRefSeqs, 4, 1, indexStream);
491 if ( isBigEndian ) SwapEndian_32(numRefSeqs);
493 // intialize space for BamStandardIndexData data structure
494 m_indexData.reserve(numRefSeqs);
496 // iterate over reference sequences
497 for ( unsigned int i = 0; i < numRefSeqs; ++i ) {
499 // get number of bins for this reference sequence
501 elementsRead = fread(&numBins, 4, 1, indexStream);
502 if ( isBigEndian ) SwapEndian_32(numBins);
505 RefData& refEntry = references[i];
506 refEntry.RefHasAlignments = true;
509 // intialize BinVector
512 // iterate over bins for that reference sequence
513 for ( int j = 0; j < numBins; ++j ) {
517 elementsRead = fread(&binID, 4, 1, indexStream);
519 // get number of regionChunks in this bin
521 elementsRead = fread(&numChunks, 4, 1, indexStream);
524 SwapEndian_32(binID);
525 SwapEndian_32(numChunks);
528 // intialize ChunkVector
529 ChunkVector regionChunks;
530 regionChunks.reserve(numChunks);
532 // iterate over regionChunks in this bin
533 for ( unsigned int k = 0; k < numChunks; ++k ) {
535 // get chunk boundaries (left, right)
538 elementsRead = fread(&left, 8, 1, indexStream);
539 elementsRead = fread(&right, 8, 1, indexStream);
543 SwapEndian_64(right);
547 regionChunks.push_back( Chunk(left, right) );
550 // sort chunks for this bin
551 sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );
553 // save binID, chunkVector for this bin
554 binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );
557 // -----------------------------------------------------
558 // load linear index for this reference sequence
560 // get number of linear offsets
561 int32_t numLinearOffsets;
562 elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);
563 if ( isBigEndian ) SwapEndian_32(numLinearOffsets);
565 // intialize LinearOffsetVector
566 LinearOffsetVector offsets;
567 offsets.reserve(numLinearOffsets);
569 // iterate over linear offsets for this reference sequeence
570 uint64_t linearOffset;
571 for ( int j = 0; j < numLinearOffsets; ++j ) {
572 // read a linear offset & store
573 elementsRead = fread(&linearOffset, 8, 1, indexStream);
574 if ( isBigEndian ) SwapEndian_64(linearOffset);
575 offsets.push_back(linearOffset);
578 // sort linear offsets
579 sort( offsets.begin(), offsets.end() );
581 // store index data for that reference sequence
582 m_indexData.push_back( ReferenceIndex(binMap, offsets) );
585 // close index file (.bai) and return
590 // merges 'alignment chunks' in BAM bin (used for index building)
591 void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) {
593 // iterate over reference enties
594 BamStandardIndexData::iterator indexIter = m_indexData.begin();
595 BamStandardIndexData::iterator indexEnd = m_indexData.end();
596 for ( ; indexIter != indexEnd; ++indexIter ) {
598 // get BAM bin map for this reference
599 ReferenceIndex& refIndex = (*indexIter);
600 BamBinMap& bamBinMap = refIndex.Bins;
602 // iterate over BAM bins
603 BamBinMap::iterator binIter = bamBinMap.begin();
604 BamBinMap::iterator binEnd = bamBinMap.end();
605 for ( ; binIter != binEnd; ++binIter ) {
607 // get chunk vector for this bin
608 ChunkVector& binChunks = (*binIter).second;
609 if ( binChunks.size() == 0 ) continue;
611 ChunkVector mergedChunks;
612 mergedChunks.push_back( binChunks[0] );
614 // iterate over chunks
616 ChunkVector::iterator chunkIter = binChunks.begin();
617 ChunkVector::iterator chunkEnd = binChunks.end();
618 for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
620 // get 'currentChunk' based on numeric index
621 Chunk& currentChunk = mergedChunks[i];
623 // get iteratorChunk based on vector iterator
624 Chunk& iteratorChunk = (*chunkIter);
626 // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)
627 if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {
629 // set currentChunk.Stop to iteratorChunk.Stop
630 currentChunk.Stop = iteratorChunk.Stop;
635 // set currentChunk + 1 to iteratorChunk
636 mergedChunks.push_back(iteratorChunk);
641 // saved merged chunk vector
642 (*binIter).second = mergedChunks;
647 // writes in-memory index data out to file
648 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
649 bool BamStandardIndex::BamStandardIndexPrivate::Write(const std::string& bamFilename) {
651 // localize parent data
652 if ( m_parent == 0 ) return false;
653 const bool isBigEndian = m_parent->m_isBigEndian;
655 string indexFilename = bamFilename + ".bai";
656 FILE* indexStream = fopen(indexFilename.c_str(), "wb");
657 if ( indexStream == 0 ) {
658 fprintf(stderr, "ERROR: Could not open file to save index.\n");
662 // write BAM index header
663 fwrite("BAI\1", 1, 4, indexStream);
665 // write number of reference sequences
666 int32_t numReferenceSeqs = m_indexData.size();
667 if ( isBigEndian ) SwapEndian_32(numReferenceSeqs);
668 fwrite(&numReferenceSeqs, 4, 1, indexStream);
670 // iterate over reference sequences
671 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
672 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
673 for ( ; indexIter != indexEnd; ++ indexIter ) {
675 // get reference index data
676 const ReferenceIndex& refIndex = (*indexIter);
677 const BamBinMap& binMap = refIndex.Bins;
678 const LinearOffsetVector& offsets = refIndex.Offsets;
680 // write number of bins
681 int32_t binCount = binMap.size();
682 if ( isBigEndian ) SwapEndian_32(binCount);
683 fwrite(&binCount, 4, 1, indexStream);
686 BamBinMap::const_iterator binIter = binMap.begin();
687 BamBinMap::const_iterator binEnd = binMap.end();
688 for ( ; binIter != binEnd; ++binIter ) {
690 // get bin data (key and chunk vector)
691 uint32_t binKey = (*binIter).first;
692 const ChunkVector& binChunks = (*binIter).second;
695 if ( isBigEndian ) SwapEndian_32(binKey);
696 fwrite(&binKey, 4, 1, indexStream);
699 int32_t chunkCount = binChunks.size();
700 if ( isBigEndian ) SwapEndian_32(chunkCount);
701 fwrite(&chunkCount, 4, 1, indexStream);
703 // iterate over chunks
704 ChunkVector::const_iterator chunkIter = binChunks.begin();
705 ChunkVector::const_iterator chunkEnd = binChunks.end();
706 for ( ; chunkIter != chunkEnd; ++chunkIter ) {
708 // get current chunk data
709 const Chunk& chunk = (*chunkIter);
710 uint64_t start = chunk.Start;
711 uint64_t stop = chunk.Stop;
714 SwapEndian_64(start);
718 // save chunk offsets
719 fwrite(&start, 8, 1, indexStream);
720 fwrite(&stop, 8, 1, indexStream);
724 // write linear offsets size
725 int32_t offsetSize = offsets.size();
726 if ( isBigEndian ) SwapEndian_32(offsetSize);
727 fwrite(&offsetSize, 4, 1, indexStream);
729 // iterate over linear offsets
730 LinearOffsetVector::const_iterator offsetIter = offsets.begin();
731 LinearOffsetVector::const_iterator offsetEnd = offsets.end();
732 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
734 // write linear offset value
735 uint64_t linearOffset = (*offsetIter);
736 if ( isBigEndian ) SwapEndian_64(linearOffset);
737 fwrite(&linearOffset, 8, 1, indexStream);
741 // flush buffer, close file, and return success
747 // ---------------------------------------------------------------
748 // BamStandardIndex implementation
750 BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
751 : BamIndex(bgzf, reader, isBigEndian)
753 d = new BamStandardIndexPrivate(this);
756 BamStandardIndex::~BamStandardIndex(void) {
761 // creates index data (in-memory) from current reader data
762 bool BamStandardIndex::Build(void) { return d->Build(); }
764 // attempts to use index to jump to region; returns success/fail
765 bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
767 // loads existing data from file into memory
768 bool BamStandardIndex::Load(const string& filename) { return d->Load(filename); }
770 // writes in-memory index data out to file
771 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
772 bool BamStandardIndex::Write(const string& bamFilename) { return d->Write(bamFilename); }
774 // #########################################################################################
775 // #########################################################################################
777 // ---------------------------------------------------
778 // BamToolsIndex structs & typedefs
782 // individual index entry
783 struct BamToolsIndexEntry {
786 int32_t MaxEndPosition;
788 int32_t StartPosition;
791 BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
792 const int64_t& startOffset = 0,
793 const int32_t& startPosition = 0)
794 : MaxEndPosition(maxEndPosition)
795 , StartOffset(startOffset)
796 , StartPosition(startPosition)
800 // the actual index data structure
801 typedef map<int, vector<BamToolsIndexEntry> > BamToolsIndexData;
803 } // namespace BamTools
805 // ---------------------------------------------------
806 // BamToolsIndexPrivate implementation
808 struct BamToolsIndex::BamToolsIndexPrivate {
810 // keep a list of any supported versions here
811 // (might be useful later to handle any 'legacy' versions if the format changes)
812 // listed for example like: BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
814 // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
816 // if ( indexVersion >= BTI_1_2 )
820 enum Version { BTI_1_0 = 1
824 // -------------------------
826 BamToolsIndexData m_indexData;
827 BamToolsIndex* m_parent;
831 // -------------------------
834 BamToolsIndexPrivate(BamToolsIndex* parent)
837 , m_version(BTI_1_1) // latest version - used for writing new index files
840 ~BamToolsIndexPrivate(void) { }
842 // -------------------------
845 // creates index data (in-memory) from current reader data
847 // returns supported file extension
848 const std::string Extension(void) const { return std::string(".bti"); }
849 // attempts to use index to jump to region; returns success/fail
850 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
851 // loads existing data from file into memory
852 bool Load(const std::string& filename);
853 // writes in-memory index data out to file
854 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
855 bool Write(const std::string& bamFilename);
857 // -------------------------
860 // calculates offset for a given region
861 bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
864 bool BamToolsIndex::BamToolsIndexPrivate::Build(void) {
866 // localize parent data
867 if ( m_parent == 0 ) return false;
868 BamReader* reader = m_parent->m_reader;
869 BgzfData* mBGZF = m_parent->m_BGZF;
870 RefVector& references = m_parent->m_references;
872 // be sure reader & BGZF file are valid & open for reading
873 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
876 // move file pointer to beginning of alignments
878 if ( !reader->Rewind() )
881 // set up counters and markers
882 int32_t currentBlockCount = 0;
883 int64_t currentAlignmentOffset = mBGZF->Tell();
884 int32_t blockRefId = 0;
885 int32_t blockMaxEndPosition = 0;
886 int64_t blockStartOffset = currentAlignmentOffset;
887 int32_t blockStartPosition = -1;
889 // plow through alignments, storing index entries
891 while ( reader->GetNextAlignmentCore(al) ) {
893 // if block contains data (not the first time through) AND alignment is on a new reference
894 if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
896 // make sure reference flag is set
897 references[blockRefId].RefHasAlignments = true;
899 // store previous data
900 m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
902 // intialize new block
903 currentBlockCount = 0;
904 blockMaxEndPosition = al.GetEndPosition();
905 blockStartOffset = currentAlignmentOffset;
908 // if beginning of block, save first alignment's refID & position
909 if ( currentBlockCount == 0 ) {
910 blockRefId = al.RefID;
911 blockStartPosition = al.Position;
914 // increment block counter
917 // check end position
918 int32_t alignmentEndPosition = al.GetEndPosition();
919 if ( alignmentEndPosition > blockMaxEndPosition )
920 blockMaxEndPosition = alignmentEndPosition;
922 // if block is full, get offset for next block, reset currentBlockCount
923 if ( currentBlockCount == m_blockSize ) {
925 // make sure reference flag is set
926 references[blockRefId].RefHasAlignments = true;
928 m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
929 blockStartOffset = mBGZF->Tell();
930 currentBlockCount = 0;
933 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
934 // necessary because we won't know if this next alignment is on a new reference until we actually read it
935 currentAlignmentOffset = mBGZF->Tell();
939 m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
941 // attempt to rewind back to begnning of alignments
942 // return success/fail
943 return reader->Rewind();
946 // N.B. - ignores isRightBoundSpecified
947 bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
949 // return false if leftBound refID is not found in index data
950 if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) return false;
952 const vector<BamToolsIndexEntry> referenceOffsets = m_indexData[region.LeftRefID];
953 if ( referenceOffsets.empty() ) return false;
955 // -------------------------------------------------------
956 // calculate nearest index to jump to
959 offset = (*referenceOffsets.begin()).StartOffset;
961 // iterate over offsets entries on this reference
962 vector<BamToolsIndexEntry>::const_iterator indexIter = referenceOffsets.begin();
963 vector<BamToolsIndexEntry>::const_iterator indexEnd = referenceOffsets.end();
964 for ( ; indexIter != indexEnd; ++indexIter ) {
965 const BamToolsIndexEntry& entry = (*indexIter);
966 // break if alignment 'entry' overlaps region
967 if ( entry.MaxEndPosition >= region.LeftPosition ) break;
968 offset = (*indexIter).StartOffset;
971 // set flag based on whether an index entry was found for this region
972 *hasAlignmentsInRegion = ( indexIter != indexEnd );
976 bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
979 *hasAlignmentsInRegion = false;
981 // localize parent data
982 if ( m_parent == 0 ) return false;
983 BamReader* reader = m_parent->m_reader;
984 BgzfData* mBGZF = m_parent->m_BGZF;
985 RefVector& references = m_parent->m_references;
987 // check valid BamReader state
988 if ( reader == 0 || mBGZF == 0 || !reader->IsOpen() ) {
989 fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
993 // see if left-bound reference of region has alignments
994 if ( !m_parent->HasAlignments(region.LeftRefID) ) return false;
996 // make sure left-bound position is valid
997 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false;
999 // calculate nearest offset to jump to
1001 if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
1002 fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
1006 // attempt seek in file, return success/fail
1007 return mBGZF->Seek(offset);
1010 bool BamToolsIndex::BamToolsIndexPrivate::Load(const string& filename) {
1012 // localize parent data
1013 if ( m_parent == 0 ) return false;
1014 const bool isBigEndian = m_parent->m_isBigEndian;
1015 RefVector& references = m_parent->m_references;
1017 // open index file, abort on error
1018 FILE* indexStream = fopen(filename.c_str(), "rb");
1019 if( !indexStream ) {
1020 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
1024 // set placeholder to receive input byte count (suppresses compiler warnings)
1025 size_t elementsRead = 0;
1027 // see if index is valid BAM index
1029 elementsRead = fread(magic, 1, 4, indexStream);
1030 if ( strncmp(magic, "BTI\1", 4) ) {
1031 fprintf(stderr, "Problem with index file - invalid format.\n");
1032 fclose(indexStream);
1037 int32_t indexVersion;
1038 elementsRead = fread(&indexVersion, sizeof(indexVersion), 1, indexStream);
1039 if ( isBigEndian ) SwapEndian_32(indexVersion);
1040 if ( indexVersion <= 0 || indexVersion > m_version ) {
1041 fprintf(stderr, "Problem with index file - unsupported version.\n");
1042 fclose(indexStream);
1046 if ( (Version)indexVersion < BTI_1_1 ) {
1047 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n");
1048 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
1049 fclose(indexStream);
1053 // read in block size
1054 elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, indexStream);
1055 if ( isBigEndian ) SwapEndian_32(m_blockSize);
1057 // read in number of references
1058 int32_t numReferences;
1059 elementsRead = fread(&numReferences, sizeof(numReferences), 1, indexStream);
1060 if ( isBigEndian ) SwapEndian_32(numReferences);
1062 // iterate over reference entries
1063 for ( int i = 0; i < numReferences; ++i ) {
1065 // read in number of offsets for this reference
1066 uint32_t numOffsets;
1067 elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream);
1068 if ( isBigEndian ) SwapEndian_32(numOffsets);
1070 // initialize offsets container for this reference
1071 vector<BamToolsIndexEntry> offsets;
1072 offsets.reserve(numOffsets);
1074 // iterate over offset entries
1075 for ( unsigned int j = 0; j < numOffsets; ++j ) {
1078 int32_t maxEndPosition;
1079 int64_t startOffset;
1080 int32_t startPosition;
1083 elementsRead = fread(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream);
1084 elementsRead = fread(&startOffset, sizeof(startOffset), 1, indexStream);
1085 elementsRead = fread(&startPosition, sizeof(startPosition), 1, indexStream);
1087 // swap endian-ness if necessary
1088 if ( isBigEndian ) {
1089 SwapEndian_32(maxEndPosition);
1090 SwapEndian_64(startOffset);
1091 SwapEndian_32(startPosition);
1094 // save current index entry
1095 offsets.push_back( BamToolsIndexEntry(maxEndPosition, startOffset, startPosition) );
1098 // save refID, offsetVector entry into data structure
1099 m_indexData.insert( make_pair(i, offsets) );
1101 // set ref.HasAlignments flag
1102 references[i].RefHasAlignments = (numOffsets != 0);
1105 // close index file and return
1106 fclose(indexStream);
1110 // writes in-memory index data out to file
1111 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
1112 bool BamToolsIndex::BamToolsIndexPrivate::Write(const std::string& bamFilename) {
1114 // localize parent data
1115 if ( m_parent == 0 ) return false;
1116 const bool isBigEndian = m_parent->m_isBigEndian;
1118 // open index file for writing
1119 string indexFilename = bamFilename + ".bti";
1120 FILE* indexStream = fopen(indexFilename.c_str(), "wb");
1121 if ( indexStream == 0 ) {
1122 fprintf(stderr, "ERROR: Could not open file to save index.\n");
1126 // write BTI index format 'magic number'
1127 fwrite("BTI\1", 1, 4, indexStream);
1129 // write BTI index format version
1130 int32_t currentVersion = (int32_t)m_version;
1131 if ( isBigEndian ) SwapEndian_32(currentVersion);
1132 fwrite(¤tVersion, sizeof(int32_t), 1, indexStream);
1135 int32_t blockSize = m_blockSize;
1136 if ( isBigEndian ) SwapEndian_32(blockSize);
1137 fwrite(&blockSize, sizeof(blockSize), 1, indexStream);
1139 // write number of references
1140 int32_t numReferences = (int32_t)m_indexData.size();
1141 if ( isBigEndian ) SwapEndian_32(numReferences);
1142 fwrite(&numReferences, sizeof(numReferences), 1, indexStream);
1144 // iterate through references in index
1145 BamToolsIndexData::const_iterator refIter = m_indexData.begin();
1146 BamToolsIndexData::const_iterator refEnd = m_indexData.end();
1147 for ( ; refIter != refEnd; ++refIter ) {
1149 const vector<BamToolsIndexEntry> offsets = (*refIter).second;
1151 // write number of offsets listed for this reference
1152 uint32_t numOffsets = offsets.size();
1153 if ( isBigEndian ) SwapEndian_32(numOffsets);
1154 fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream);
1156 // iterate over offset entries
1157 vector<BamToolsIndexEntry>::const_iterator offsetIter = offsets.begin();
1158 vector<BamToolsIndexEntry>::const_iterator offsetEnd = offsets.end();
1159 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
1161 // get reference index data
1162 const BamToolsIndexEntry& entry = (*offsetIter);
1165 int32_t maxEndPosition = entry.MaxEndPosition;
1166 int64_t startOffset = entry.StartOffset;
1167 int32_t startPosition = entry.StartPosition;
1169 // swap endian-ness if necessary
1170 if ( isBigEndian ) {
1171 SwapEndian_32(maxEndPosition);
1172 SwapEndian_64(startOffset);
1173 SwapEndian_32(startPosition);
1176 // write the reference index entry
1177 fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream);
1178 fwrite(&startOffset, sizeof(startOffset), 1, indexStream);
1179 fwrite(&startPosition, sizeof(startPosition), 1, indexStream);
1183 // flush any remaining output, close file, and return success
1184 fflush(indexStream);
1185 fclose(indexStream);
1189 // ---------------------------------------------------
1190 // BamToolsIndex implementation
1192 BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
1193 : BamIndex(bgzf, reader, isBigEndian)
1195 d = new BamToolsIndexPrivate(this);
1198 BamToolsIndex::~BamToolsIndex(void) {
1203 // creates index data (in-memory) from current reader data
1204 bool BamToolsIndex::Build(void) { return d->Build(); }
1206 // attempts to use index to jump to region; returns success/fail
1207 // a "successful" jump indicates no error, but not whether this region has data
1208 // * thus, the method sets a flag to indicate whether there are alignments
1209 // available after the jump position
1210 bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
1212 // loads existing data from file into memory
1213 bool BamToolsIndex::Load(const string& filename) { return d->Load(filename); }
1215 // writes in-memory index data out to file
1216 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
1217 bool BamToolsIndex::Write(const string& bamFilename) { return d->Write(bamFilename); }