1 // ***************************************************************************
2 // BamIndex.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 10 September 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index functionality - both for the default (standardized) BAM
9 // index format (.bai) as well as a BamTools-specific (nonstandard) index
11 // ***************************************************************************
19 #include "BamReader.h"
22 using namespace BamTools;
24 // -------------------------------
25 // BamIndex implementation
27 BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian)
30 , m_isBigEndian(isBigEndian)
32 if ( m_reader && m_reader->IsOpen() )
33 m_references = m_reader->GetReferenceData();
36 bool BamIndex::HasAlignments(const int& referenceID) {
38 // return false if invalid ID
39 if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) )
42 // else return status of reference (has alignments?)
44 return m_references.at(referenceID).RefHasAlignments;
47 // #########################################################################################
48 // #########################################################################################
50 // -------------------------------
51 // BamStandardIndex structs & typedefs
55 // --------------------------------------------------
56 // BamStandardIndex data structures & typedefs
64 Chunk(const uint64_t& start = 0,
65 const uint64_t& stop = 0)
71 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
72 return lhs.Start < rhs.Start;
75 typedef vector<Chunk> ChunkVector;
76 typedef map<uint32_t, ChunkVector> BamBinMap;
77 typedef vector<uint64_t> LinearOffsetVector;
79 struct ReferenceIndex {
83 LinearOffsetVector Offsets;
86 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
87 const LinearOffsetVector& offsets = LinearOffsetVector())
93 typedef vector<ReferenceIndex> BamStandardIndexData;
95 } // namespace BamTools
97 // -------------------------------
98 // BamStandardIndexPrivate implementation
100 struct BamStandardIndex::BamStandardIndexPrivate {
102 // -------------------------
105 BamStandardIndexData m_indexData;
106 BamStandardIndex* m_parent;
108 // -------------------------
111 BamStandardIndexPrivate(BamStandardIndex* parent) : m_parent(parent) { }
112 ~BamStandardIndexPrivate(void) { m_indexData.clear(); }
114 // -------------------------
117 // creates index data (in-memory) from current reader data
119 // attempts to use index to jump to region; returns success/fail
120 bool Jump(const BamTools::BamRegion& region);
121 // loads existing data from file into memory
122 bool Load(const std::string& filename);
123 // writes in-memory index data out to file
124 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
125 bool Write(const std::string& bamFilename);
127 // -------------------------
130 // calculate bins that overlap region
131 int BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]);
132 // calculates offset(s) for a given region
133 bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets);
134 // saves BAM bin entry for index
135 void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);
136 // saves linear offset entry for index
137 void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);
138 // simplifies index by merging 'chunks'
139 void MergeChunks(void);
142 // calculate bins that overlap region
143 int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) {
145 // get region boundaries
146 uint32_t begin = (unsigned int)region.LeftPosition;
149 // if right bound specified AND left&right bounds are on same reference
150 // OK to use right bound position
151 if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
152 end = (unsigned int)region.RightPosition;
154 // otherwise, use end of left bound reference as cutoff
156 end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
158 // initialize list, bin '0' always a valid bin
162 // get rest of bins that contain this region
164 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; }
165 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; }
166 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; }
167 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; }
168 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
170 // return number of bins stored
174 bool BamStandardIndex::BamStandardIndexPrivate::Build(void) {
176 // localize parent data
177 if ( m_parent == 0 ) return false;
178 BamReader* reader = m_parent->m_reader;
179 BgzfData* mBGZF = m_parent->m_BGZF;
180 RefVector& references = m_parent->m_references;
182 // be sure reader & BGZF file are valid & open for reading
183 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
186 // move file pointer to beginning of alignments
189 // get reference count, reserve index space
190 int numReferences = (int)references.size();
191 for ( int i = 0; i < numReferences; ++i )
192 m_indexData.push_back(ReferenceIndex());
194 // sets default constant for bin, ID, offset, coordinate variables
195 const uint32_t defaultValue = 0xffffffffu;
198 uint32_t saveBin(defaultValue);
199 uint32_t lastBin(defaultValue);
202 int32_t saveRefID(defaultValue);
203 int32_t lastRefID(defaultValue);
206 uint64_t saveOffset = mBGZF->Tell();
207 uint64_t lastOffset = saveOffset;
210 int32_t lastCoordinate = defaultValue;
212 BamAlignment bAlignment;
213 while ( reader->GetNextAlignmentCore(bAlignment) ) {
215 // change of chromosome, save ID, reset bin
216 if ( lastRefID != bAlignment.RefID ) {
217 lastRefID = bAlignment.RefID;
218 lastBin = defaultValue;
221 // if lastCoordinate greater than BAM position - file not sorted properly
222 else if ( lastCoordinate > bAlignment.Position ) {
223 fprintf(stderr, "BAM file not properly sorted:\n");
224 fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);
228 // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
229 if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
231 // save linear offset entry (matched to BAM entry refID)
232 ReferenceIndex& refIndex = m_indexData.at(bAlignment.RefID);
233 LinearOffsetVector& offsets = refIndex.Offsets;
234 InsertLinearOffset(offsets, bAlignment, lastOffset);
237 // if current BamAlignment bin != lastBin, "then possibly write the binning index"
238 if ( bAlignment.Bin != lastBin ) {
240 // if not first time through
241 if ( saveBin != defaultValue ) {
243 // save Bam bin entry
244 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
245 BamBinMap& binMap = refIndex.Bins;
246 InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
250 saveOffset = lastOffset;
253 saveBin = bAlignment.Bin;
254 lastBin = bAlignment.Bin;
257 saveRefID = bAlignment.RefID;
259 // if invalid RefID, break out
260 if ( saveRefID < 0 ) break;
263 // make sure that current file pointer is beyond lastOffset
264 if ( mBGZF->Tell() <= (int64_t)lastOffset ) {
265 fprintf(stderr, "Error in BGZF offsets.\n");
270 lastOffset = mBGZF->Tell();
272 // update lastCoordinate
273 lastCoordinate = bAlignment.Position;
276 // save any leftover BAM data (as long as refID is valid)
277 if ( saveRefID >= 0 ) {
278 // save Bam bin entry
279 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
280 BamBinMap& binMap = refIndex.Bins;
281 InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
284 // simplify index by merging chunks
287 // iterate through references in index
288 // store whether reference has data &
289 // sort offsets in linear offset vector
290 BamStandardIndexData::iterator indexIter = m_indexData.begin();
291 BamStandardIndexData::iterator indexEnd = m_indexData.end();
292 for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
294 // get reference index data
295 ReferenceIndex& refIndex = (*indexIter);
296 BamBinMap& binMap = refIndex.Bins;
297 LinearOffsetVector& offsets = refIndex.Offsets;
299 // store whether reference has alignments or no
300 references[i].RefHasAlignments = ( binMap.size() > 0 );
302 // sort linear offsets
303 sort(offsets.begin(), offsets.end());
306 // rewind file pointer to beginning of alignments, return success/fail
307 return reader->Rewind();
310 bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
312 // calculate which bins overlap this region
313 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
314 int numBins = BinsFromRegion(region, isRightBoundSpecified, bins);
316 // get bins for this reference
317 const ReferenceIndex& refIndex = m_indexData.at(region.LeftRefID);
318 const BamBinMap& binMap = refIndex.Bins;
320 // get minimum offset to consider
321 const LinearOffsetVector& linearOffsets = refIndex.Offsets;
322 uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
324 // store all alignment 'chunk' starts (file offsets) for bins in this region
325 for ( int i = 0; i < numBins; ++i ) {
327 const uint16_t binKey = bins[i];
328 map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
329 if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
331 // iterate over chunks
332 const ChunkVector& chunks = (*binIter).second;
333 std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
334 std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
335 for ( ; chunksIter != chunksEnd; ++chunksIter) {
337 // if valid chunk found, store its file offset
338 const Chunk& chunk = (*chunksIter);
339 if ( chunk.Stop > minOffset )
340 offsets.push_back( chunk.Start );
348 // sort the offsets before returning
349 sort(offsets.begin(), offsets.end());
351 // return whether any offsets were found
352 return ( offsets.size() != 0 );
355 // saves BAM bin entry for index
356 void BamStandardIndex::BamStandardIndexPrivate::InsertBinEntry(BamBinMap& binMap,
357 const uint32_t& saveBin,
358 const uint64_t& saveOffset,
359 const uint64_t& lastOffset)
362 BamBinMap::iterator binIter = binMap.find(saveBin);
365 Chunk newChunk(saveOffset, lastOffset);
367 // if entry doesn't exist
368 if ( binIter == binMap.end() ) {
369 ChunkVector newChunks;
370 newChunks.push_back(newChunk);
371 binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
376 ChunkVector& binChunks = (*binIter).second;
377 binChunks.push_back( newChunk );
381 // saves linear offset entry for index
382 void BamStandardIndex::BamStandardIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets,
383 const BamAlignment& bAlignment,
384 const uint64_t& lastOffset)
386 // get converted offsets
387 int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
388 int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
390 // resize vector if necessary
391 int oldSize = offsets.size();
392 int newSize = endOffset + 1;
393 if ( oldSize < newSize )
394 offsets.resize(newSize, 0);
397 for( int i = beginOffset + 1; i <= endOffset; ++i ) {
398 if ( offsets[i] == 0 )
399 offsets[i] = lastOffset;
403 bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region) {
405 // localize parent data
406 if ( m_parent == 0 ) return false;
407 BamReader* reader = m_parent->m_reader;
408 BgzfData* mBGZF = m_parent->m_BGZF;
409 RefVector& references = m_parent->m_references;
411 // be sure reader & BGZF file are valid & open for reading
412 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
415 // see if left-bound reference of region has alignments
416 if ( !m_parent->HasAlignments(region.LeftRefID) ) return false;
418 // make sure left-bound position is valid
419 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false;
421 vector<int64_t> offsets;
422 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) {
423 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
427 // iterate through offsets
428 BamAlignment bAlignment;
430 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
432 // attempt seek & load first available alignment
433 result &= mBGZF->Seek(*o);
434 reader->GetNextAlignmentCore(bAlignment);
436 // if this alignment corresponds to desired position
437 // return success of seeking back to 'current offset'
438 if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) {
439 if ( o != offsets.begin() ) --o;
440 return mBGZF->Seek(*o);
444 if ( !result ) fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
448 bool BamStandardIndex::BamStandardIndexPrivate::Load(const string& filename) {
450 // localize parent data
451 if ( m_parent == 0 ) return false;
452 const bool isBigEndian = m_parent->m_isBigEndian;
453 RefVector& references = m_parent->m_references;
455 // open index file, abort on error
456 FILE* indexStream = fopen(filename.c_str(), "rb");
458 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
462 // set placeholder to receive input byte count (suppresses compiler warnings)
463 size_t elementsRead = 0;
465 // see if index is valid BAM index
467 elementsRead = fread(magic, 1, 4, indexStream);
468 if ( strncmp(magic, "BAI\1", 4) ) {
469 fprintf(stderr, "Problem with index file - invalid format.\n");
474 // get number of reference sequences
476 elementsRead = fread(&numRefSeqs, 4, 1, indexStream);
477 if ( isBigEndian ) SwapEndian_32(numRefSeqs);
479 // intialize space for BamStandardIndexData data structure
480 m_indexData.reserve(numRefSeqs);
482 // iterate over reference sequences
483 for ( unsigned int i = 0; i < numRefSeqs; ++i ) {
485 // get number of bins for this reference sequence
487 elementsRead = fread(&numBins, 4, 1, indexStream);
488 if ( isBigEndian ) SwapEndian_32(numBins);
491 RefData& refEntry = references[i];
492 refEntry.RefHasAlignments = true;
495 // intialize BinVector
498 // iterate over bins for that reference sequence
499 for ( int j = 0; j < numBins; ++j ) {
503 elementsRead = fread(&binID, 4, 1, indexStream);
505 // get number of regionChunks in this bin
507 elementsRead = fread(&numChunks, 4, 1, indexStream);
510 SwapEndian_32(binID);
511 SwapEndian_32(numChunks);
514 // intialize ChunkVector
515 ChunkVector regionChunks;
516 regionChunks.reserve(numChunks);
518 // iterate over regionChunks in this bin
519 for ( unsigned int k = 0; k < numChunks; ++k ) {
521 // get chunk boundaries (left, right)
524 elementsRead = fread(&left, 8, 1, indexStream);
525 elementsRead = fread(&right, 8, 1, indexStream);
529 SwapEndian_64(right);
533 regionChunks.push_back( Chunk(left, right) );
536 // sort chunks for this bin
537 sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );
539 // save binID, chunkVector for this bin
540 binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );
543 // -----------------------------------------------------
544 // load linear index for this reference sequence
546 // get number of linear offsets
547 int32_t numLinearOffsets;
548 elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);
549 if ( isBigEndian ) SwapEndian_32(numLinearOffsets);
551 // intialize LinearOffsetVector
552 LinearOffsetVector offsets;
553 offsets.reserve(numLinearOffsets);
555 // iterate over linear offsets for this reference sequeence
556 uint64_t linearOffset;
557 for ( int j = 0; j < numLinearOffsets; ++j ) {
558 // read a linear offset & store
559 elementsRead = fread(&linearOffset, 8, 1, indexStream);
560 if ( isBigEndian ) SwapEndian_64(linearOffset);
561 offsets.push_back(linearOffset);
564 // sort linear offsets
565 sort( offsets.begin(), offsets.end() );
567 // store index data for that reference sequence
568 m_indexData.push_back( ReferenceIndex(binMap, offsets) );
571 // close index file (.bai) and return
576 // merges 'alignment chunks' in BAM bin (used for index building)
577 void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) {
579 // iterate over reference enties
580 BamStandardIndexData::iterator indexIter = m_indexData.begin();
581 BamStandardIndexData::iterator indexEnd = m_indexData.end();
582 for ( ; indexIter != indexEnd; ++indexIter ) {
584 // get BAM bin map for this reference
585 ReferenceIndex& refIndex = (*indexIter);
586 BamBinMap& bamBinMap = refIndex.Bins;
588 // iterate over BAM bins
589 BamBinMap::iterator binIter = bamBinMap.begin();
590 BamBinMap::iterator binEnd = bamBinMap.end();
591 for ( ; binIter != binEnd; ++binIter ) {
593 // get chunk vector for this bin
594 ChunkVector& binChunks = (*binIter).second;
595 if ( binChunks.size() == 0 ) continue;
597 ChunkVector mergedChunks;
598 mergedChunks.push_back( binChunks[0] );
600 // iterate over chunks
602 ChunkVector::iterator chunkIter = binChunks.begin();
603 ChunkVector::iterator chunkEnd = binChunks.end();
604 for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
606 // get 'currentChunk' based on numeric index
607 Chunk& currentChunk = mergedChunks[i];
609 // get iteratorChunk based on vector iterator
610 Chunk& iteratorChunk = (*chunkIter);
612 // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)
613 if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {
615 // set currentChunk.Stop to iteratorChunk.Stop
616 currentChunk.Stop = iteratorChunk.Stop;
621 // set currentChunk + 1 to iteratorChunk
622 mergedChunks.push_back(iteratorChunk);
627 // saved merged chunk vector
628 (*binIter).second = mergedChunks;
633 // writes in-memory index data out to file
634 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
635 bool BamStandardIndex::BamStandardIndexPrivate::Write(const std::string& bamFilename) {
637 // localize parent data
638 if ( m_parent == 0 ) return false;
639 const bool isBigEndian = m_parent->m_isBigEndian;
641 string indexFilename = bamFilename + ".bai";
642 FILE* indexStream = fopen(indexFilename.c_str(), "wb");
643 if ( indexStream == 0 ) {
644 fprintf(stderr, "ERROR: Could not open file to save index.\n");
648 // write BAM index header
649 fwrite("BAI\1", 1, 4, indexStream);
651 // write number of reference sequences
652 int32_t numReferenceSeqs = m_indexData.size();
653 if ( isBigEndian ) SwapEndian_32(numReferenceSeqs);
654 fwrite(&numReferenceSeqs, 4, 1, indexStream);
656 // iterate over reference sequences
657 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
658 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
659 for ( ; indexIter != indexEnd; ++ indexIter ) {
661 // get reference index data
662 const ReferenceIndex& refIndex = (*indexIter);
663 const BamBinMap& binMap = refIndex.Bins;
664 const LinearOffsetVector& offsets = refIndex.Offsets;
666 // write number of bins
667 int32_t binCount = binMap.size();
668 if ( isBigEndian ) SwapEndian_32(binCount);
669 fwrite(&binCount, 4, 1, indexStream);
672 BamBinMap::const_iterator binIter = binMap.begin();
673 BamBinMap::const_iterator binEnd = binMap.end();
674 for ( ; binIter != binEnd; ++binIter ) {
676 // get bin data (key and chunk vector)
677 uint32_t binKey = (*binIter).first;
678 const ChunkVector& binChunks = (*binIter).second;
681 if ( isBigEndian ) SwapEndian_32(binKey);
682 fwrite(&binKey, 4, 1, indexStream);
685 int32_t chunkCount = binChunks.size();
686 if ( isBigEndian ) SwapEndian_32(chunkCount);
687 fwrite(&chunkCount, 4, 1, indexStream);
689 // iterate over chunks
690 ChunkVector::const_iterator chunkIter = binChunks.begin();
691 ChunkVector::const_iterator chunkEnd = binChunks.end();
692 for ( ; chunkIter != chunkEnd; ++chunkIter ) {
694 // get current chunk data
695 const Chunk& chunk = (*chunkIter);
696 uint64_t start = chunk.Start;
697 uint64_t stop = chunk.Stop;
700 SwapEndian_64(start);
704 // save chunk offsets
705 fwrite(&start, 8, 1, indexStream);
706 fwrite(&stop, 8, 1, indexStream);
710 // write linear offsets size
711 int32_t offsetSize = offsets.size();
712 if ( isBigEndian ) SwapEndian_32(offsetSize);
713 fwrite(&offsetSize, 4, 1, indexStream);
715 // iterate over linear offsets
716 LinearOffsetVector::const_iterator offsetIter = offsets.begin();
717 LinearOffsetVector::const_iterator offsetEnd = offsets.end();
718 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
720 // write linear offset value
721 uint64_t linearOffset = (*offsetIter);
722 if ( isBigEndian ) SwapEndian_64(linearOffset);
723 fwrite(&linearOffset, 8, 1, indexStream);
727 // flush buffer, close file, and return success
733 // ---------------------------------------------------------------
734 // BamStandardIndex implementation
736 BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
737 : BamIndex(bgzf, reader, isBigEndian)
739 d = new BamStandardIndexPrivate(this);
742 BamStandardIndex::~BamStandardIndex(void) {
747 // creates index data (in-memory) from current reader data
748 bool BamStandardIndex::Build(void) { return d->Build(); }
750 // attempts to use index to jump to region; returns success/fail
751 bool BamStandardIndex::Jump(const BamRegion& region) { return d->Jump(region); }
753 // loads existing data from file into memory
754 bool BamStandardIndex::Load(const string& filename) { return d->Load(filename); }
756 // writes in-memory index data out to file
757 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
758 bool BamStandardIndex::Write(const string& bamFilename) { return d->Write(bamFilename); }
760 // #########################################################################################
761 // #########################################################################################
763 // ---------------------------------------------------
764 // BamToolsIndex structs & typedefs
768 // individual index entry
769 struct BamToolsIndexEntry {
772 int32_t MaxEndPosition;
774 int32_t StartPosition;
777 BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
778 const int64_t& startOffset = 0,
779 const int32_t& startPosition = 0)
780 : MaxEndPosition(maxEndPosition)
781 , StartOffset(startOffset)
782 , StartPosition(startPosition)
786 // the actual index data structure
787 typedef map<int, vector<BamToolsIndexEntry> > BamToolsIndexData;
789 } // namespace BamTools
791 // ---------------------------------------------------
792 // BamToolsIndexPrivate implementation
794 struct BamToolsIndex::BamToolsIndexPrivate {
796 // keep a list of any supported versions here
797 // (might be useful later to handle any 'legacy' versions if the format changes)
798 // listed for example like: BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
800 // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
802 // if ( indexVersion >= BTI_1_2 )
806 enum Version { BTI_1_0 = 1 };
808 // -------------------------
810 BamToolsIndexData m_indexData;
811 BamToolsIndex* m_parent;
815 // -------------------------
818 BamToolsIndexPrivate(BamToolsIndex* parent)
821 , m_version(BTI_1_0) // latest version - used for writing new index files
824 ~BamToolsIndexPrivate(void) { }
826 // -------------------------
829 // creates index data (in-memory) from current reader data
831 // returns supported file extension
832 const std::string Extension(void) const { return std::string(".bti"); }
833 // attempts to use index to jump to region; returns success/fail
834 bool Jump(const BamTools::BamRegion& region);
835 // loads existing data from file into memory
836 bool Load(const std::string& filename);
837 // writes in-memory index data out to file
838 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
839 bool Write(const std::string& bamFilename);
841 // -------------------------
844 // calculates offset(s) for a given region
845 bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets);
848 bool BamToolsIndex::BamToolsIndexPrivate::Build(void) {
850 // localize parent data
851 if ( m_parent == 0 ) return false;
852 BamReader* reader = m_parent->m_reader;
853 BgzfData* mBGZF = m_parent->m_BGZF;
854 RefVector& references = m_parent->m_references;
856 // be sure reader & BGZF file are valid & open for reading
857 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
860 // move file pointer to beginning of alignments
862 if ( !reader->Rewind() )
865 // set up counters and markers
866 int32_t currentBlockCount = 0;
867 int64_t currentAlignmentOffset = mBGZF->Tell();
868 int32_t blockRefId = 0;
869 int32_t blockMaxEndPosition = 0;
870 int64_t blockStartOffset = currentAlignmentOffset;
871 int32_t blockStartPosition = -1;
873 // plow through alignments, storing index entries
875 while ( reader->GetNextAlignmentCore(al) ) {
877 // if block contains data (not the first time through) AND alignment is on a new reference
878 if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
880 // make sure reference flag is set
881 references[blockRefId].RefHasAlignments = true;
883 // store previous data
884 m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
886 // intialize new block
887 currentBlockCount = 0;
888 blockMaxEndPosition = al.GetEndPosition();
889 blockStartOffset = currentAlignmentOffset;
892 // if beginning of block, save first alignment's refID & position
893 if ( currentBlockCount == 0 ) {
894 blockRefId = al.RefID;
895 blockStartPosition = al.Position;
898 // increment block counter
901 // check end position
902 int32_t alignmentEndPosition = al.GetEndPosition();
903 if ( alignmentEndPosition > blockMaxEndPosition )
904 blockMaxEndPosition = alignmentEndPosition;
906 // if block is full, get offset for next block, reset currentBlockCount
907 if ( currentBlockCount == m_blockSize ) {
909 // make sure reference flag is set
910 references[blockRefId].RefHasAlignments = true;
912 m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
913 blockStartOffset = mBGZF->Tell();
914 currentBlockCount = 0;
917 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
918 // necessary because we won't know if this next alignment is on a new reference until we actually read it
919 currentAlignmentOffset = mBGZF->Tell();
922 // attempt to rewind back to begnning of alignments
923 // return success/fail
924 return reader->Rewind();
927 // N.B. - ignores isRightBoundSpecified
928 bool BamToolsIndex::BamToolsIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
930 // return false if leftBound refID is not found in index data
931 if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) return false;
933 // clear any prior data
936 const vector<BamToolsIndexEntry> referenceOffsets = m_indexData[region.LeftRefID];
937 if ( referenceOffsets.empty() ) return false;
939 // -------------------------------------------------------
940 // calculate nearest index to jump to
943 int64_t offset = (*referenceOffsets.begin()).StartOffset;
944 vector<BamToolsIndexEntry>::const_iterator indexIter = referenceOffsets.begin();
945 vector<BamToolsIndexEntry>::const_iterator indexEnd = referenceOffsets.end();
946 for ( ; indexIter != indexEnd; ++indexIter ) {
947 const BamToolsIndexEntry& entry = (*indexIter);
948 if ( entry.MaxEndPosition >= region.LeftPosition ) break;
949 offset = (*indexIter).StartOffset;
953 if ( indexIter == indexEnd ) return false;
955 //store offset & return success
956 offsets.push_back(offset);
960 bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region) {
962 // localize parent data
963 if ( m_parent == 0 ) return false;
964 BamReader* reader = m_parent->m_reader;
965 BgzfData* mBGZF = m_parent->m_BGZF;
966 RefVector& references = m_parent->m_references;
968 if ( reader == 0 || mBGZF == 0 || !reader->IsOpen() ) {
969 fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
973 // see if left-bound reference of region has alignments
974 if ( !m_parent->HasAlignments(region.LeftRefID) ) return false;
976 // make sure left-bound position is valid
977 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false;
979 vector<int64_t> offsets;
980 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) {
981 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
985 // iterate through offsets
986 BamAlignment bAlignment;
988 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
990 // attempt seek & load first available alignment
991 result &= mBGZF->Seek(*o);
992 reader->GetNextAlignmentCore(bAlignment);
994 // if this alignment corresponds to desired position
995 // return success of seeking back to 'current offset'
996 if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) {
997 if ( o != offsets.begin() ) --o;
998 return mBGZF->Seek(*o);
1002 if ( !result ) fprintf(stderr, "ERROR: Could not jump: unable to seek to offset for specified region.\n");
1006 bool BamToolsIndex::BamToolsIndexPrivate::Load(const string& filename) {
1008 // localize parent data
1009 if ( m_parent == 0 ) return false;
1010 const bool isBigEndian = m_parent->m_isBigEndian;
1011 RefVector& references = m_parent->m_references;
1013 // open index file, abort on error
1014 FILE* indexStream = fopen(filename.c_str(), "rb");
1015 if( !indexStream ) {
1016 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
1020 // set placeholder to receive input byte count (suppresses compiler warnings)
1021 size_t elementsRead = 0;
1023 // see if index is valid BAM index
1025 elementsRead = fread(magic, 1, 4, indexStream);
1026 if ( strncmp(magic, "BTI\1", 4) ) {
1027 fprintf(stderr, "Problem with index file - invalid format.\n");
1028 fclose(indexStream);
1033 int32_t indexVersion;
1034 elementsRead = fread(&indexVersion, sizeof(indexVersion), 1, indexStream);
1035 if ( isBigEndian ) SwapEndian_32(indexVersion);
1036 if ( indexVersion <= 0 || indexVersion > m_version ) {
1037 fprintf(stderr, "Problem with index file - unsupported version.\n");
1038 fclose(indexStream);
1042 // read in block size
1043 elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, indexStream);
1044 if ( isBigEndian ) SwapEndian_32(m_blockSize);
1046 // read in number of references
1047 int32_t numReferences;
1048 elementsRead = fread(&numReferences, sizeof(numReferences), 1, indexStream);
1049 if ( isBigEndian ) SwapEndian_32(numReferences);
1051 // iterate over reference entries
1052 for ( int i = 0; i < numReferences; ++i ) {
1054 // read in number of offsets for this reference
1055 uint32_t numOffsets;
1056 elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream);
1057 if ( isBigEndian ) SwapEndian_32(numOffsets);
1059 // initialize offsets container for this reference
1060 vector<BamToolsIndexEntry> offsets;
1061 offsets.reserve(numOffsets);
1063 // iterate over offset entries
1064 for ( unsigned int j = 0; j < numOffsets; ++j ) {
1067 int32_t maxEndPosition;
1068 int64_t startOffset;
1069 int32_t startPosition;
1072 elementsRead = fread(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream);
1073 elementsRead = fread(&startOffset, sizeof(startOffset), 1, indexStream);
1074 elementsRead = fread(&startPosition, sizeof(startPosition), 1, indexStream);
1076 // swap endian-ness if necessary
1077 if ( isBigEndian ) {
1078 SwapEndian_32(maxEndPosition);
1079 SwapEndian_64(startOffset);
1080 SwapEndian_32(startPosition);
1083 // save current index entry
1084 offsets.push_back( BamToolsIndexEntry(maxEndPosition, startOffset, startPosition) );
1087 // save refID, offsetVector entry into data structure
1088 m_indexData.insert( make_pair(i, offsets) );
1090 // set ref.HasAlignments flag
1091 references[i].RefHasAlignments = (numOffsets != 0);
1094 // close index file and return
1095 fclose(indexStream);
1099 // writes in-memory index data out to file
1100 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
1101 bool BamToolsIndex::BamToolsIndexPrivate::Write(const std::string& bamFilename) {
1103 // localize parent data
1104 if ( m_parent == 0 ) return false;
1105 const bool isBigEndian = m_parent->m_isBigEndian;
1107 // open index file for writing
1108 string indexFilename = bamFilename + ".bti";
1109 FILE* indexStream = fopen(indexFilename.c_str(), "wb");
1110 if ( indexStream == 0 ) {
1111 fprintf(stderr, "ERROR: Could not open file to save index.\n");
1115 // write BTI index format 'magic number'
1116 fwrite("BTI\1", 1, 4, indexStream);
1118 // write BTI index format version
1119 int32_t currentVersion = (int32_t)m_version;
1120 if ( isBigEndian ) SwapEndian_32(currentVersion);
1121 fwrite(¤tVersion, sizeof(int32_t), 1, indexStream);
1124 int32_t blockSize = m_blockSize;
1125 if ( isBigEndian ) SwapEndian_32(blockSize);
1126 fwrite(&blockSize, sizeof(blockSize), 1, indexStream);
1128 // write number of references
1129 int32_t numReferences = (int32_t)m_indexData.size();
1130 if ( isBigEndian ) SwapEndian_32(numReferences);
1131 fwrite(&numReferences, sizeof(numReferences), 1, indexStream);
1133 // iterate through references in index
1134 BamToolsIndexData::const_iterator refIter = m_indexData.begin();
1135 BamToolsIndexData::const_iterator refEnd = m_indexData.end();
1136 for ( ; refIter != refEnd; ++refIter ) {
1138 const vector<BamToolsIndexEntry> offsets = (*refIter).second;
1140 // write number of offsets listed for this reference
1141 uint32_t numOffsets = offsets.size();
1142 if ( isBigEndian ) SwapEndian_32(numOffsets);
1143 fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream);
1145 // iterate over offset entries
1146 vector<BamToolsIndexEntry>::const_iterator offsetIter = offsets.begin();
1147 vector<BamToolsIndexEntry>::const_iterator offsetEnd = offsets.end();
1148 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
1150 // get reference index data
1151 const BamToolsIndexEntry& entry = (*offsetIter);
1154 int32_t maxEndPosition = entry.MaxEndPosition;
1155 int64_t startOffset = entry.StartOffset;
1156 int32_t startPosition = entry.StartPosition;
1158 // swap endian-ness if necessary
1159 if ( isBigEndian ) {
1160 SwapEndian_32(maxEndPosition);
1161 SwapEndian_64(startOffset);
1162 SwapEndian_32(startPosition);
1165 // write the reference index entry
1166 fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream);
1167 fwrite(&startOffset, sizeof(startOffset), 1, indexStream);
1168 fwrite(&startPosition, sizeof(startPosition), 1, indexStream);
1172 // flush any remaining output, close file, and return success
1173 fflush(indexStream);
1174 fclose(indexStream);
1178 // ---------------------------------------------------
1179 // BamToolsIndex implementation
1181 BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
1182 : BamIndex(bgzf, reader, isBigEndian)
1184 d = new BamToolsIndexPrivate(this);
1187 BamToolsIndex::~BamToolsIndex(void) {
1192 // creates index data (in-memory) from current reader data
1193 bool BamToolsIndex::Build(void) { return d->Build(); }
1195 // attempts to use index to jump to region; returns success/fail
1196 bool BamToolsIndex::Jump(const BamRegion& region) { return d->Jump(region); }
1198 // loads existing data from file into memory
1199 bool BamToolsIndex::Load(const string& filename) { return d->Load(filename); }
1201 // writes in-memory index data out to file
1202 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
1203 bool BamToolsIndex::Write(const string& bamFilename) { return d->Write(bamFilename); }