1 // ***************************************************************************
2 // BamIndex.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 10 September 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index functionality - both for the default (standardized) BAM
9 // index format (.bai) as well as a BamTools-specific (nonstandard) index
11 // ***************************************************************************
19 #include "BamReader.h"
22 using namespace BamTools;
24 // -------------------------------
25 // BamIndex implementation
27 BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian)
30 , m_isBigEndian(isBigEndian)
32 if ( m_reader && m_reader->IsOpen() )
33 m_references = m_reader->GetReferenceData();
36 bool BamIndex::HasAlignments(const int& referenceID) {
38 // return false if invalid ID
39 if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) )
42 // else return status of reference (has alignments?)
44 return m_references.at(referenceID).RefHasAlignments;
47 // #########################################################################################
48 // #########################################################################################
50 // -------------------------------
51 // BamStandardIndex structs & typedefs
55 // --------------------------------------------------
56 // BamStandardIndex data structures & typedefs
64 Chunk(const uint64_t& start = 0,
65 const uint64_t& stop = 0)
71 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
72 return lhs.Start < rhs.Start;
75 typedef vector<Chunk> ChunkVector;
76 typedef map<uint32_t, ChunkVector> BamBinMap;
77 typedef vector<uint64_t> LinearOffsetVector;
79 struct ReferenceIndex {
83 LinearOffsetVector Offsets;
86 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
87 const LinearOffsetVector& offsets = LinearOffsetVector())
93 typedef vector<ReferenceIndex> BamStandardIndexData;
95 } // namespace BamTools
97 // -------------------------------
98 // BamStandardIndexPrivate implementation
100 struct BamStandardIndex::BamStandardIndexPrivate {
102 // -------------------------
105 BamStandardIndexData m_indexData;
106 BamStandardIndex* m_parent;
108 // -------------------------
111 BamStandardIndexPrivate(BamStandardIndex* parent) : m_parent(parent) { }
112 ~BamStandardIndexPrivate(void) { m_indexData.clear(); }
114 // -------------------------
117 // creates index data (in-memory) from current reader data
119 // attempts to use index to jump to region; returns success/fail
120 bool Jump(const BamTools::BamRegion& region);
121 // loads existing data from file into memory
122 bool Load(const std::string& filename);
123 // writes in-memory index data out to file
124 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
125 bool Write(const std::string& bamFilename);
127 // -------------------------
130 // calculate bins that overlap region
131 int BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]);
132 // calculates offset(s) for a given region
133 bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets);
134 // saves BAM bin entry for index
135 void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);
136 // saves linear offset entry for index
137 void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);
138 // simplifies index by merging 'chunks'
139 void MergeChunks(void);
142 // calculate bins that overlap region
143 int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) {
145 // get region boundaries
146 uint32_t begin = (unsigned int)region.LeftPosition;
149 // if right bound specified AND left&right bounds are on same reference
150 // OK to use right bound position
151 if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
152 end = (unsigned int)region.RightPosition;
154 // otherwise, use end of left bound reference as cutoff
156 end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
158 // initialize list, bin '0' always a valid bin
162 // get rest of bins that contain this region
164 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; }
165 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; }
166 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; }
167 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; }
168 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
170 // return number of bins stored
174 bool BamStandardIndex::BamStandardIndexPrivate::Build(void) {
176 // localize parent data
177 if ( m_parent == 0 ) return false;
178 BamReader* reader = m_parent->m_reader;
179 BgzfData* mBGZF = m_parent->m_BGZF;
180 RefVector& references = m_parent->m_references;
182 // be sure reader & BGZF file are valid & open for reading
183 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
186 // move file pointer to beginning of alignments
189 // get reference count, reserve index space
190 int numReferences = (int)references.size();
191 for ( int i = 0; i < numReferences; ++i )
192 m_indexData.push_back(ReferenceIndex());
194 // sets default constant for bin, ID, offset, coordinate variables
195 const uint32_t defaultValue = 0xffffffffu;
198 uint32_t saveBin(defaultValue);
199 uint32_t lastBin(defaultValue);
202 int32_t saveRefID(defaultValue);
203 int32_t lastRefID(defaultValue);
206 uint64_t saveOffset = mBGZF->Tell();
207 uint64_t lastOffset = saveOffset;
210 int32_t lastCoordinate = defaultValue;
212 BamAlignment bAlignment;
213 while ( reader->GetNextAlignmentCore(bAlignment) ) {
215 // change of chromosome, save ID, reset bin
216 if ( lastRefID != bAlignment.RefID ) {
217 lastRefID = bAlignment.RefID;
218 lastBin = defaultValue;
221 // if lastCoordinate greater than BAM position - file not sorted properly
222 else if ( lastCoordinate > bAlignment.Position ) {
223 fprintf(stderr, "BAM file not properly sorted:\n");
224 fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);
228 // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
229 if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
231 // save linear offset entry (matched to BAM entry refID)
232 ReferenceIndex& refIndex = m_indexData.at(bAlignment.RefID);
233 LinearOffsetVector& offsets = refIndex.Offsets;
234 InsertLinearOffset(offsets, bAlignment, lastOffset);
237 // if current BamAlignment bin != lastBin, "then possibly write the binning index"
238 if ( bAlignment.Bin != lastBin ) {
240 // if not first time through
241 if ( saveBin != defaultValue ) {
243 // save Bam bin entry
244 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
245 BamBinMap& binMap = refIndex.Bins;
246 InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
250 saveOffset = lastOffset;
253 saveBin = bAlignment.Bin;
254 lastBin = bAlignment.Bin;
257 saveRefID = bAlignment.RefID;
259 // if invalid RefID, break out
260 if ( saveRefID < 0 ) break;
263 // make sure that current file pointer is beyond lastOffset
264 if ( mBGZF->Tell() <= (int64_t)lastOffset ) {
265 fprintf(stderr, "Error in BGZF offsets.\n");
270 lastOffset = mBGZF->Tell();
272 // update lastCoordinate
273 lastCoordinate = bAlignment.Position;
276 // save any leftover BAM data (as long as refID is valid)
277 if ( saveRefID >= 0 ) {
278 // save Bam bin entry
279 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
280 BamBinMap& binMap = refIndex.Bins;
281 InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
284 // simplify index by merging chunks
287 // iterate through references in index
288 // store whether reference has data &
289 // sort offsets in linear offset vector
290 BamStandardIndexData::iterator indexIter = m_indexData.begin();
291 BamStandardIndexData::iterator indexEnd = m_indexData.end();
292 for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
294 // get reference index data
295 ReferenceIndex& refIndex = (*indexIter);
296 BamBinMap& binMap = refIndex.Bins;
297 LinearOffsetVector& offsets = refIndex.Offsets;
299 // store whether reference has alignments or no
300 references[i].RefHasAlignments = ( binMap.size() > 0 );
302 // sort linear offsets
303 sort(offsets.begin(), offsets.end());
306 // rewind file pointer to beginning of alignments, return success/fail
307 return reader->Rewind();
310 bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
312 // calculate which bins overlap this region
313 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
314 int numBins = BinsFromRegion(region, isRightBoundSpecified, bins);
316 // get bins for this reference
317 const ReferenceIndex& refIndex = m_indexData.at(region.LeftRefID);
318 const BamBinMap& binMap = refIndex.Bins;
320 // get minimum offset to consider
321 const LinearOffsetVector& linearOffsets = refIndex.Offsets;
322 uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
324 // store all alignment 'chunk' starts (file offsets) for bins in this region
325 for ( int i = 0; i < numBins; ++i ) {
327 const uint16_t binKey = bins[i];
328 map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
329 if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
331 // iterate over chunks
332 const ChunkVector& chunks = (*binIter).second;
333 std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
334 std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
335 for ( ; chunksIter != chunksEnd; ++chunksIter) {
337 // if valid chunk found, store its file offset
338 const Chunk& chunk = (*chunksIter);
339 if ( chunk.Stop > minOffset )
340 offsets.push_back( chunk.Start );
348 // sort the offsets before returning
349 sort(offsets.begin(), offsets.end());
351 // return whether any offsets were found
352 return ( offsets.size() != 0 );
355 // saves BAM bin entry for index
356 void BamStandardIndex::BamStandardIndexPrivate::InsertBinEntry(BamBinMap& binMap,
357 const uint32_t& saveBin,
358 const uint64_t& saveOffset,
359 const uint64_t& lastOffset)
362 BamBinMap::iterator binIter = binMap.find(saveBin);
365 Chunk newChunk(saveOffset, lastOffset);
367 // if entry doesn't exist
368 if ( binIter == binMap.end() ) {
369 ChunkVector newChunks;
370 newChunks.push_back(newChunk);
371 binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
376 ChunkVector& binChunks = (*binIter).second;
377 binChunks.push_back( newChunk );
381 // saves linear offset entry for index
382 void BamStandardIndex::BamStandardIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets,
383 const BamAlignment& bAlignment,
384 const uint64_t& lastOffset)
386 // get converted offsets
387 int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
388 int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
390 // resize vector if necessary
391 int oldSize = offsets.size();
392 int newSize = endOffset + 1;
393 if ( oldSize < newSize )
394 offsets.resize(newSize, 0);
397 for( int i = beginOffset + 1; i <= endOffset; ++i ) {
398 if ( offsets[i] == 0 )
399 offsets[i] = lastOffset;
403 bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region) {
405 // localize parent data
406 if ( m_parent == 0 ) return false;
407 BamReader* reader = m_parent->m_reader;
408 BgzfData* mBGZF = m_parent->m_BGZF;
409 RefVector& references = m_parent->m_references;
411 // be sure reader & BGZF file are valid & open for reading
412 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
415 // see if left-bound reference of region has alignments
416 if ( !m_parent->HasAlignments(region.LeftRefID) ) return false;
418 // make sure left-bound position is valid
419 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false;
421 vector<int64_t> offsets;
422 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) {
423 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
427 // iterate through offsets
428 BamAlignment bAlignment;
430 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
432 // attempt seek & load first available alignment
433 result &= mBGZF->Seek(*o);
434 reader->GetNextAlignmentCore(bAlignment);
436 // if this alignment corresponds to desired position
437 // return success of seeking back to 'current offset'
438 if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) {
439 if ( o != offsets.begin() ) --o;
440 return mBGZF->Seek(*o);
444 if ( !result ) fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
448 bool BamStandardIndex::BamStandardIndexPrivate::Load(const string& filename) {
450 // localize parent data
451 if ( m_parent == 0 ) return false;
452 const bool isBigEndian = m_parent->m_isBigEndian;
453 RefVector& references = m_parent->m_references;
455 // open index file, abort on error
456 FILE* indexStream = fopen(filename.c_str(), "rb");
458 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
462 // set placeholder to receive input byte count (suppresses compiler warnings)
463 size_t elementsRead = 0;
465 // see if index is valid BAM index
467 elementsRead = fread(magic, 1, 4, indexStream);
468 if ( strncmp(magic, "BAI\1", 4) ) {
469 fprintf(stderr, "Problem with index file - invalid format.\n");
474 // get number of reference sequences
476 elementsRead = fread(&numRefSeqs, 4, 1, indexStream);
477 if ( isBigEndian ) SwapEndian_32(numRefSeqs);
479 // intialize space for BamStandardIndexData data structure
480 m_indexData.reserve(numRefSeqs);
482 // iterate over reference sequences
483 for ( unsigned int i = 0; i < numRefSeqs; ++i ) {
485 // get number of bins for this reference sequence
487 elementsRead = fread(&numBins, 4, 1, indexStream);
488 if ( isBigEndian ) SwapEndian_32(numBins);
491 RefData& refEntry = references[i];
492 refEntry.RefHasAlignments = true;
495 // intialize BinVector
498 // iterate over bins for that reference sequence
499 for ( int j = 0; j < numBins; ++j ) {
503 elementsRead = fread(&binID, 4, 1, indexStream);
505 // get number of regionChunks in this bin
507 elementsRead = fread(&numChunks, 4, 1, indexStream);
510 SwapEndian_32(binID);
511 SwapEndian_32(numChunks);
514 // intialize ChunkVector
515 ChunkVector regionChunks;
516 regionChunks.reserve(numChunks);
518 // iterate over regionChunks in this bin
519 for ( unsigned int k = 0; k < numChunks; ++k ) {
521 // get chunk boundaries (left, right)
524 elementsRead = fread(&left, 8, 1, indexStream);
525 elementsRead = fread(&right, 8, 1, indexStream);
529 SwapEndian_64(right);
533 regionChunks.push_back( Chunk(left, right) );
536 // sort chunks for this bin
537 sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );
539 // save binID, chunkVector for this bin
540 binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );
543 // -----------------------------------------------------
544 // load linear index for this reference sequence
546 // get number of linear offsets
547 int32_t numLinearOffsets;
548 elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);
549 if ( isBigEndian ) SwapEndian_32(numLinearOffsets);
551 // intialize LinearOffsetVector
552 LinearOffsetVector offsets;
553 offsets.reserve(numLinearOffsets);
555 // iterate over linear offsets for this reference sequeence
556 uint64_t linearOffset;
557 for ( int j = 0; j < numLinearOffsets; ++j ) {
558 // read a linear offset & store
559 elementsRead = fread(&linearOffset, 8, 1, indexStream);
560 if ( isBigEndian ) SwapEndian_64(linearOffset);
561 offsets.push_back(linearOffset);
564 // sort linear offsets
565 sort( offsets.begin(), offsets.end() );
567 // store index data for that reference sequence
568 m_indexData.push_back( ReferenceIndex(binMap, offsets) );
571 // close index file (.bai) and return
576 // merges 'alignment chunks' in BAM bin (used for index building)
577 void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) {
579 // iterate over reference enties
580 BamStandardIndexData::iterator indexIter = m_indexData.begin();
581 BamStandardIndexData::iterator indexEnd = m_indexData.end();
582 for ( ; indexIter != indexEnd; ++indexIter ) {
584 // get BAM bin map for this reference
585 ReferenceIndex& refIndex = (*indexIter);
586 BamBinMap& bamBinMap = refIndex.Bins;
588 // iterate over BAM bins
589 BamBinMap::iterator binIter = bamBinMap.begin();
590 BamBinMap::iterator binEnd = bamBinMap.end();
591 for ( ; binIter != binEnd; ++binIter ) {
593 // get chunk vector for this bin
594 ChunkVector& binChunks = (*binIter).second;
595 if ( binChunks.size() == 0 ) continue;
597 ChunkVector mergedChunks;
598 mergedChunks.push_back( binChunks[0] );
600 // iterate over chunks
602 ChunkVector::iterator chunkIter = binChunks.begin();
603 ChunkVector::iterator chunkEnd = binChunks.end();
604 for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
606 // get 'currentChunk' based on numeric index
607 Chunk& currentChunk = mergedChunks[i];
609 // get iteratorChunk based on vector iterator
610 Chunk& iteratorChunk = (*chunkIter);
612 // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)
613 if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {
615 // set currentChunk.Stop to iteratorChunk.Stop
616 currentChunk.Stop = iteratorChunk.Stop;
621 // set currentChunk + 1 to iteratorChunk
622 mergedChunks.push_back(iteratorChunk);
627 // saved merged chunk vector
628 (*binIter).second = mergedChunks;
633 // writes in-memory index data out to file
634 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
635 bool BamStandardIndex::BamStandardIndexPrivate::Write(const std::string& bamFilename) {
637 // localize parent data
638 if ( m_parent == 0 ) return false;
639 const bool isBigEndian = m_parent->m_isBigEndian;
641 string indexFilename = bamFilename + ".bai";
642 FILE* indexStream = fopen(indexFilename.c_str(), "wb");
643 if ( indexStream == 0 ) {
644 fprintf(stderr, "ERROR: Could not open file to save index.\n");
648 // write BAM index header
649 fwrite("BAI\1", 1, 4, indexStream);
651 // write number of reference sequences
652 int32_t numReferenceSeqs = m_indexData.size();
653 if ( isBigEndian ) SwapEndian_32(numReferenceSeqs);
654 fwrite(&numReferenceSeqs, 4, 1, indexStream);
656 // iterate over reference sequences
657 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
658 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
659 for ( ; indexIter != indexEnd; ++ indexIter ) {
661 // get reference index data
662 const ReferenceIndex& refIndex = (*indexIter);
663 const BamBinMap& binMap = refIndex.Bins;
664 const LinearOffsetVector& offsets = refIndex.Offsets;
666 // write number of bins
667 int32_t binCount = binMap.size();
668 if ( isBigEndian ) SwapEndian_32(binCount);
669 fwrite(&binCount, 4, 1, indexStream);
672 BamBinMap::const_iterator binIter = binMap.begin();
673 BamBinMap::const_iterator binEnd = binMap.end();
674 for ( ; binIter != binEnd; ++binIter ) {
676 // get bin data (key and chunk vector)
677 uint32_t binKey = (*binIter).first;
678 const ChunkVector& binChunks = (*binIter).second;
681 if ( isBigEndian ) SwapEndian_32(binKey);
682 fwrite(&binKey, 4, 1, indexStream);
685 int32_t chunkCount = binChunks.size();
686 if ( isBigEndian ) SwapEndian_32(chunkCount);
687 fwrite(&chunkCount, 4, 1, indexStream);
689 // iterate over chunks
690 ChunkVector::const_iterator chunkIter = binChunks.begin();
691 ChunkVector::const_iterator chunkEnd = binChunks.end();
692 for ( ; chunkIter != chunkEnd; ++chunkIter ) {
694 // get current chunk data
695 const Chunk& chunk = (*chunkIter);
696 uint64_t start = chunk.Start;
697 uint64_t stop = chunk.Stop;
700 SwapEndian_64(start);
704 // save chunk offsets
705 fwrite(&start, 8, 1, indexStream);
706 fwrite(&stop, 8, 1, indexStream);
710 // write linear offsets size
711 int32_t offsetSize = offsets.size();
712 if ( isBigEndian ) SwapEndian_32(offsetSize);
713 fwrite(&offsetSize, 4, 1, indexStream);
715 // iterate over linear offsets
716 LinearOffsetVector::const_iterator offsetIter = offsets.begin();
717 LinearOffsetVector::const_iterator offsetEnd = offsets.end();
718 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
720 // write linear offset value
721 uint64_t linearOffset = (*offsetIter);
722 if ( isBigEndian ) SwapEndian_64(linearOffset);
723 fwrite(&linearOffset, 8, 1, indexStream);
727 // flush buffer, close file, and return success
733 // ---------------------------------------------------------------
734 // BamStandardIndex implementation
736 BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
737 : BamIndex(bgzf, reader, isBigEndian)
739 d = new BamStandardIndexPrivate(this);
742 BamStandardIndex::~BamStandardIndex(void) {
747 // creates index data (in-memory) from current reader data
748 bool BamStandardIndex::Build(void) { return d->Build(); }
750 // attempts to use index to jump to region; returns success/fail
751 bool BamStandardIndex::Jump(const BamRegion& region) { return d->Jump(region); }
753 // loads existing data from file into memory
754 bool BamStandardIndex::Load(const string& filename) { return d->Load(filename); }
756 // writes in-memory index data out to file
757 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
758 bool BamStandardIndex::Write(const string& bamFilename) { return d->Write(bamFilename); }
760 // #########################################################################################
761 // #########################################################################################
763 // ---------------------------------------------------
764 // BamToolsIndex structs & typedefs
768 // individual index entry
769 struct BamToolsIndexEntry {
772 int32_t MaxEndPosition;
774 int32_t StartPosition;
777 BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
778 const int64_t& startOffset = 0,
779 const int32_t& startPosition = 0)
780 : MaxEndPosition(maxEndPosition)
781 , StartOffset(startOffset)
782 , StartPosition(startPosition)
786 // the actual index data structure
787 typedef map<int, vector<BamToolsIndexEntry> > BamToolsIndexData;
789 } // namespace BamTools
791 // ---------------------------------------------------
792 // BamToolsIndexPrivate implementation
794 struct BamToolsIndex::BamToolsIndexPrivate {
796 // keep a list of any supported versions here
797 // (might be useful later to handle any 'legacy' versions if the format changes)
798 // listed for example like: BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
800 // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
802 // if ( indexVersion >= BTI_1_2 )
806 enum Version { BTI_1_0 = 1 };
808 // -------------------------
810 BamToolsIndexData m_indexData;
811 BamToolsIndex* m_parent;
815 // -------------------------
818 BamToolsIndexPrivate(BamToolsIndex* parent)
821 , m_version(BTI_1_0) // latest version - used for writing new index files
824 ~BamToolsIndexPrivate(void) { }
826 // -------------------------
829 // creates index data (in-memory) from current reader data
831 // returns supported file extension
832 const std::string Extension(void) const { return std::string(".bti"); }
833 // attempts to use index to jump to region; returns success/fail
834 bool Jump(const BamTools::BamRegion& region);
835 // loads existing data from file into memory
836 bool Load(const std::string& filename);
837 // writes in-memory index data out to file
838 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
839 bool Write(const std::string& bamFilename);
841 // -------------------------
844 // calculates offset for a given region
845 bool GetOffset(const BamRegion& region, int64_t& offset);
848 bool BamToolsIndex::BamToolsIndexPrivate::Build(void) {
850 // localize parent data
851 if ( m_parent == 0 ) return false;
852 BamReader* reader = m_parent->m_reader;
853 BgzfData* mBGZF = m_parent->m_BGZF;
854 RefVector& references = m_parent->m_references;
856 // be sure reader & BGZF file are valid & open for reading
857 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
860 // move file pointer to beginning of alignments
862 if ( !reader->Rewind() )
865 // set up counters and markers
866 int32_t currentBlockCount = 0;
867 int64_t currentAlignmentOffset = mBGZF->Tell();
868 int32_t blockRefId = 0;
869 int32_t blockMaxEndPosition = 0;
870 int64_t blockStartOffset = currentAlignmentOffset;
871 int32_t blockStartPosition = -1;
873 // plow through alignments, storing index entries
875 while ( reader->GetNextAlignmentCore(al) ) {
877 // if block contains data (not the first time through) AND alignment is on a new reference
878 if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
880 // make sure reference flag is set
881 references[blockRefId].RefHasAlignments = true;
883 // store previous data
884 m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
886 // intialize new block
887 currentBlockCount = 0;
888 blockMaxEndPosition = al.GetEndPosition();
889 blockStartOffset = currentAlignmentOffset;
892 // if beginning of block, save first alignment's refID & position
893 if ( currentBlockCount == 0 ) {
894 blockRefId = al.RefID;
895 blockStartPosition = al.Position;
898 // increment block counter
901 // check end position
902 int32_t alignmentEndPosition = al.GetEndPosition();
903 if ( alignmentEndPosition > blockMaxEndPosition )
904 blockMaxEndPosition = alignmentEndPosition;
906 // if block is full, get offset for next block, reset currentBlockCount
907 if ( currentBlockCount == m_blockSize ) {
909 // make sure reference flag is set
910 references[blockRefId].RefHasAlignments = true;
912 m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
913 blockStartOffset = mBGZF->Tell();
914 currentBlockCount = 0;
917 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
918 // necessary because we won't know if this next alignment is on a new reference until we actually read it
919 currentAlignmentOffset = mBGZF->Tell();
922 // attempt to rewind back to begnning of alignments
923 // return success/fail
924 return reader->Rewind();
927 // N.B. - ignores isRightBoundSpecified
928 bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int64_t& offset) {
930 // return false if leftBound refID is not found in index data
931 if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) return false;
933 const vector<BamToolsIndexEntry> referenceOffsets = m_indexData[region.LeftRefID];
934 if ( referenceOffsets.empty() ) return false;
936 // -------------------------------------------------------
937 // calculate nearest index to jump to
940 offset = (*referenceOffsets.begin()).StartOffset;
941 vector<BamToolsIndexEntry>::const_iterator indexIter = referenceOffsets.begin();
942 vector<BamToolsIndexEntry>::const_iterator indexEnd = referenceOffsets.end();
943 for ( ; indexIter != indexEnd; ++indexIter ) {
944 const BamToolsIndexEntry& entry = (*indexIter);
945 if ( entry.MaxEndPosition >= region.LeftPosition ) break;
946 offset = (*indexIter).StartOffset;
950 if ( indexIter == indexEnd ) return false;
956 bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region) {
958 // localize parent data
959 if ( m_parent == 0 ) return false;
960 BamReader* reader = m_parent->m_reader;
961 BgzfData* mBGZF = m_parent->m_BGZF;
962 RefVector& references = m_parent->m_references;
964 if ( reader == 0 || mBGZF == 0 || !reader->IsOpen() ) {
965 fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
969 // see if left-bound reference of region has alignments
970 if ( !m_parent->HasAlignments(region.LeftRefID) ) return false;
972 // make sure left-bound position is valid
973 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false;
975 // calculate nearest offset to jump to
977 if ( !GetOffset(region, offset) ) {
978 fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
982 // attempt seek in file, return success/fail
983 return mBGZF->Seek(offset);
986 bool BamToolsIndex::BamToolsIndexPrivate::Load(const string& filename) {
988 // localize parent data
989 if ( m_parent == 0 ) return false;
990 const bool isBigEndian = m_parent->m_isBigEndian;
991 RefVector& references = m_parent->m_references;
993 // open index file, abort on error
994 FILE* indexStream = fopen(filename.c_str(), "rb");
996 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
1000 // set placeholder to receive input byte count (suppresses compiler warnings)
1001 size_t elementsRead = 0;
1003 // see if index is valid BAM index
1005 elementsRead = fread(magic, 1, 4, indexStream);
1006 if ( strncmp(magic, "BTI\1", 4) ) {
1007 fprintf(stderr, "Problem with index file - invalid format.\n");
1008 fclose(indexStream);
1013 int32_t indexVersion;
1014 elementsRead = fread(&indexVersion, sizeof(indexVersion), 1, indexStream);
1015 if ( isBigEndian ) SwapEndian_32(indexVersion);
1016 if ( indexVersion <= 0 || indexVersion > m_version ) {
1017 fprintf(stderr, "Problem with index file - unsupported version.\n");
1018 fclose(indexStream);
1022 // read in block size
1023 elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, indexStream);
1024 if ( isBigEndian ) SwapEndian_32(m_blockSize);
1026 // read in number of references
1027 int32_t numReferences;
1028 elementsRead = fread(&numReferences, sizeof(numReferences), 1, indexStream);
1029 if ( isBigEndian ) SwapEndian_32(numReferences);
1031 // iterate over reference entries
1032 for ( int i = 0; i < numReferences; ++i ) {
1034 // read in number of offsets for this reference
1035 uint32_t numOffsets;
1036 elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream);
1037 if ( isBigEndian ) SwapEndian_32(numOffsets);
1039 // initialize offsets container for this reference
1040 vector<BamToolsIndexEntry> offsets;
1041 offsets.reserve(numOffsets);
1043 // iterate over offset entries
1044 for ( unsigned int j = 0; j < numOffsets; ++j ) {
1047 int32_t maxEndPosition;
1048 int64_t startOffset;
1049 int32_t startPosition;
1052 elementsRead = fread(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream);
1053 elementsRead = fread(&startOffset, sizeof(startOffset), 1, indexStream);
1054 elementsRead = fread(&startPosition, sizeof(startPosition), 1, indexStream);
1056 // swap endian-ness if necessary
1057 if ( isBigEndian ) {
1058 SwapEndian_32(maxEndPosition);
1059 SwapEndian_64(startOffset);
1060 SwapEndian_32(startPosition);
1063 // save current index entry
1064 offsets.push_back( BamToolsIndexEntry(maxEndPosition, startOffset, startPosition) );
1067 // save refID, offsetVector entry into data structure
1068 m_indexData.insert( make_pair(i, offsets) );
1070 // set ref.HasAlignments flag
1071 references[i].RefHasAlignments = (numOffsets != 0);
1074 // close index file and return
1075 fclose(indexStream);
1079 // writes in-memory index data out to file
1080 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
1081 bool BamToolsIndex::BamToolsIndexPrivate::Write(const std::string& bamFilename) {
1083 // localize parent data
1084 if ( m_parent == 0 ) return false;
1085 const bool isBigEndian = m_parent->m_isBigEndian;
1087 // open index file for writing
1088 string indexFilename = bamFilename + ".bti";
1089 FILE* indexStream = fopen(indexFilename.c_str(), "wb");
1090 if ( indexStream == 0 ) {
1091 fprintf(stderr, "ERROR: Could not open file to save index.\n");
1095 // write BTI index format 'magic number'
1096 fwrite("BTI\1", 1, 4, indexStream);
1098 // write BTI index format version
1099 int32_t currentVersion = (int32_t)m_version;
1100 if ( isBigEndian ) SwapEndian_32(currentVersion);
1101 fwrite(¤tVersion, sizeof(int32_t), 1, indexStream);
1104 int32_t blockSize = m_blockSize;
1105 if ( isBigEndian ) SwapEndian_32(blockSize);
1106 fwrite(&blockSize, sizeof(blockSize), 1, indexStream);
1108 // write number of references
1109 int32_t numReferences = (int32_t)m_indexData.size();
1110 if ( isBigEndian ) SwapEndian_32(numReferences);
1111 fwrite(&numReferences, sizeof(numReferences), 1, indexStream);
1113 // iterate through references in index
1114 BamToolsIndexData::const_iterator refIter = m_indexData.begin();
1115 BamToolsIndexData::const_iterator refEnd = m_indexData.end();
1116 for ( ; refIter != refEnd; ++refIter ) {
1118 const vector<BamToolsIndexEntry> offsets = (*refIter).second;
1120 // write number of offsets listed for this reference
1121 uint32_t numOffsets = offsets.size();
1122 if ( isBigEndian ) SwapEndian_32(numOffsets);
1123 fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream);
1125 // iterate over offset entries
1126 vector<BamToolsIndexEntry>::const_iterator offsetIter = offsets.begin();
1127 vector<BamToolsIndexEntry>::const_iterator offsetEnd = offsets.end();
1128 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
1130 // get reference index data
1131 const BamToolsIndexEntry& entry = (*offsetIter);
1134 int32_t maxEndPosition = entry.MaxEndPosition;
1135 int64_t startOffset = entry.StartOffset;
1136 int32_t startPosition = entry.StartPosition;
1138 // swap endian-ness if necessary
1139 if ( isBigEndian ) {
1140 SwapEndian_32(maxEndPosition);
1141 SwapEndian_64(startOffset);
1142 SwapEndian_32(startPosition);
1145 // write the reference index entry
1146 fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream);
1147 fwrite(&startOffset, sizeof(startOffset), 1, indexStream);
1148 fwrite(&startPosition, sizeof(startPosition), 1, indexStream);
1152 // flush any remaining output, close file, and return success
1153 fflush(indexStream);
1154 fclose(indexStream);
1158 // ---------------------------------------------------
1159 // BamToolsIndex implementation
1161 BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
1162 : BamIndex(bgzf, reader, isBigEndian)
1164 d = new BamToolsIndexPrivate(this);
1167 BamToolsIndex::~BamToolsIndex(void) {
1172 // creates index data (in-memory) from current reader data
1173 bool BamToolsIndex::Build(void) { return d->Build(); }
1175 // attempts to use index to jump to region; returns success/fail
1176 bool BamToolsIndex::Jump(const BamRegion& region) { return d->Jump(region); }
1178 // loads existing data from file into memory
1179 bool BamToolsIndex::Load(const string& filename) { return d->Load(filename); }
1181 // writes in-memory index data out to file
1182 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
1183 bool BamToolsIndex::Write(const string& bamFilename) { return d->Write(bamFilename); }