1 // ***************************************************************************
2 // BamIndex.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 16 September 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index functionality - both for the default (standardized) BAM
9 // index format (.bai) as well as a BamTools-specific (nonstandard) index
11 // ***************************************************************************
19 #include "BamReader.h"
22 using namespace BamTools;
24 // -------------------------------
25 // BamIndex implementation
27 BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian)
30 , m_isBigEndian(isBigEndian)
32 if ( m_reader && m_reader->IsOpen() )
33 m_references = m_reader->GetReferenceData();
36 bool BamIndex::HasAlignments(const int& referenceID) {
38 // return false if invalid ID
39 if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) )
42 // else return status of reference (has alignments?)
44 return m_references.at(referenceID).RefHasAlignments;
47 // #########################################################################################
48 // #########################################################################################
50 // -------------------------------
51 // BamStandardIndex structs & typedefs
55 // BAM index constants
56 const int MAX_BIN = 37450; // =(8^6-1)/7+1
57 const int BAM_LIDX_SHIFT = 14;
59 // --------------------------------------------------
60 // BamStandardIndex data structures & typedefs
68 Chunk(const uint64_t& start = 0,
69 const uint64_t& stop = 0)
75 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
76 return lhs.Start < rhs.Start;
79 typedef vector<Chunk> ChunkVector;
80 typedef map<uint32_t, ChunkVector> BamBinMap;
81 typedef vector<uint64_t> LinearOffsetVector;
83 struct ReferenceIndex {
87 LinearOffsetVector Offsets;
90 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
91 const LinearOffsetVector& offsets = LinearOffsetVector())
97 typedef vector<ReferenceIndex> BamStandardIndexData;
99 } // namespace BamTools
101 // -------------------------------
102 // BamStandardIndexPrivate implementation
104 struct BamStandardIndex::BamStandardIndexPrivate {
106 // -------------------------
109 BamStandardIndexData m_indexData;
110 BamStandardIndex* m_parent;
112 // -------------------------
115 BamStandardIndexPrivate(BamStandardIndex* parent) : m_parent(parent) { }
116 ~BamStandardIndexPrivate(void) { m_indexData.clear(); }
118 // -------------------------
121 // creates index data (in-memory) from current reader data
123 // attempts to use index to jump to region; returns success/fail
124 bool Jump(const BamTools::BamRegion& region);
125 // loads existing data from file into memory
126 bool Load(const std::string& filename);
127 // writes in-memory index data out to file
128 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
129 bool Write(const std::string& bamFilename);
131 // -------------------------
134 // calculate bins that overlap region
135 int BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]);
136 // calculates offset(s) for a given region
137 bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets);
138 // saves BAM bin entry for index
139 void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);
140 // saves linear offset entry for index
141 void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);
142 // simplifies index by merging 'chunks'
143 void MergeChunks(void);
146 // calculate bins that overlap region
147 int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) {
149 // get region boundaries
150 uint32_t begin = (unsigned int)region.LeftPosition;
153 // if right bound specified AND left&right bounds are on same reference
154 // OK to use right bound position
155 if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
156 end = (unsigned int)region.RightPosition;
158 // otherwise, use end of left bound reference as cutoff
160 end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
162 // initialize list, bin '0' always a valid bin
166 // get rest of bins that contain this region
168 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; }
169 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; }
170 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; }
171 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; }
172 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
174 // return number of bins stored
178 bool BamStandardIndex::BamStandardIndexPrivate::Build(void) {
180 // localize parent data
181 if ( m_parent == 0 ) return false;
182 BamReader* reader = m_parent->m_reader;
183 BgzfData* mBGZF = m_parent->m_BGZF;
184 RefVector& references = m_parent->m_references;
186 // be sure reader & BGZF file are valid & open for reading
187 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
190 // move file pointer to beginning of alignments
193 // get reference count, reserve index space
194 int numReferences = (int)references.size();
195 for ( int i = 0; i < numReferences; ++i )
196 m_indexData.push_back(ReferenceIndex());
198 // sets default constant for bin, ID, offset, coordinate variables
199 const uint32_t defaultValue = 0xffffffffu;
202 uint32_t saveBin(defaultValue);
203 uint32_t lastBin(defaultValue);
206 int32_t saveRefID(defaultValue);
207 int32_t lastRefID(defaultValue);
210 uint64_t saveOffset = mBGZF->Tell();
211 uint64_t lastOffset = saveOffset;
214 int32_t lastCoordinate = defaultValue;
216 BamAlignment bAlignment;
217 while ( reader->GetNextAlignmentCore(bAlignment) ) {
219 // change of chromosome, save ID, reset bin
220 if ( lastRefID != bAlignment.RefID ) {
221 lastRefID = bAlignment.RefID;
222 lastBin = defaultValue;
225 // if lastCoordinate greater than BAM position - file not sorted properly
226 else if ( lastCoordinate > bAlignment.Position ) {
227 fprintf(stderr, "BAM file not properly sorted:\n");
228 fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);
232 // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
233 if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
235 // save linear offset entry (matched to BAM entry refID)
236 ReferenceIndex& refIndex = m_indexData.at(bAlignment.RefID);
237 LinearOffsetVector& offsets = refIndex.Offsets;
238 InsertLinearOffset(offsets, bAlignment, lastOffset);
241 // if current BamAlignment bin != lastBin, "then possibly write the binning index"
242 if ( bAlignment.Bin != lastBin ) {
244 // if not first time through
245 if ( saveBin != defaultValue ) {
247 // save Bam bin entry
248 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
249 BamBinMap& binMap = refIndex.Bins;
250 InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
254 saveOffset = lastOffset;
257 saveBin = bAlignment.Bin;
258 lastBin = bAlignment.Bin;
261 saveRefID = bAlignment.RefID;
263 // if invalid RefID, break out
264 if ( saveRefID < 0 ) break;
267 // make sure that current file pointer is beyond lastOffset
268 if ( mBGZF->Tell() <= (int64_t)lastOffset ) {
269 fprintf(stderr, "Error in BGZF offsets.\n");
274 lastOffset = mBGZF->Tell();
276 // update lastCoordinate
277 lastCoordinate = bAlignment.Position;
280 // save any leftover BAM data (as long as refID is valid)
281 if ( saveRefID >= 0 ) {
282 // save Bam bin entry
283 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
284 BamBinMap& binMap = refIndex.Bins;
285 InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
288 // simplify index by merging chunks
291 // iterate through references in index
292 // store whether reference has data &
293 // sort offsets in linear offset vector
294 BamStandardIndexData::iterator indexIter = m_indexData.begin();
295 BamStandardIndexData::iterator indexEnd = m_indexData.end();
296 for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
298 // get reference index data
299 ReferenceIndex& refIndex = (*indexIter);
300 BamBinMap& binMap = refIndex.Bins;
301 LinearOffsetVector& offsets = refIndex.Offsets;
303 // store whether reference has alignments or no
304 references[i].RefHasAlignments = ( binMap.size() > 0 );
306 // sort linear offsets
307 sort(offsets.begin(), offsets.end());
310 // rewind file pointer to beginning of alignments, return success/fail
311 return reader->Rewind();
314 bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
316 // calculate which bins overlap this region
317 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
318 int numBins = BinsFromRegion(region, isRightBoundSpecified, bins);
320 // get bins for this reference
321 const ReferenceIndex& refIndex = m_indexData.at(region.LeftRefID);
322 const BamBinMap& binMap = refIndex.Bins;
324 // get minimum offset to consider
325 const LinearOffsetVector& linearOffsets = refIndex.Offsets;
326 uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
328 // store all alignment 'chunk' starts (file offsets) for bins in this region
329 for ( int i = 0; i < numBins; ++i ) {
331 const uint16_t binKey = bins[i];
332 map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
333 if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
335 // iterate over chunks
336 const ChunkVector& chunks = (*binIter).second;
337 std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
338 std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
339 for ( ; chunksIter != chunksEnd; ++chunksIter) {
341 // if valid chunk found, store its file offset
342 const Chunk& chunk = (*chunksIter);
343 if ( chunk.Stop > minOffset )
344 offsets.push_back( chunk.Start );
352 // sort the offsets before returning
353 sort(offsets.begin(), offsets.end());
355 // return whether any offsets were found
356 return ( offsets.size() != 0 );
359 // saves BAM bin entry for index
360 void BamStandardIndex::BamStandardIndexPrivate::InsertBinEntry(BamBinMap& binMap,
361 const uint32_t& saveBin,
362 const uint64_t& saveOffset,
363 const uint64_t& lastOffset)
366 BamBinMap::iterator binIter = binMap.find(saveBin);
369 Chunk newChunk(saveOffset, lastOffset);
371 // if entry doesn't exist
372 if ( binIter == binMap.end() ) {
373 ChunkVector newChunks;
374 newChunks.push_back(newChunk);
375 binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
380 ChunkVector& binChunks = (*binIter).second;
381 binChunks.push_back( newChunk );
385 // saves linear offset entry for index
386 void BamStandardIndex::BamStandardIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets,
387 const BamAlignment& bAlignment,
388 const uint64_t& lastOffset)
390 // get converted offsets
391 int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
392 int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
394 // resize vector if necessary
395 int oldSize = offsets.size();
396 int newSize = endOffset + 1;
397 if ( oldSize < newSize )
398 offsets.resize(newSize, 0);
401 for( int i = beginOffset + 1; i <= endOffset; ++i ) {
402 if ( offsets[i] == 0 )
403 offsets[i] = lastOffset;
407 bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region) {
409 // localize parent data
410 if ( m_parent == 0 ) return false;
411 BamReader* reader = m_parent->m_reader;
412 BgzfData* mBGZF = m_parent->m_BGZF;
413 RefVector& references = m_parent->m_references;
415 // be sure reader & BGZF file are valid & open for reading
416 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
419 // see if left-bound reference of region has alignments
420 if ( !m_parent->HasAlignments(region.LeftRefID) ) return false;
422 // make sure left-bound position is valid
423 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false;
425 vector<int64_t> offsets;
426 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) {
427 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
431 // iterate through offsets
432 BamAlignment bAlignment;
434 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
436 // attempt seek & load first available alignment
437 result &= mBGZF->Seek(*o);
438 reader->GetNextAlignmentCore(bAlignment);
440 // if this alignment corresponds to desired position
441 // return success of seeking back to 'current offset'
442 if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) {
443 if ( o != offsets.begin() ) --o;
444 return mBGZF->Seek(*o);
448 if ( !result ) fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
452 bool BamStandardIndex::BamStandardIndexPrivate::Load(const string& filename) {
454 // localize parent data
455 if ( m_parent == 0 ) return false;
456 const bool isBigEndian = m_parent->m_isBigEndian;
457 RefVector& references = m_parent->m_references;
459 // open index file, abort on error
460 FILE* indexStream = fopen(filename.c_str(), "rb");
462 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
466 // set placeholder to receive input byte count (suppresses compiler warnings)
467 size_t elementsRead = 0;
469 // see if index is valid BAM index
471 elementsRead = fread(magic, 1, 4, indexStream);
472 if ( strncmp(magic, "BAI\1", 4) ) {
473 fprintf(stderr, "Problem with index file - invalid format.\n");
478 // get number of reference sequences
480 elementsRead = fread(&numRefSeqs, 4, 1, indexStream);
481 if ( isBigEndian ) SwapEndian_32(numRefSeqs);
483 // intialize space for BamStandardIndexData data structure
484 m_indexData.reserve(numRefSeqs);
486 // iterate over reference sequences
487 for ( unsigned int i = 0; i < numRefSeqs; ++i ) {
489 // get number of bins for this reference sequence
491 elementsRead = fread(&numBins, 4, 1, indexStream);
492 if ( isBigEndian ) SwapEndian_32(numBins);
495 RefData& refEntry = references[i];
496 refEntry.RefHasAlignments = true;
499 // intialize BinVector
502 // iterate over bins for that reference sequence
503 for ( int j = 0; j < numBins; ++j ) {
507 elementsRead = fread(&binID, 4, 1, indexStream);
509 // get number of regionChunks in this bin
511 elementsRead = fread(&numChunks, 4, 1, indexStream);
514 SwapEndian_32(binID);
515 SwapEndian_32(numChunks);
518 // intialize ChunkVector
519 ChunkVector regionChunks;
520 regionChunks.reserve(numChunks);
522 // iterate over regionChunks in this bin
523 for ( unsigned int k = 0; k < numChunks; ++k ) {
525 // get chunk boundaries (left, right)
528 elementsRead = fread(&left, 8, 1, indexStream);
529 elementsRead = fread(&right, 8, 1, indexStream);
533 SwapEndian_64(right);
537 regionChunks.push_back( Chunk(left, right) );
540 // sort chunks for this bin
541 sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );
543 // save binID, chunkVector for this bin
544 binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );
547 // -----------------------------------------------------
548 // load linear index for this reference sequence
550 // get number of linear offsets
551 int32_t numLinearOffsets;
552 elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);
553 if ( isBigEndian ) SwapEndian_32(numLinearOffsets);
555 // intialize LinearOffsetVector
556 LinearOffsetVector offsets;
557 offsets.reserve(numLinearOffsets);
559 // iterate over linear offsets for this reference sequeence
560 uint64_t linearOffset;
561 for ( int j = 0; j < numLinearOffsets; ++j ) {
562 // read a linear offset & store
563 elementsRead = fread(&linearOffset, 8, 1, indexStream);
564 if ( isBigEndian ) SwapEndian_64(linearOffset);
565 offsets.push_back(linearOffset);
568 // sort linear offsets
569 sort( offsets.begin(), offsets.end() );
571 // store index data for that reference sequence
572 m_indexData.push_back( ReferenceIndex(binMap, offsets) );
575 // close index file (.bai) and return
580 // merges 'alignment chunks' in BAM bin (used for index building)
581 void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) {
583 // iterate over reference enties
584 BamStandardIndexData::iterator indexIter = m_indexData.begin();
585 BamStandardIndexData::iterator indexEnd = m_indexData.end();
586 for ( ; indexIter != indexEnd; ++indexIter ) {
588 // get BAM bin map for this reference
589 ReferenceIndex& refIndex = (*indexIter);
590 BamBinMap& bamBinMap = refIndex.Bins;
592 // iterate over BAM bins
593 BamBinMap::iterator binIter = bamBinMap.begin();
594 BamBinMap::iterator binEnd = bamBinMap.end();
595 for ( ; binIter != binEnd; ++binIter ) {
597 // get chunk vector for this bin
598 ChunkVector& binChunks = (*binIter).second;
599 if ( binChunks.size() == 0 ) continue;
601 ChunkVector mergedChunks;
602 mergedChunks.push_back( binChunks[0] );
604 // iterate over chunks
606 ChunkVector::iterator chunkIter = binChunks.begin();
607 ChunkVector::iterator chunkEnd = binChunks.end();
608 for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
610 // get 'currentChunk' based on numeric index
611 Chunk& currentChunk = mergedChunks[i];
613 // get iteratorChunk based on vector iterator
614 Chunk& iteratorChunk = (*chunkIter);
616 // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)
617 if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {
619 // set currentChunk.Stop to iteratorChunk.Stop
620 currentChunk.Stop = iteratorChunk.Stop;
625 // set currentChunk + 1 to iteratorChunk
626 mergedChunks.push_back(iteratorChunk);
631 // saved merged chunk vector
632 (*binIter).second = mergedChunks;
637 // writes in-memory index data out to file
638 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
639 bool BamStandardIndex::BamStandardIndexPrivate::Write(const std::string& bamFilename) {
641 // localize parent data
642 if ( m_parent == 0 ) return false;
643 const bool isBigEndian = m_parent->m_isBigEndian;
645 string indexFilename = bamFilename + ".bai";
646 FILE* indexStream = fopen(indexFilename.c_str(), "wb");
647 if ( indexStream == 0 ) {
648 fprintf(stderr, "ERROR: Could not open file to save index.\n");
652 // write BAM index header
653 fwrite("BAI\1", 1, 4, indexStream);
655 // write number of reference sequences
656 int32_t numReferenceSeqs = m_indexData.size();
657 if ( isBigEndian ) SwapEndian_32(numReferenceSeqs);
658 fwrite(&numReferenceSeqs, 4, 1, indexStream);
660 // iterate over reference sequences
661 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
662 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
663 for ( ; indexIter != indexEnd; ++ indexIter ) {
665 // get reference index data
666 const ReferenceIndex& refIndex = (*indexIter);
667 const BamBinMap& binMap = refIndex.Bins;
668 const LinearOffsetVector& offsets = refIndex.Offsets;
670 // write number of bins
671 int32_t binCount = binMap.size();
672 if ( isBigEndian ) SwapEndian_32(binCount);
673 fwrite(&binCount, 4, 1, indexStream);
676 BamBinMap::const_iterator binIter = binMap.begin();
677 BamBinMap::const_iterator binEnd = binMap.end();
678 for ( ; binIter != binEnd; ++binIter ) {
680 // get bin data (key and chunk vector)
681 uint32_t binKey = (*binIter).first;
682 const ChunkVector& binChunks = (*binIter).second;
685 if ( isBigEndian ) SwapEndian_32(binKey);
686 fwrite(&binKey, 4, 1, indexStream);
689 int32_t chunkCount = binChunks.size();
690 if ( isBigEndian ) SwapEndian_32(chunkCount);
691 fwrite(&chunkCount, 4, 1, indexStream);
693 // iterate over chunks
694 ChunkVector::const_iterator chunkIter = binChunks.begin();
695 ChunkVector::const_iterator chunkEnd = binChunks.end();
696 for ( ; chunkIter != chunkEnd; ++chunkIter ) {
698 // get current chunk data
699 const Chunk& chunk = (*chunkIter);
700 uint64_t start = chunk.Start;
701 uint64_t stop = chunk.Stop;
704 SwapEndian_64(start);
708 // save chunk offsets
709 fwrite(&start, 8, 1, indexStream);
710 fwrite(&stop, 8, 1, indexStream);
714 // write linear offsets size
715 int32_t offsetSize = offsets.size();
716 if ( isBigEndian ) SwapEndian_32(offsetSize);
717 fwrite(&offsetSize, 4, 1, indexStream);
719 // iterate over linear offsets
720 LinearOffsetVector::const_iterator offsetIter = offsets.begin();
721 LinearOffsetVector::const_iterator offsetEnd = offsets.end();
722 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
724 // write linear offset value
725 uint64_t linearOffset = (*offsetIter);
726 if ( isBigEndian ) SwapEndian_64(linearOffset);
727 fwrite(&linearOffset, 8, 1, indexStream);
731 // flush buffer, close file, and return success
737 // ---------------------------------------------------------------
738 // BamStandardIndex implementation
740 BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
741 : BamIndex(bgzf, reader, isBigEndian)
743 d = new BamStandardIndexPrivate(this);
746 BamStandardIndex::~BamStandardIndex(void) {
751 // creates index data (in-memory) from current reader data
752 bool BamStandardIndex::Build(void) { return d->Build(); }
754 // attempts to use index to jump to region; returns success/fail
755 bool BamStandardIndex::Jump(const BamRegion& region) { return d->Jump(region); }
757 // loads existing data from file into memory
758 bool BamStandardIndex::Load(const string& filename) { return d->Load(filename); }
760 // writes in-memory index data out to file
761 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
762 bool BamStandardIndex::Write(const string& bamFilename) { return d->Write(bamFilename); }
764 // #########################################################################################
765 // #########################################################################################
767 // ---------------------------------------------------
768 // BamToolsIndex structs & typedefs
772 // individual index entry
773 struct BamToolsIndexEntry {
776 int32_t MaxEndPosition;
778 int32_t StartPosition;
781 BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
782 const int64_t& startOffset = 0,
783 const int32_t& startPosition = 0)
784 : MaxEndPosition(maxEndPosition)
785 , StartOffset(startOffset)
786 , StartPosition(startPosition)
790 // the actual index data structure
791 typedef map<int, vector<BamToolsIndexEntry> > BamToolsIndexData;
793 } // namespace BamTools
795 // ---------------------------------------------------
796 // BamToolsIndexPrivate implementation
798 struct BamToolsIndex::BamToolsIndexPrivate {
800 // keep a list of any supported versions here
801 // (might be useful later to handle any 'legacy' versions if the format changes)
802 // listed for example like: BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
804 // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
806 // if ( indexVersion >= BTI_1_2 )
810 enum Version { BTI_1_0 = 1 };
812 // -------------------------
814 BamToolsIndexData m_indexData;
815 BamToolsIndex* m_parent;
819 // -------------------------
822 BamToolsIndexPrivate(BamToolsIndex* parent)
825 , m_version(BTI_1_0) // latest version - used for writing new index files
828 ~BamToolsIndexPrivate(void) { }
830 // -------------------------
833 // creates index data (in-memory) from current reader data
835 // returns supported file extension
836 const std::string Extension(void) const { return std::string(".bti"); }
837 // attempts to use index to jump to region; returns success/fail
838 bool Jump(const BamTools::BamRegion& region);
839 // loads existing data from file into memory
840 bool Load(const std::string& filename);
841 // writes in-memory index data out to file
842 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
843 bool Write(const std::string& bamFilename);
845 // -------------------------
848 // calculates offset for a given region
849 bool GetOffset(const BamRegion& region, int64_t& offset);
852 bool BamToolsIndex::BamToolsIndexPrivate::Build(void) {
854 // localize parent data
855 if ( m_parent == 0 ) return false;
856 BamReader* reader = m_parent->m_reader;
857 BgzfData* mBGZF = m_parent->m_BGZF;
858 RefVector& references = m_parent->m_references;
860 // be sure reader & BGZF file are valid & open for reading
861 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
864 // move file pointer to beginning of alignments
866 if ( !reader->Rewind() )
869 // set up counters and markers
870 int32_t currentBlockCount = 0;
871 int64_t currentAlignmentOffset = mBGZF->Tell();
872 int32_t blockRefId = 0;
873 int32_t blockMaxEndPosition = 0;
874 int64_t blockStartOffset = currentAlignmentOffset;
875 int32_t blockStartPosition = -1;
877 // plow through alignments, storing index entries
879 while ( reader->GetNextAlignmentCore(al) ) {
881 // if block contains data (not the first time through) AND alignment is on a new reference
882 if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
884 // make sure reference flag is set
885 references[blockRefId].RefHasAlignments = true;
887 // store previous data
888 m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
890 // intialize new block
891 currentBlockCount = 0;
892 blockMaxEndPosition = al.GetEndPosition();
893 blockStartOffset = currentAlignmentOffset;
896 // if beginning of block, save first alignment's refID & position
897 if ( currentBlockCount == 0 ) {
898 blockRefId = al.RefID;
899 blockStartPosition = al.Position;
902 // increment block counter
905 // check end position
906 int32_t alignmentEndPosition = al.GetEndPosition();
907 if ( alignmentEndPosition > blockMaxEndPosition )
908 blockMaxEndPosition = alignmentEndPosition;
910 // if block is full, get offset for next block, reset currentBlockCount
911 if ( currentBlockCount == m_blockSize ) {
913 // make sure reference flag is set
914 references[blockRefId].RefHasAlignments = true;
916 m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
917 blockStartOffset = mBGZF->Tell();
918 currentBlockCount = 0;
921 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
922 // necessary because we won't know if this next alignment is on a new reference until we actually read it
923 currentAlignmentOffset = mBGZF->Tell();
926 // attempt to rewind back to begnning of alignments
927 // return success/fail
928 return reader->Rewind();
931 // N.B. - ignores isRightBoundSpecified
932 bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int64_t& offset) {
934 // return false if leftBound refID is not found in index data
935 if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) return false;
937 const vector<BamToolsIndexEntry> referenceOffsets = m_indexData[region.LeftRefID];
938 if ( referenceOffsets.empty() ) return false;
940 // -------------------------------------------------------
941 // calculate nearest index to jump to
944 offset = (*referenceOffsets.begin()).StartOffset;
945 vector<BamToolsIndexEntry>::const_iterator indexIter = referenceOffsets.begin();
946 vector<BamToolsIndexEntry>::const_iterator indexEnd = referenceOffsets.end();
947 for ( ; indexIter != indexEnd; ++indexIter ) {
948 const BamToolsIndexEntry& entry = (*indexIter);
949 if ( entry.MaxEndPosition >= region.LeftPosition ) break;
950 offset = (*indexIter).StartOffset;
954 if ( indexIter == indexEnd ) return false;
960 bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region) {
962 // localize parent data
963 if ( m_parent == 0 ) return false;
964 BamReader* reader = m_parent->m_reader;
965 BgzfData* mBGZF = m_parent->m_BGZF;
966 RefVector& references = m_parent->m_references;
968 if ( reader == 0 || mBGZF == 0 || !reader->IsOpen() ) {
969 fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
973 // see if left-bound reference of region has alignments
974 if ( !m_parent->HasAlignments(region.LeftRefID) ) return false;
976 // make sure left-bound position is valid
977 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false;
979 // calculate nearest offset to jump to
981 if ( !GetOffset(region, offset) ) {
982 fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
986 // attempt seek in file, return success/fail
987 return mBGZF->Seek(offset);
990 bool BamToolsIndex::BamToolsIndexPrivate::Load(const string& filename) {
992 // localize parent data
993 if ( m_parent == 0 ) return false;
994 const bool isBigEndian = m_parent->m_isBigEndian;
995 RefVector& references = m_parent->m_references;
997 // open index file, abort on error
998 FILE* indexStream = fopen(filename.c_str(), "rb");
1000 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
1004 // set placeholder to receive input byte count (suppresses compiler warnings)
1005 size_t elementsRead = 0;
1007 // see if index is valid BAM index
1009 elementsRead = fread(magic, 1, 4, indexStream);
1010 if ( strncmp(magic, "BTI\1", 4) ) {
1011 fprintf(stderr, "Problem with index file - invalid format.\n");
1012 fclose(indexStream);
1017 int32_t indexVersion;
1018 elementsRead = fread(&indexVersion, sizeof(indexVersion), 1, indexStream);
1019 if ( isBigEndian ) SwapEndian_32(indexVersion);
1020 if ( indexVersion <= 0 || indexVersion > m_version ) {
1021 fprintf(stderr, "Problem with index file - unsupported version.\n");
1022 fclose(indexStream);
1026 // read in block size
1027 elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, indexStream);
1028 if ( isBigEndian ) SwapEndian_32(m_blockSize);
1030 // read in number of references
1031 int32_t numReferences;
1032 elementsRead = fread(&numReferences, sizeof(numReferences), 1, indexStream);
1033 if ( isBigEndian ) SwapEndian_32(numReferences);
1035 // iterate over reference entries
1036 for ( int i = 0; i < numReferences; ++i ) {
1038 // read in number of offsets for this reference
1039 uint32_t numOffsets;
1040 elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream);
1041 if ( isBigEndian ) SwapEndian_32(numOffsets);
1043 // initialize offsets container for this reference
1044 vector<BamToolsIndexEntry> offsets;
1045 offsets.reserve(numOffsets);
1047 // iterate over offset entries
1048 for ( unsigned int j = 0; j < numOffsets; ++j ) {
1051 int32_t maxEndPosition;
1052 int64_t startOffset;
1053 int32_t startPosition;
1056 elementsRead = fread(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream);
1057 elementsRead = fread(&startOffset, sizeof(startOffset), 1, indexStream);
1058 elementsRead = fread(&startPosition, sizeof(startPosition), 1, indexStream);
1060 // swap endian-ness if necessary
1061 if ( isBigEndian ) {
1062 SwapEndian_32(maxEndPosition);
1063 SwapEndian_64(startOffset);
1064 SwapEndian_32(startPosition);
1067 // save current index entry
1068 offsets.push_back( BamToolsIndexEntry(maxEndPosition, startOffset, startPosition) );
1071 // save refID, offsetVector entry into data structure
1072 m_indexData.insert( make_pair(i, offsets) );
1074 // set ref.HasAlignments flag
1075 references[i].RefHasAlignments = (numOffsets != 0);
1078 // close index file and return
1079 fclose(indexStream);
1083 // writes in-memory index data out to file
1084 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
1085 bool BamToolsIndex::BamToolsIndexPrivate::Write(const std::string& bamFilename) {
1087 // localize parent data
1088 if ( m_parent == 0 ) return false;
1089 const bool isBigEndian = m_parent->m_isBigEndian;
1091 // open index file for writing
1092 string indexFilename = bamFilename + ".bti";
1093 FILE* indexStream = fopen(indexFilename.c_str(), "wb");
1094 if ( indexStream == 0 ) {
1095 fprintf(stderr, "ERROR: Could not open file to save index.\n");
1099 // write BTI index format 'magic number'
1100 fwrite("BTI\1", 1, 4, indexStream);
1102 // write BTI index format version
1103 int32_t currentVersion = (int32_t)m_version;
1104 if ( isBigEndian ) SwapEndian_32(currentVersion);
1105 fwrite(¤tVersion, sizeof(int32_t), 1, indexStream);
1108 int32_t blockSize = m_blockSize;
1109 if ( isBigEndian ) SwapEndian_32(blockSize);
1110 fwrite(&blockSize, sizeof(blockSize), 1, indexStream);
1112 // write number of references
1113 int32_t numReferences = (int32_t)m_indexData.size();
1114 if ( isBigEndian ) SwapEndian_32(numReferences);
1115 fwrite(&numReferences, sizeof(numReferences), 1, indexStream);
1117 // iterate through references in index
1118 BamToolsIndexData::const_iterator refIter = m_indexData.begin();
1119 BamToolsIndexData::const_iterator refEnd = m_indexData.end();
1120 for ( ; refIter != refEnd; ++refIter ) {
1122 const vector<BamToolsIndexEntry> offsets = (*refIter).second;
1124 // write number of offsets listed for this reference
1125 uint32_t numOffsets = offsets.size();
1126 if ( isBigEndian ) SwapEndian_32(numOffsets);
1127 fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream);
1129 // iterate over offset entries
1130 vector<BamToolsIndexEntry>::const_iterator offsetIter = offsets.begin();
1131 vector<BamToolsIndexEntry>::const_iterator offsetEnd = offsets.end();
1132 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
1134 // get reference index data
1135 const BamToolsIndexEntry& entry = (*offsetIter);
1138 int32_t maxEndPosition = entry.MaxEndPosition;
1139 int64_t startOffset = entry.StartOffset;
1140 int32_t startPosition = entry.StartPosition;
1142 // swap endian-ness if necessary
1143 if ( isBigEndian ) {
1144 SwapEndian_32(maxEndPosition);
1145 SwapEndian_64(startOffset);
1146 SwapEndian_32(startPosition);
1149 // write the reference index entry
1150 fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream);
1151 fwrite(&startOffset, sizeof(startOffset), 1, indexStream);
1152 fwrite(&startPosition, sizeof(startPosition), 1, indexStream);
1156 // flush any remaining output, close file, and return success
1157 fflush(indexStream);
1158 fclose(indexStream);
1162 // ---------------------------------------------------
1163 // BamToolsIndex implementation
1165 BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
1166 : BamIndex(bgzf, reader, isBigEndian)
1168 d = new BamToolsIndexPrivate(this);
1171 BamToolsIndex::~BamToolsIndex(void) {
1176 // creates index data (in-memory) from current reader data
1177 bool BamToolsIndex::Build(void) { return d->Build(); }
1179 // attempts to use index to jump to region; returns success/fail
1180 bool BamToolsIndex::Jump(const BamRegion& region) { return d->Jump(region); }
1182 // loads existing data from file into memory
1183 bool BamToolsIndex::Load(const string& filename) { return d->Load(filename); }
1185 // writes in-memory index data out to file
1186 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
1187 bool BamToolsIndex::Write(const string& bamFilename) { return d->Write(bamFilename); }