1 // ***************************************************************************
2 // BamIndex.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 9 October 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index functionality - both for the default (standardized) BAM
9 // index format (.bai) as well as a BamTools-specific (nonstandard) index
11 // ***************************************************************************
19 #include "BamReader.h"
22 using namespace BamTools;
24 // -------------------------------
25 // BamIndex implementation
27 BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian)
30 , m_isBigEndian(isBigEndian)
32 if ( m_reader && m_reader->IsOpen() )
33 m_references = m_reader->GetReferenceData();
36 // #########################################################################################
37 // #########################################################################################
39 // -------------------------------
40 // BamStandardIndex structs & typedefs
44 // BAM index constants
45 const int MAX_BIN = 37450; // =(8^6-1)/7+1
46 const int BAM_LIDX_SHIFT = 14;
48 // --------------------------------------------------
49 // BamStandardIndex data structures & typedefs
57 Chunk(const uint64_t& start = 0,
58 const uint64_t& stop = 0)
64 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
65 return lhs.Start < rhs.Start;
68 typedef vector<Chunk> ChunkVector;
69 typedef map<uint32_t, ChunkVector> BamBinMap;
70 typedef vector<uint64_t> LinearOffsetVector;
72 struct ReferenceIndex {
76 LinearOffsetVector Offsets;
79 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
80 const LinearOffsetVector& offsets = LinearOffsetVector())
86 typedef vector<ReferenceIndex> BamStandardIndexData;
88 } // namespace BamTools
90 // -------------------------------
91 // BamStandardIndexPrivate implementation
93 struct BamStandardIndex::BamStandardIndexPrivate {
95 // -------------------------
98 BamStandardIndexData m_indexData;
99 BamStandardIndex* m_parent;
101 // -------------------------
104 BamStandardIndexPrivate(BamStandardIndex* parent) : m_parent(parent) { }
105 ~BamStandardIndexPrivate(void) { m_indexData.clear(); }
107 // -------------------------
110 // creates index data (in-memory) from current reader data
112 // returns whether reference has alignments or no
113 bool HasAlignments(const int& referenceID) const;
114 // attempts to use index to jump to region; returns success/fail
115 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
116 // loads existing data from file into memory
117 bool Load(const std::string& filename);
118 // writes in-memory index data out to file
119 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
120 bool Write(const std::string& bamFilename);
122 // -------------------------
125 // calculate bins that overlap region
126 int BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]);
127 // calculates offset(s) for a given region
128 bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets, bool* hasAlignmentsInRegion);
129 // saves BAM bin entry for index
130 void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);
131 // saves linear offset entry for index
132 void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);
133 // simplifies index by merging 'chunks'
134 void MergeChunks(void);
137 // calculate bins that overlap region
138 int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) {
140 // get region boundaries
141 uint32_t begin = (unsigned int)region.LeftPosition;
144 // if right bound specified AND left&right bounds are on same reference
145 // OK to use right bound position
146 if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
147 end = (unsigned int)region.RightPosition;
149 // otherwise, use end of left bound reference as cutoff
151 end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
153 // initialize list, bin '0' always a valid bin
157 // get rest of bins that contain this region
159 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; }
160 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; }
161 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; }
162 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; }
163 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
165 // return number of bins stored
169 // creates index data (in-memory) from current reader data
170 bool BamStandardIndex::BamStandardIndexPrivate::Build(void) {
172 // localize parent data
173 if ( m_parent == 0 ) return false;
174 BamReader* reader = m_parent->m_reader;
175 BgzfData* mBGZF = m_parent->m_BGZF;
177 // be sure reader & BGZF file are valid & open for reading
178 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
181 // move file pointer to beginning of alignments
184 // get reference count, reserve index space
185 const int numReferences = (int)m_parent->m_references.size();
186 for ( int i = 0; i < numReferences; ++i )
187 m_indexData.push_back(ReferenceIndex());
189 // sets default constant for bin, ID, offset, coordinate variables
190 const uint32_t defaultValue = 0xffffffffu;
193 uint32_t saveBin(defaultValue);
194 uint32_t lastBin(defaultValue);
197 int32_t saveRefID(defaultValue);
198 int32_t lastRefID(defaultValue);
201 uint64_t saveOffset = mBGZF->Tell();
202 uint64_t lastOffset = saveOffset;
205 int32_t lastCoordinate = defaultValue;
207 BamAlignment bAlignment;
208 while ( reader->GetNextAlignmentCore(bAlignment) ) {
210 // change of chromosome, save ID, reset bin
211 if ( lastRefID != bAlignment.RefID ) {
212 lastRefID = bAlignment.RefID;
213 lastBin = defaultValue;
216 // if lastCoordinate greater than BAM position - file not sorted properly
217 else if ( lastCoordinate > bAlignment.Position ) {
218 fprintf(stderr, "BAM file not properly sorted:\n");
219 fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);
223 // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
224 if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
226 // save linear offset entry (matched to BAM entry refID)
227 ReferenceIndex& refIndex = m_indexData.at(bAlignment.RefID);
228 LinearOffsetVector& offsets = refIndex.Offsets;
229 InsertLinearOffset(offsets, bAlignment, lastOffset);
232 // if current BamAlignment bin != lastBin, "then possibly write the binning index"
233 if ( bAlignment.Bin != lastBin ) {
235 // if not first time through
236 if ( saveBin != defaultValue ) {
238 // save Bam bin entry
239 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
240 BamBinMap& binMap = refIndex.Bins;
241 InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
245 saveOffset = lastOffset;
248 saveBin = bAlignment.Bin;
249 lastBin = bAlignment.Bin;
252 saveRefID = bAlignment.RefID;
254 // if invalid RefID, break out
255 if ( saveRefID < 0 ) break;
258 // make sure that current file pointer is beyond lastOffset
259 if ( mBGZF->Tell() <= (int64_t)lastOffset ) {
260 fprintf(stderr, "Error in BGZF offsets.\n");
265 lastOffset = mBGZF->Tell();
267 // update lastCoordinate
268 lastCoordinate = bAlignment.Position;
271 // save any leftover BAM data (as long as refID is valid)
272 if ( saveRefID >= 0 ) {
273 // save Bam bin entry
274 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
275 BamBinMap& binMap = refIndex.Bins;
276 InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
279 // simplify index by merging chunks
282 // iterate through references in index
283 // sort offsets in linear offset vector
284 BamStandardIndexData::iterator indexIter = m_indexData.begin();
285 BamStandardIndexData::iterator indexEnd = m_indexData.end();
286 for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
288 // get reference index data
289 ReferenceIndex& refIndex = (*indexIter);
290 LinearOffsetVector& offsets = refIndex.Offsets;
292 // sort linear offsets
293 sort(offsets.begin(), offsets.end());
296 // rewind file pointer to beginning of alignments, return success/fail
297 return reader->Rewind();
300 // calculates offset(s) for a given region
301 bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets, bool* hasAlignmentsInRegion) {
303 // calculate which bins overlap this region
304 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
305 int numBins = BinsFromRegion(region, isRightBoundSpecified, bins);
307 // get bins for this reference
308 const ReferenceIndex& refIndex = m_indexData.at(region.LeftRefID);
309 const BamBinMap& binMap = refIndex.Bins;
311 // get minimum offset to consider
312 const LinearOffsetVector& linearOffsets = refIndex.Offsets;
313 uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
315 // store all alignment 'chunk' starts (file offsets) for bins in this region
316 for ( int i = 0; i < numBins; ++i ) {
318 const uint16_t binKey = bins[i];
319 map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
320 if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
322 // iterate over chunks
323 const ChunkVector& chunks = (*binIter).second;
324 std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
325 std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
326 for ( ; chunksIter != chunksEnd; ++chunksIter) {
328 // if valid chunk found, store its file offset
329 const Chunk& chunk = (*chunksIter);
330 if ( chunk.Stop > minOffset )
331 offsets.push_back( chunk.Start );
339 // sort the offsets before returning
340 sort(offsets.begin(), offsets.end());
342 // set flag & return success
343 *hasAlignmentsInRegion = (offsets.size() != 0 );
347 // returns whether reference has alignments or no
348 bool BamStandardIndex::BamStandardIndexPrivate::HasAlignments(const int& referenceID) const {
351 if ( referenceID < 0 || referenceID >= (int)m_indexData.size() )
354 // return whether reference contains data
355 const ReferenceIndex& refIndex = m_indexData.at(referenceID);
356 return !( refIndex.Bins.empty() );
359 // saves BAM bin entry for index
360 void BamStandardIndex::BamStandardIndexPrivate::InsertBinEntry(BamBinMap& binMap,
361 const uint32_t& saveBin,
362 const uint64_t& saveOffset,
363 const uint64_t& lastOffset)
366 BamBinMap::iterator binIter = binMap.find(saveBin);
369 Chunk newChunk(saveOffset, lastOffset);
371 // if entry doesn't exist
372 if ( binIter == binMap.end() ) {
373 ChunkVector newChunks;
374 newChunks.push_back(newChunk);
375 binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
380 ChunkVector& binChunks = (*binIter).second;
381 binChunks.push_back( newChunk );
385 // saves linear offset entry for index
386 void BamStandardIndex::BamStandardIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets,
387 const BamAlignment& bAlignment,
388 const uint64_t& lastOffset)
390 // get converted offsets
391 int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
392 int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
394 // resize vector if necessary
395 int oldSize = offsets.size();
396 int newSize = endOffset + 1;
397 if ( oldSize < newSize )
398 offsets.resize(newSize, 0);
401 for( int i = beginOffset + 1; i <= endOffset; ++i ) {
402 if ( offsets[i] == 0 )
403 offsets[i] = lastOffset;
407 // attempts to use index to jump to region; returns success/fail
408 bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
410 // localize parent data
411 if ( m_parent == 0 ) return false;
412 BamReader* reader = m_parent->m_reader;
413 BgzfData* mBGZF = m_parent->m_BGZF;
414 RefVector& references = m_parent->m_references;
416 // be sure reader & BGZF file are valid & open for reading
417 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
420 // make sure left-bound position is valid
421 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false;
423 // calculate offsets for this region
424 // if failed, print message, set flag, and return failure
425 vector<int64_t> offsets;
426 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) {
427 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n");
428 *hasAlignmentsInRegion = false;
432 // iterate through offsets
433 BamAlignment bAlignment;
435 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
437 // attempt seek & load first available alignment
438 // set flag to true if data exists
439 result &= mBGZF->Seek(*o);
440 *hasAlignmentsInRegion = reader->GetNextAlignmentCore(bAlignment);
442 // if this alignment corresponds to desired position
443 // return success of seeking back to the offset before the 'current offset' (to cover overlaps)
444 if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) {
445 if ( o != offsets.begin() ) --o;
446 return mBGZF->Seek(*o);
450 // if error in jumping, print message & set flag
452 fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n");
453 *hasAlignmentsInRegion = false;
456 // return success/failure
460 // loads existing data from file into memory
461 bool BamStandardIndex::BamStandardIndexPrivate::Load(const string& filename) {
463 // localize parent data
464 if ( m_parent == 0 ) return false;
465 const bool isBigEndian = m_parent->m_isBigEndian;
467 // open index file, abort on error
468 FILE* indexStream = fopen(filename.c_str(), "rb");
470 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
474 // set placeholder to receive input byte count (suppresses compiler warnings)
475 size_t elementsRead = 0;
477 // see if index is valid BAM index
479 elementsRead = fread(magic, 1, 4, indexStream);
480 if ( strncmp(magic, "BAI\1", 4) ) {
481 fprintf(stderr, "Problem with index file - invalid format.\n");
486 // get number of reference sequences
488 elementsRead = fread(&numRefSeqs, 4, 1, indexStream);
489 if ( isBigEndian ) SwapEndian_32(numRefSeqs);
491 // intialize space for BamStandardIndexData data structure
492 m_indexData.reserve(numRefSeqs);
494 // iterate over reference sequences
495 for ( unsigned int i = 0; i < numRefSeqs; ++i ) {
497 // get number of bins for this reference sequence
499 elementsRead = fread(&numBins, 4, 1, indexStream);
500 if ( isBigEndian ) SwapEndian_32(numBins);
502 // intialize BinVector
505 // iterate over bins for that reference sequence
506 for ( int j = 0; j < numBins; ++j ) {
510 elementsRead = fread(&binID, 4, 1, indexStream);
512 // get number of regionChunks in this bin
514 elementsRead = fread(&numChunks, 4, 1, indexStream);
517 SwapEndian_32(binID);
518 SwapEndian_32(numChunks);
521 // intialize ChunkVector
522 ChunkVector regionChunks;
523 regionChunks.reserve(numChunks);
525 // iterate over regionChunks in this bin
526 for ( unsigned int k = 0; k < numChunks; ++k ) {
528 // get chunk boundaries (left, right)
531 elementsRead = fread(&left, 8, 1, indexStream);
532 elementsRead = fread(&right, 8, 1, indexStream);
536 SwapEndian_64(right);
540 regionChunks.push_back( Chunk(left, right) );
543 // sort chunks for this bin
544 sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );
546 // save binID, chunkVector for this bin
547 binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );
550 // -----------------------------------------------------
551 // load linear index for this reference sequence
553 // get number of linear offsets
554 int32_t numLinearOffsets;
555 elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);
556 if ( isBigEndian ) SwapEndian_32(numLinearOffsets);
558 // intialize LinearOffsetVector
559 LinearOffsetVector offsets;
560 offsets.reserve(numLinearOffsets);
562 // iterate over linear offsets for this reference sequeence
563 uint64_t linearOffset;
564 for ( int j = 0; j < numLinearOffsets; ++j ) {
565 // read a linear offset & store
566 elementsRead = fread(&linearOffset, 8, 1, indexStream);
567 if ( isBigEndian ) SwapEndian_64(linearOffset);
568 offsets.push_back(linearOffset);
571 // sort linear offsets
572 sort( offsets.begin(), offsets.end() );
574 // store index data for that reference sequence
575 m_indexData.push_back( ReferenceIndex(binMap, offsets) );
578 // close index file (.bai) and return
583 // merges 'alignment chunks' in BAM bin (used for index building)
584 void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) {
586 // iterate over reference enties
587 BamStandardIndexData::iterator indexIter = m_indexData.begin();
588 BamStandardIndexData::iterator indexEnd = m_indexData.end();
589 for ( ; indexIter != indexEnd; ++indexIter ) {
591 // get BAM bin map for this reference
592 ReferenceIndex& refIndex = (*indexIter);
593 BamBinMap& bamBinMap = refIndex.Bins;
595 // iterate over BAM bins
596 BamBinMap::iterator binIter = bamBinMap.begin();
597 BamBinMap::iterator binEnd = bamBinMap.end();
598 for ( ; binIter != binEnd; ++binIter ) {
600 // get chunk vector for this bin
601 ChunkVector& binChunks = (*binIter).second;
602 if ( binChunks.size() == 0 ) continue;
604 ChunkVector mergedChunks;
605 mergedChunks.push_back( binChunks[0] );
607 // iterate over chunks
609 ChunkVector::iterator chunkIter = binChunks.begin();
610 ChunkVector::iterator chunkEnd = binChunks.end();
611 for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
613 // get 'currentChunk' based on numeric index
614 Chunk& currentChunk = mergedChunks[i];
616 // get iteratorChunk based on vector iterator
617 Chunk& iteratorChunk = (*chunkIter);
619 // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)
620 if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {
622 // set currentChunk.Stop to iteratorChunk.Stop
623 currentChunk.Stop = iteratorChunk.Stop;
628 // set currentChunk + 1 to iteratorChunk
629 mergedChunks.push_back(iteratorChunk);
634 // saved merged chunk vector
635 (*binIter).second = mergedChunks;
640 // writes in-memory index data out to file
641 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
642 bool BamStandardIndex::BamStandardIndexPrivate::Write(const std::string& bamFilename) {
644 // localize parent data
645 if ( m_parent == 0 ) return false;
646 const bool isBigEndian = m_parent->m_isBigEndian;
649 string indexFilename = bamFilename + ".bai";
650 FILE* indexStream = fopen(indexFilename.c_str(), "wb");
651 if ( indexStream == 0 ) {
652 fprintf(stderr, "ERROR: Could not open file to save index.\n");
656 // write BAM index header
657 fwrite("BAI\1", 1, 4, indexStream);
659 // write number of reference sequences
660 int32_t numReferenceSeqs = m_indexData.size();
661 if ( isBigEndian ) SwapEndian_32(numReferenceSeqs);
662 fwrite(&numReferenceSeqs, 4, 1, indexStream);
664 // iterate over reference sequences
665 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
666 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
667 for ( ; indexIter != indexEnd; ++ indexIter ) {
669 // get reference index data
670 const ReferenceIndex& refIndex = (*indexIter);
671 const BamBinMap& binMap = refIndex.Bins;
672 const LinearOffsetVector& offsets = refIndex.Offsets;
674 // write number of bins
675 int32_t binCount = binMap.size();
676 if ( isBigEndian ) SwapEndian_32(binCount);
677 fwrite(&binCount, 4, 1, indexStream);
680 BamBinMap::const_iterator binIter = binMap.begin();
681 BamBinMap::const_iterator binEnd = binMap.end();
682 for ( ; binIter != binEnd; ++binIter ) {
684 // get bin data (key and chunk vector)
685 uint32_t binKey = (*binIter).first;
686 const ChunkVector& binChunks = (*binIter).second;
689 if ( isBigEndian ) SwapEndian_32(binKey);
690 fwrite(&binKey, 4, 1, indexStream);
693 int32_t chunkCount = binChunks.size();
694 if ( isBigEndian ) SwapEndian_32(chunkCount);
695 fwrite(&chunkCount, 4, 1, indexStream);
697 // iterate over chunks
698 ChunkVector::const_iterator chunkIter = binChunks.begin();
699 ChunkVector::const_iterator chunkEnd = binChunks.end();
700 for ( ; chunkIter != chunkEnd; ++chunkIter ) {
702 // get current chunk data
703 const Chunk& chunk = (*chunkIter);
704 uint64_t start = chunk.Start;
705 uint64_t stop = chunk.Stop;
708 SwapEndian_64(start);
712 // save chunk offsets
713 fwrite(&start, 8, 1, indexStream);
714 fwrite(&stop, 8, 1, indexStream);
718 // write linear offsets size
719 int32_t offsetSize = offsets.size();
720 if ( isBigEndian ) SwapEndian_32(offsetSize);
721 fwrite(&offsetSize, 4, 1, indexStream);
723 // iterate over linear offsets
724 LinearOffsetVector::const_iterator offsetIter = offsets.begin();
725 LinearOffsetVector::const_iterator offsetEnd = offsets.end();
726 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
728 // write linear offset value
729 uint64_t linearOffset = (*offsetIter);
730 if ( isBigEndian ) SwapEndian_64(linearOffset);
731 fwrite(&linearOffset, 8, 1, indexStream);
735 // flush buffer, close file, and return success
741 // ---------------------------------------------------------------
742 // BamStandardIndex implementation
744 BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
745 : BamIndex(bgzf, reader, isBigEndian)
747 d = new BamStandardIndexPrivate(this);
750 BamStandardIndex::~BamStandardIndex(void) {
755 // creates index data (in-memory) from current reader data
756 bool BamStandardIndex::Build(void) { return d->Build(); }
758 // returns whether reference has alignments or no
759 bool BamStandardIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); }
761 // attempts to use index to jump to region; returns success/fail
762 bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
764 // loads existing data from file into memory
765 bool BamStandardIndex::Load(const string& filename) { return d->Load(filename); }
767 // writes in-memory index data out to file
768 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
769 bool BamStandardIndex::Write(const string& bamFilename) { return d->Write(bamFilename); }
771 // #########################################################################################
772 // #########################################################################################
774 // ---------------------------------------------------
775 // BamToolsIndex structs & typedefs
779 // individual index entry
780 struct BamToolsIndexEntry {
783 int32_t MaxEndPosition;
785 int32_t StartPosition;
788 BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
789 const int64_t& startOffset = 0,
790 const int32_t& startPosition = 0)
791 : MaxEndPosition(maxEndPosition)
792 , StartOffset(startOffset)
793 , StartPosition(startPosition)
797 // the actual index data structure
798 typedef map<int, vector<BamToolsIndexEntry> > BamToolsIndexData;
800 } // namespace BamTools
802 // ---------------------------------------------------
803 // BamToolsIndexPrivate implementation
805 struct BamToolsIndex::BamToolsIndexPrivate {
807 // keep a list of any supported versions here
808 // (might be useful later to handle any 'legacy' versions if the format changes)
809 // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
811 // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
813 // if ( indexVersion >= BTI_1_2 )
818 enum Version { BTI_1_0 = 1
823 // -------------------------
826 BamToolsIndexData m_indexData;
827 BamToolsIndex* m_parent;
831 // -------------------------
834 BamToolsIndexPrivate(BamToolsIndex* parent)
837 , m_version(BTI_1_2) // latest version - used for writing new index files
840 ~BamToolsIndexPrivate(void) { }
842 // -------------------------
845 // creates index data (in-memory) from current reader data
847 // returns supported file extension
848 const string Extension(void) const { return string(".bti"); }
849 // returns whether reference has alignments or no
850 bool HasAlignments(const int& referenceID) const;
851 // attempts to use index to jump to region; returns success/fail
852 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
853 // loads existing data from file into memory
854 bool Load(const std::string& filename);
855 // writes in-memory index data out to file
856 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
857 bool Write(const std::string& bamFilename);
859 // -------------------------
862 // calculates offset for a given region
863 bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
866 // creates index data (in-memory) from current reader data
867 bool BamToolsIndex::BamToolsIndexPrivate::Build(void) {
869 // localize parent data
870 if ( m_parent == 0 ) return false;
871 BamReader* reader = m_parent->m_reader;
872 BgzfData* mBGZF = m_parent->m_BGZF;
874 // be sure reader & BGZF file are valid & open for reading
875 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
878 // move file pointer to beginning of alignments
880 if ( !reader->Rewind() )
883 // initialize index data structure with space for all references
884 const int numReferences = (int)m_parent->m_references.size();
885 for ( int i = 0; i < numReferences; ++i )
886 m_indexData[i].clear();
888 // set up counters and markers
889 int32_t currentBlockCount = 0;
890 int64_t currentAlignmentOffset = mBGZF->Tell();
891 int32_t blockRefId = 0;
892 int32_t blockMaxEndPosition = 0;
893 int64_t blockStartOffset = currentAlignmentOffset;
894 int32_t blockStartPosition = -1;
896 // plow through alignments, storing index entries
898 while ( reader->GetNextAlignmentCore(al) ) {
900 // if block contains data (not the first time through) AND alignment is on a new reference
901 if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
903 // store previous data
904 m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
906 // intialize new block for current alignment's reference
907 currentBlockCount = 0;
908 blockMaxEndPosition = al.GetEndPosition();
909 blockStartOffset = currentAlignmentOffset;
912 // if beginning of block, save first alignment's refID & position
913 if ( currentBlockCount == 0 ) {
914 blockRefId = al.RefID;
915 blockStartPosition = al.Position;
918 // increment block counter
921 // check end position
922 int32_t alignmentEndPosition = al.GetEndPosition();
923 if ( alignmentEndPosition > blockMaxEndPosition )
924 blockMaxEndPosition = alignmentEndPosition;
926 // if block is full, get offset for next block, reset currentBlockCount
927 if ( currentBlockCount == m_blockSize ) {
928 m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
929 blockStartOffset = mBGZF->Tell();
930 currentBlockCount = 0;
933 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
934 // necessary because we won't know if this next alignment is on a new reference until we actually read it
935 currentAlignmentOffset = mBGZF->Tell();
938 // store final block with data
939 m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
941 // attempt to rewind back to begnning of alignments
942 // return success/fail
943 return reader->Rewind();
946 // N.B. - ignores isRightBoundSpecified
947 bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
949 // return false if leftBound refID is not found in index data
950 if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) return false;
952 const vector<BamToolsIndexEntry> referenceOffsets = m_indexData[region.LeftRefID];
953 if ( referenceOffsets.empty() ) return false;
955 // -------------------------------------------------------
956 // calculate nearest index to jump to
959 offset = (*referenceOffsets.begin()).StartOffset;
961 // iterate over offsets entries on this reference
962 vector<BamToolsIndexEntry>::const_iterator indexIter = referenceOffsets.begin();
963 vector<BamToolsIndexEntry>::const_iterator indexEnd = referenceOffsets.end();
964 for ( ; indexIter != indexEnd; ++indexIter ) {
965 const BamToolsIndexEntry& entry = (*indexIter);
966 // break if alignment 'entry' overlaps region
967 if ( entry.MaxEndPosition >= region.LeftPosition ) break;
968 offset = (*indexIter).StartOffset;
971 // set flag based on whether an index entry was found for this region
972 *hasAlignmentsInRegion = ( indexIter != indexEnd );
976 // returns whether reference has alignments or no
977 bool BamToolsIndex::BamToolsIndexPrivate::HasAlignments(const int& referenceID) const {
978 if ( m_indexData.find(referenceID) == m_indexData.end() )
981 return ( !m_indexData.at(referenceID).empty() );
984 // attempts to use index to jump to region; returns success/fail
985 bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
988 *hasAlignmentsInRegion = false;
990 // localize parent data
991 if ( m_parent == 0 ) return false;
992 BamReader* reader = m_parent->m_reader;
993 BgzfData* mBGZF = m_parent->m_BGZF;
994 RefVector& references = m_parent->m_references;
996 // check valid BamReader state
997 if ( reader == 0 || mBGZF == 0 || !reader->IsOpen() ) {
998 fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
1002 // make sure left-bound position is valid
1003 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false;
1005 // calculate nearest offset to jump to
1007 if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
1008 fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
1012 // attempt seek in file, return success/fail
1013 return mBGZF->Seek(offset);
1016 // loads existing data from file into memory
1017 bool BamToolsIndex::BamToolsIndexPrivate::Load(const string& filename) {
1019 // localize parent data
1020 if ( m_parent == 0 ) return false;
1021 const bool isBigEndian = m_parent->m_isBigEndian;
1023 // open index file, abort on error
1024 FILE* indexStream = fopen(filename.c_str(), "rb");
1025 if( !indexStream ) {
1026 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
1030 // set placeholder to receive input byte count (suppresses compiler warnings)
1031 size_t elementsRead = 0;
1033 // see if index is valid BAM index
1035 elementsRead = fread(magic, 1, 4, indexStream);
1036 if ( strncmp(magic, "BTI\1", 4) ) {
1037 fprintf(stderr, "Problem with index file - invalid format.\n");
1038 fclose(indexStream);
1043 int32_t indexVersion;
1044 elementsRead = fread(&indexVersion, sizeof(indexVersion), 1, indexStream);
1045 if ( isBigEndian ) SwapEndian_32(indexVersion);
1046 if ( indexVersion <= 0 || indexVersion > m_version ) {
1047 fprintf(stderr, "Problem with index file - unsupported version.\n");
1048 fclose(indexStream);
1052 if ( (Version)indexVersion == BTI_1_0 ) {
1053 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n");
1054 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
1055 fclose(indexStream);
1059 if ( (Version)indexVersion == BTI_1_1 ) {
1060 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n");
1061 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
1062 fclose(indexStream);
1066 // read in block size
1067 elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, indexStream);
1068 if ( isBigEndian ) SwapEndian_32(m_blockSize);
1070 // read in number of references
1071 int32_t numReferences;
1072 elementsRead = fread(&numReferences, sizeof(numReferences), 1, indexStream);
1073 if ( isBigEndian ) SwapEndian_32(numReferences);
1075 // iterate over reference entries
1076 for ( int i = 0; i < numReferences; ++i ) {
1078 // read in number of offsets for this reference
1079 uint32_t numOffsets;
1080 elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream);
1081 if ( isBigEndian ) SwapEndian_32(numOffsets);
1083 // initialize offsets container for this reference
1084 vector<BamToolsIndexEntry> offsets;
1085 offsets.reserve(numOffsets);
1087 // iterate over offset entries
1088 for ( unsigned int j = 0; j < numOffsets; ++j ) {
1091 int32_t maxEndPosition;
1092 int64_t startOffset;
1093 int32_t startPosition;
1096 elementsRead = fread(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream);
1097 elementsRead = fread(&startOffset, sizeof(startOffset), 1, indexStream);
1098 elementsRead = fread(&startPosition, sizeof(startPosition), 1, indexStream);
1100 // swap endian-ness if necessary
1101 if ( isBigEndian ) {
1102 SwapEndian_32(maxEndPosition);
1103 SwapEndian_64(startOffset);
1104 SwapEndian_32(startPosition);
1107 // save current index entry
1108 offsets.push_back( BamToolsIndexEntry(maxEndPosition, startOffset, startPosition) );
1111 // save refID, offsetVector entry into data structure
1112 m_indexData.insert( make_pair(i, offsets) );
1115 // close index file and return
1116 fclose(indexStream);
1120 // writes in-memory index data out to file
1121 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
1122 bool BamToolsIndex::BamToolsIndexPrivate::Write(const std::string& bamFilename) {
1124 // localize parent data
1125 if ( m_parent == 0 ) return false;
1126 const bool isBigEndian = m_parent->m_isBigEndian;
1128 // open index file for writing
1129 string indexFilename = bamFilename + ".bti";
1130 FILE* indexStream = fopen(indexFilename.c_str(), "wb");
1131 if ( indexStream == 0 ) {
1132 fprintf(stderr, "ERROR: Could not open file to save index.\n");
1136 // write BTI index format 'magic number'
1137 fwrite("BTI\1", 1, 4, indexStream);
1139 // write BTI index format version
1140 int32_t currentVersion = (int32_t)m_version;
1141 if ( isBigEndian ) SwapEndian_32(currentVersion);
1142 fwrite(¤tVersion, sizeof(int32_t), 1, indexStream);
1145 int32_t blockSize = m_blockSize;
1146 if ( isBigEndian ) SwapEndian_32(blockSize);
1147 fwrite(&blockSize, sizeof(blockSize), 1, indexStream);
1149 // write number of references
1150 int32_t numReferences = (int32_t)m_indexData.size();
1151 if ( isBigEndian ) SwapEndian_32(numReferences);
1152 fwrite(&numReferences, sizeof(numReferences), 1, indexStream);
1154 // iterate through references in index
1155 BamToolsIndexData::const_iterator refIter = m_indexData.begin();
1156 BamToolsIndexData::const_iterator refEnd = m_indexData.end();
1157 for ( ; refIter != refEnd; ++refIter ) {
1159 const vector<BamToolsIndexEntry> offsets = (*refIter).second;
1161 // write number of offsets listed for this reference
1162 uint32_t numOffsets = offsets.size();
1163 if ( isBigEndian ) SwapEndian_32(numOffsets);
1164 fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream);
1166 // iterate over offset entries
1167 vector<BamToolsIndexEntry>::const_iterator offsetIter = offsets.begin();
1168 vector<BamToolsIndexEntry>::const_iterator offsetEnd = offsets.end();
1169 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
1171 // get reference index data
1172 const BamToolsIndexEntry& entry = (*offsetIter);
1175 int32_t maxEndPosition = entry.MaxEndPosition;
1176 int64_t startOffset = entry.StartOffset;
1177 int32_t startPosition = entry.StartPosition;
1179 // swap endian-ness if necessary
1180 if ( isBigEndian ) {
1181 SwapEndian_32(maxEndPosition);
1182 SwapEndian_64(startOffset);
1183 SwapEndian_32(startPosition);
1186 // write the reference index entry
1187 fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream);
1188 fwrite(&startOffset, sizeof(startOffset), 1, indexStream);
1189 fwrite(&startPosition, sizeof(startPosition), 1, indexStream);
1193 // flush any remaining output, close file, and return success
1194 fflush(indexStream);
1195 fclose(indexStream);
1199 // ---------------------------------------------------
1200 // BamToolsIndex implementation
1202 BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
1203 : BamIndex(bgzf, reader, isBigEndian)
1205 d = new BamToolsIndexPrivate(this);
1208 BamToolsIndex::~BamToolsIndex(void) {
1213 // creates index data (in-memory) from current reader data
1214 bool BamToolsIndex::Build(void) { return d->Build(); }
1216 // returns whether reference has alignments or no
1217 bool BamToolsIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); }
1219 // attempts to use index to jump to region; returns success/fail
1220 // a "successful" jump indicates no error, but not whether this region has data
1221 // * thus, the method sets a flag to indicate whether there are alignments
1222 // available after the jump position
1223 bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
1225 // loads existing data from file into memory
1226 bool BamToolsIndex::Load(const string& filename) { return d->Load(filename); }
1228 // writes in-memory index data out to file
1229 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
1230 bool BamToolsIndex::Write(const string& bamFilename) { return d->Write(bamFilename); }