1 // ***************************************************************************
2 // BamIndex.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 19 November 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index functionality - both for the default (standardized) BAM
9 // index format (.bai) as well as a BamTools-specific (nonstandard) index
11 // ***************************************************************************
13 #include <api/BamIndex.h>
14 #include <api/BamReader.h>
16 using namespace BamTools;
25 // -------------------------------
26 // BamIndex implementation
29 BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader)
32 , m_cacheMode(BamIndex::LimitedIndexCaching)
35 if ( m_reader && m_reader->IsOpen() )
36 m_references = m_reader->GetReferenceData();
40 BamIndex::~BamIndex(void) {
42 fclose(m_indexStream);
45 // return true if FILE* is open
46 bool BamIndex::IsOpen(void) const {
47 return ( m_indexStream != 0 );
50 // loads existing data from file into memory
51 bool BamIndex::Load(const string& filename) {
53 // open index file, abort on error
54 if ( !OpenIndexFile(filename, "rb") ) {
55 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
60 if ( !LoadHeader() ) {
61 fclose(m_indexStream);
65 // load reference data (but only keep in memory if full caching requested)
66 bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching );
67 if ( !LoadAllReferences(saveInitialLoad) ) {
68 fclose(m_indexStream);
72 // update index cache based on selected mode
79 // opens index file for reading/writing, return true if opened OK
80 bool BamIndex::OpenIndexFile(const string& filename, const string& mode) {
81 m_indexStream = fopen(filename.c_str(), mode.c_str());
82 return ( m_indexStream != 0 );
85 // rewind index file to beginning of index data, return true if rewound OK
86 bool BamIndex::Rewind(void) {
87 return ( fseek64(m_indexStream, DataBeginOffset(), SEEK_SET) == 0 );
90 // change the index caching behavior
91 void BamIndex::SetCacheMode(const BamIndexCacheMode mode) {
92 if ( mode != m_cacheMode ) {
98 // updates in-memory cache of index data, depending on current cache mode
99 void BamIndex::UpdateCache(void) {
101 // skip if file not open
102 if ( !IsOpen() ) return;
104 // reflect requested cache mode behavior
105 switch ( m_cacheMode ) {
107 case (BamIndex::FullIndexCaching) :
109 LoadAllReferences(true);
112 case (BamIndex::LimitedIndexCaching) :
113 if ( HasFullDataCache() )
114 KeepOnlyFirstReferenceOffsets();
117 SkipToFirstReference();
118 LoadFirstReference(true);
121 case(BamIndex::NoIndexCaching) :
130 // writes in-memory index data out to file
131 bool BamIndex::Write(const string& bamFilename) {
133 // open index file for writing
134 string indexFilename = bamFilename + Extension();
135 if ( !OpenIndexFile(indexFilename, "wb") ) {
136 fprintf(stderr, "ERROR: Could not open file to save index.\n");
140 // write index header data
141 if ( !WriteHeader() ) {
142 fprintf(stderr, "ERROR: There was a problem writing index metadata to new index file.\n");
143 fflush(m_indexStream);
144 fclose(m_indexStream);
148 // write main index data
149 if ( !WriteAllReferences() ) {
150 fprintf(stderr, "ERROR: There was a problem writing index data to new index file.\n");
151 fflush(m_indexStream);
152 fclose(m_indexStream);
156 // flush any remaining output, rewind file, and return success
157 fflush(m_indexStream);
158 fclose(m_indexStream);
160 // re-open index file for later reading
161 if ( !OpenIndexFile(indexFilename, "rb") ) {
162 fprintf(stderr, "ERROR: Could not open newly created index file for reading.\n");
166 // return success/failure of write
170 // #########################################################################################
171 // #########################################################################################
173 // -------------------------------
174 // BamStandardIndex structs & typedefs
178 // BAM index constants
179 const int MAX_BIN = 37450; // =(8^6-1)/7+1
180 const int BAM_LIDX_SHIFT = 14;
182 // --------------------------------------------------
183 // BamStandardIndex data structures & typedefs
191 Chunk(const uint64_t& start = 0,
192 const uint64_t& stop = 0)
198 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
199 return lhs.Start < rhs.Start;
202 typedef vector<Chunk> ChunkVector;
203 typedef map<uint32_t, ChunkVector> BamBinMap;
204 typedef vector<uint64_t> LinearOffsetVector;
206 struct ReferenceIndex {
210 LinearOffsetVector Offsets;
214 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
215 const LinearOffsetVector& offsets = LinearOffsetVector(),
216 const bool hasAlignments = false)
219 , HasAlignments(hasAlignments)
223 typedef map<int32_t, ReferenceIndex> BamStandardIndexData;
225 } // namespace BamTools
227 // -------------------------------
228 // BamStandardIndexPrivate implementation
230 struct BamStandardIndex::BamStandardIndexPrivate {
233 BamStandardIndex* m_parent;
236 BamStandardIndexData m_indexData;
237 off_t m_dataBeginOffset;
238 bool m_hasFullDataCache;
242 BamStandardIndexPrivate(BamStandardIndex* parent);
243 ~BamStandardIndexPrivate(void);
245 // parent interface methods
248 // creates index data (in-memory) from current reader data
250 // clear all current index offset data in memory
251 void ClearAllData(void);
252 // return file position after header metadata
253 const off_t DataBeginOffset(void) const;
254 // returns whether reference has alignments or no
255 bool HasAlignments(const int& referenceID) const;
256 // return true if all index data is cached
257 bool HasFullDataCache(void) const;
258 // attempts to use index to jump to region; returns success/fail
259 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
260 // clears index data from all references except the first reference
261 void KeepOnlyFirstReferenceOffsets(void);
262 // load index data for all references, return true if loaded OK
263 // @saveData - save data in memory if true, just read & discard if false
264 bool LoadAllReferences(bool saveData = true);
265 // load first reference from file, return true if loaded OK
266 // @saveData - save data in memory if true, just read & discard if false
267 bool LoadFirstReference(bool saveData = true);
268 // load header data from index file, return true if loaded OK
269 bool LoadHeader(void);
270 // position file pointer to first reference begin, return true if skipped OK
271 bool SkipToFirstReference(void);
272 // write header to new index file
273 bool WriteHeader(void);
274 // write index data for all references to new index file
275 bool WriteAllReferences(void);
280 // -----------------------
281 // index file operations
283 // check index file magic number, return true if OK
284 bool CheckMagicNumber(void);
285 // check index file version, return true if OK
286 bool CheckVersion(void);
287 // load a single index bin entry from file, return true if loaded OK
288 // @saveData - save data in memory if true, just read & discard if false
289 bool LoadBin(ReferenceIndex& refEntry, bool saveData = true);
290 bool LoadBins(ReferenceIndex& refEntry, bool saveData = true);
291 // load a single index bin entry from file, return true if loaded OK
292 // @saveData - save data in memory if true, just read & discard if false
293 bool LoadChunk(ChunkVector& chunks, bool saveData = true);
294 bool LoadChunks(ChunkVector& chunks, bool saveData = true);
295 // load a single index linear offset entry from file, return true if loaded OK
296 // @saveData - save data in memory if true, just read & discard if false
297 bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true);
298 // load a single reference from file, return true if loaded OK
299 // @saveData - save data in memory if true, just read & discard if false
300 bool LoadReference(const int& refId, bool saveData = true);
301 // loads number of references, return true if loaded OK
302 bool LoadReferenceCount(int& numReferences);
303 // position file pointer to desired reference begin, return true if skipped OK
304 bool SkipToReference(const int& refId);
305 // write index data for bin to new index file
306 bool WriteBin(const uint32_t& binId, const ChunkVector& chunks);
307 // write index data for bins to new index file
308 bool WriteBins(const BamBinMap& bins);
309 // write index data for chunk entry to new index file
310 bool WriteChunk(const Chunk& chunk);
311 // write index data for chunk entry to new index file
312 bool WriteChunks(const ChunkVector& chunks);
313 // write index data for linear offsets entry to new index file
314 bool WriteLinearOffsets(const LinearOffsetVector& offsets);
315 // write index data single reference to new index file
316 bool WriteReference(const ReferenceIndex& refEntry);
318 // -----------------------
319 // index data operations
321 // calculate bins that overlap region
322 int BinsFromRegion(const BamRegion& region,
323 const bool isRightBoundSpecified,
324 uint16_t bins[BamTools::MAX_BIN]);
325 // clear all index offset data for desired reference
326 void ClearReferenceOffsets(const int& refId);
327 // calculates offset(s) for a given region
328 bool GetOffsets(const BamRegion& region,
329 const bool isRightBoundSpecified,
330 vector<int64_t>& offsets,
331 bool* hasAlignmentsInRegion);
332 // returns true if index cache has data for desired reference
333 bool IsDataLoaded(const int& refId) const;
334 // clears index data from all references except the one specified
335 void KeepOnlyReferenceOffsets(const int& refId);
336 // simplifies index by merging 'chunks'
337 void MergeChunks(void);
338 // saves BAM bin entry for index
339 void SaveBinEntry(BamBinMap& binMap,
340 const uint32_t& saveBin,
341 const uint64_t& saveOffset,
342 const uint64_t& lastOffset);
343 // saves linear offset entry for index
344 void SaveLinearOffset(LinearOffsetVector& offsets,
345 const BamAlignment& bAlignment,
346 const uint64_t& lastOffset);
347 // initializes index data structure to hold @count references
348 void SetReferenceCount(const int& count);
352 BamStandardIndex::BamStandardIndexPrivate::BamStandardIndexPrivate(BamStandardIndex* parent)
354 , m_dataBeginOffset(0)
355 , m_hasFullDataCache(false)
357 m_isBigEndian = BamTools::SystemIsBigEndian();
360 BamStandardIndex::BamStandardIndexPrivate::~BamStandardIndexPrivate(void) {
364 // calculate bins that overlap region
365 int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region,
366 const bool isRightBoundSpecified,
367 uint16_t bins[MAX_BIN])
369 // get region boundaries
370 uint32_t begin = (unsigned int)region.LeftPosition;
373 // if right bound specified AND left&right bounds are on same reference
374 // OK to use right bound position
375 if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
376 end = (unsigned int)region.RightPosition;
378 // otherwise, use end of left bound reference as cutoff
380 end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
382 // initialize list, bin '0' always a valid bin
386 // get rest of bins that contain this region
388 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; }
389 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; }
390 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; }
391 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; }
392 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
394 // return number of bins stored
398 // creates index data (in-memory) from current reader data
399 bool BamStandardIndex::BamStandardIndexPrivate::Build(void) {
401 // localize parent data
402 if ( m_parent == 0 ) return false;
403 BamReader* reader = m_parent->m_reader;
404 BgzfData* mBGZF = m_parent->m_BGZF;
406 // be sure reader & BGZF file are valid & open for reading
407 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
410 // move file pointer to beginning of alignments
413 // get reference count, reserve index space
414 const int numReferences = (int)m_parent->m_references.size();
416 m_hasFullDataCache = false;
417 SetReferenceCount(numReferences);
419 // sets default constant for bin, ID, offset, coordinate variables
420 const uint32_t defaultValue = 0xffffffffu;
423 uint32_t saveBin(defaultValue);
424 uint32_t lastBin(defaultValue);
427 int32_t saveRefID(defaultValue);
428 int32_t lastRefID(defaultValue);
431 uint64_t saveOffset = mBGZF->Tell();
432 uint64_t lastOffset = saveOffset;
435 int32_t lastCoordinate = defaultValue;
437 BamAlignment bAlignment;
438 while ( reader->GetNextAlignmentCore(bAlignment) ) {
440 // change of chromosome, save ID, reset bin
441 if ( lastRefID != bAlignment.RefID ) {
442 lastRefID = bAlignment.RefID;
443 lastBin = defaultValue;
446 // if lastCoordinate greater than BAM position - file not sorted properly
447 else if ( lastCoordinate > bAlignment.Position ) {
448 fprintf(stderr, "BAM file not properly sorted:\n");
449 fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(),
450 lastCoordinate, bAlignment.Position, bAlignment.RefID);
454 // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
455 if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
457 // save linear offset entry (matched to BAM entry refID)
458 BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID);
459 if ( indexIter == m_indexData.end() ) return false; // error
460 ReferenceIndex& refIndex = (*indexIter).second;
461 LinearOffsetVector& offsets = refIndex.Offsets;
462 SaveLinearOffset(offsets, bAlignment, lastOffset);
465 // if current BamAlignment bin != lastBin, "then possibly write the binning index"
466 if ( bAlignment.Bin != lastBin ) {
468 // if not first time through
469 if ( saveBin != defaultValue ) {
471 // save Bam bin entry
472 BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID);
473 if ( indexIter == m_indexData.end() ) return false; // error
474 ReferenceIndex& refIndex = (*indexIter).second;
475 BamBinMap& binMap = refIndex.Bins;
476 SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
480 saveOffset = lastOffset;
483 saveBin = bAlignment.Bin;
484 lastBin = bAlignment.Bin;
487 saveRefID = bAlignment.RefID;
489 // if invalid RefID, break out
490 if ( saveRefID < 0 ) break;
493 // make sure that current file pointer is beyond lastOffset
494 if ( mBGZF->Tell() <= (int64_t)lastOffset ) {
495 fprintf(stderr, "Error in BGZF offsets.\n");
500 lastOffset = mBGZF->Tell();
502 // update lastCoordinate
503 lastCoordinate = bAlignment.Position;
506 // save any leftover BAM data (as long as refID is valid)
507 if ( saveRefID >= 0 ) {
508 // save Bam bin entry
509 BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID);
510 if ( indexIter == m_indexData.end() ) return false; // error
511 ReferenceIndex& refIndex = (*indexIter).second;
512 BamBinMap& binMap = refIndex.Bins;
513 SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
516 // simplify index by merging chunks
519 // iterate through references in index
520 // sort offsets in linear offset vector
521 BamStandardIndexData::iterator indexIter = m_indexData.begin();
522 BamStandardIndexData::iterator indexEnd = m_indexData.end();
523 for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
525 // get reference index data
526 ReferenceIndex& refIndex = (*indexIter).second;
527 LinearOffsetVector& offsets = refIndex.Offsets;
529 // sort linear offsets
530 sort(offsets.begin(), offsets.end());
533 // rewind file pointer to beginning of alignments, return success/fail
534 return reader->Rewind();
537 // check index file magic number, return true if OK
538 bool BamStandardIndex::BamStandardIndexPrivate::CheckMagicNumber(void) {
540 // read in magic number
542 size_t elementsRead = fread(magic, sizeof(char), 4, m_parent->m_indexStream);
544 // compare to expected value
545 if ( strncmp(magic, "BAI\1", 4) != 0 ) {
546 fprintf(stderr, "Problem with index file - invalid format.\n");
547 fclose(m_parent->m_indexStream);
551 // return success/failure of load
552 return (elementsRead == 4);
555 // clear all current index offset data in memory
556 void BamStandardIndex::BamStandardIndexPrivate::ClearAllData(void) {
557 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
558 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
559 for ( ; indexIter != indexEnd; ++indexIter ) {
560 const int& refId = (*indexIter).first;
561 ClearReferenceOffsets(refId);
565 // clear all index offset data for desired reference
566 void BamStandardIndex::BamStandardIndexPrivate::ClearReferenceOffsets(const int& refId) {
568 // look up refId, skip if not found
569 BamStandardIndexData::iterator indexIter = m_indexData.find(refId);
570 if ( indexIter == m_indexData.end() ) return ;
572 // clear reference data
573 ReferenceIndex& refEntry = (*indexIter).second;
574 refEntry.Bins.clear();
575 refEntry.Offsets.clear();
578 m_hasFullDataCache = false;
581 // return file position after header metadata
582 const off_t BamStandardIndex::BamStandardIndexPrivate::DataBeginOffset(void) const {
583 return m_dataBeginOffset;
586 // calculates offset(s) for a given region
587 bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region,
588 const bool isRightBoundSpecified,
589 vector<int64_t>& offsets,
590 bool* hasAlignmentsInRegion)
592 // return false if leftBound refID is not found in index data
593 if ( m_indexData.find(region.LeftRefID) == m_indexData.end() )
596 // load index data for region if not already cached
597 if ( !IsDataLoaded(region.LeftRefID) ) {
598 bool loadedOk = true;
599 loadedOk &= SkipToReference(region.LeftRefID);
600 loadedOk &= LoadReference(region.LeftRefID);
601 if ( !loadedOk ) return false;
604 // calculate which bins overlap this region
605 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
606 int numBins = BinsFromRegion(region, isRightBoundSpecified, bins);
608 // get bins for this reference
609 BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID);
610 if ( indexIter == m_indexData.end() ) return false; // error
611 const ReferenceIndex& refIndex = (*indexIter).second;
612 const BamBinMap& binMap = refIndex.Bins;
614 // get minimum offset to consider
615 const LinearOffsetVector& linearOffsets = refIndex.Offsets;
616 const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() )
617 ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
619 // store all alignment 'chunk' starts (file offsets) for bins in this region
620 for ( int i = 0; i < numBins; ++i ) {
622 const uint16_t binKey = bins[i];
623 map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
624 if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
626 // iterate over chunks
627 const ChunkVector& chunks = (*binIter).second;
628 std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
629 std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
630 for ( ; chunksIter != chunksEnd; ++chunksIter) {
632 // if valid chunk found, store its file offset
633 const Chunk& chunk = (*chunksIter);
634 if ( chunk.Stop > minOffset )
635 offsets.push_back( chunk.Start );
643 // sort the offsets before returning
644 sort(offsets.begin(), offsets.end());
646 // set flag & return success
647 *hasAlignmentsInRegion = (offsets.size() != 0 );
649 // if cache mode set to none, dump the data we just loaded
650 if (m_parent->m_cacheMode == BamIndex::NoIndexCaching )
651 ClearReferenceOffsets(region.LeftRefID);
657 // returns whether reference has alignments or no
658 bool BamStandardIndex::BamStandardIndexPrivate::HasAlignments(const int& refId) const {
659 BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId);
660 if ( indexIter == m_indexData.end() ) return false; // error
661 const ReferenceIndex& refEntry = (*indexIter).second;
662 return refEntry.HasAlignments;
665 // return true if all index data is cached
666 bool BamStandardIndex::BamStandardIndexPrivate::HasFullDataCache(void) const {
667 return m_hasFullDataCache;
670 // returns true if index cache has data for desired reference
671 bool BamStandardIndex::BamStandardIndexPrivate::IsDataLoaded(const int& refId) const {
673 // look up refId, return false if not found
674 BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId);
675 if ( indexIter == m_indexData.end() ) return false;
677 // see if reference has alignments
678 // if not, it's not a problem to have no offset data
679 const ReferenceIndex& refEntry = (*indexIter).second;
680 if ( !refEntry.HasAlignments ) return true;
682 // return whether bin map contains data
683 return ( !refEntry.Bins.empty() );
686 // attempts to use index to jump to region; returns success/fail
687 bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
689 // localize parent data
690 if ( m_parent == 0 ) return false;
691 BamReader* reader = m_parent->m_reader;
692 BgzfData* mBGZF = m_parent->m_BGZF;
693 RefVector& references = m_parent->m_references;
695 // be sure reader & BGZF file are valid & open for reading
696 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
699 // make sure left-bound position is valid
700 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
703 // calculate offsets for this region
704 // if failed, print message, set flag, and return failure
705 vector<int64_t> offsets;
706 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) {
707 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n");
708 *hasAlignmentsInRegion = false;
712 // iterate through offsets
713 BamAlignment bAlignment;
715 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
717 // attempt seek & load first available alignment
718 // set flag to true if data exists
719 result &= mBGZF->Seek(*o);
720 *hasAlignmentsInRegion = reader->GetNextAlignmentCore(bAlignment);
722 // if this alignment corresponds to desired position
723 // return success of seeking back to the offset before the 'current offset' (to cover overlaps)
724 if ( ((bAlignment.RefID == region.LeftRefID) &&
725 ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) ||
726 (bAlignment.RefID > region.LeftRefID) )
728 if ( o != offsets.begin() ) --o;
729 return mBGZF->Seek(*o);
733 // if error in jumping, print message & set flag
735 fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n");
736 *hasAlignmentsInRegion = false;
739 // return success/failure
743 // clears index data from all references except the first
744 void BamStandardIndex::BamStandardIndexPrivate::KeepOnlyFirstReferenceOffsets(void) {
745 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
746 KeepOnlyReferenceOffsets((*indexBegin).first);
749 // clears index data from all references except the one specified
750 void BamStandardIndex::BamStandardIndexPrivate::KeepOnlyReferenceOffsets(const int& refId) {
751 BamStandardIndexData::iterator mapIter = m_indexData.begin();
752 BamStandardIndexData::iterator mapEnd = m_indexData.end();
753 for ( ; mapIter != mapEnd; ++mapIter ) {
754 const int entryRefId = (*mapIter).first;
755 if ( entryRefId != refId )
756 ClearReferenceOffsets(entryRefId);
760 bool BamStandardIndex::BamStandardIndexPrivate::LoadAllReferences(bool saveData) {
762 // skip if data already loaded
763 if ( m_hasFullDataCache ) return true;
765 // get number of reference sequences
766 uint32_t numReferences;
767 if ( !LoadReferenceCount((int&)numReferences) )
770 // iterate over reference entries
771 bool loadedOk = true;
772 for ( int i = 0; i < (int)numReferences; ++i )
773 loadedOk &= LoadReference(i, saveData);
776 if ( loadedOk && saveData )
777 m_hasFullDataCache = true;
779 // return success/failure of loading references
783 // load header data from index file, return true if loaded OK
784 bool BamStandardIndex::BamStandardIndexPrivate::LoadHeader(void) {
786 bool loadedOk = CheckMagicNumber();
788 // store offset of beginning of data
789 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
791 // return success/failure of load
795 // load a single index bin entry from file, return true if loaded OK
796 // @saveData - save data in memory if true, just read & discard if false
797 bool BamStandardIndex::BamStandardIndexPrivate::LoadBin(ReferenceIndex& refEntry, bool saveData) {
799 size_t elementsRead = 0;
803 elementsRead += fread(&binId, sizeof(binId), 1, m_parent->m_indexStream);
804 if ( m_isBigEndian ) SwapEndian_32(binId);
806 // load alignment chunks for this bin
808 bool chunksOk = LoadChunks(chunks, saveData);
811 if ( chunksOk && saveData )
812 refEntry.Bins.insert(pair<uint32_t, ChunkVector>(binId, chunks));
814 // return success/failure of load
815 return ( (elementsRead == 1) && chunksOk );
818 bool BamStandardIndex::BamStandardIndexPrivate::LoadBins(ReferenceIndex& refEntry, bool saveData) {
820 size_t elementsRead = 0;
822 // get number of bins
824 elementsRead += fread(&numBins, sizeof(numBins), 1, m_parent->m_indexStream);
825 if ( m_isBigEndian ) SwapEndian_32(numBins);
828 refEntry.HasAlignments = ( numBins != 0 );
832 for ( int i = 0; i < numBins; ++i )
833 binsOk &= LoadBin(refEntry, saveData);
835 // return success/failure of load
836 return ( (elementsRead == 1) && binsOk );
839 // load a single index bin entry from file, return true if loaded OK
840 // @saveData - save data in memory if true, just read & discard if false
841 bool BamStandardIndex::BamStandardIndexPrivate::LoadChunk(ChunkVector& chunks, bool saveData) {
843 size_t elementsRead = 0;
845 // read in chunk data
848 elementsRead += fread(&start, sizeof(start), 1, m_parent->m_indexStream);
849 elementsRead += fread(&stop, sizeof(stop), 1, m_parent->m_indexStream);
851 // swap endian-ness if necessary
852 if ( m_isBigEndian ) {
853 SwapEndian_64(start);
857 // save data if requested
858 if ( saveData ) chunks.push_back( Chunk(start, stop) );
860 // return success/failure of load
861 return ( elementsRead == 2 );
864 bool BamStandardIndex::BamStandardIndexPrivate::LoadChunks(ChunkVector& chunks, bool saveData) {
866 size_t elementsRead = 0;
868 // read in number of chunks
870 elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_parent->m_indexStream);
871 if ( m_isBigEndian ) SwapEndian_32(numChunks);
873 // initialize space for chunks if we're storing this data
874 if ( saveData ) chunks.reserve(numChunks);
876 // iterate over chunks
877 bool chunksOk = true;
878 for ( int i = 0; i < (int)numChunks; ++i )
879 chunksOk &= LoadChunk(chunks, saveData);
882 sort( chunks.begin(), chunks.end(), ChunkLessThan );
884 // return success/failure of load
885 return ( (elementsRead == 1) && chunksOk );
888 // load a single index linear offset entry from file, return true if loaded OK
889 // @saveData - save data in memory if true, just read & discard if false
890 bool BamStandardIndex::BamStandardIndexPrivate::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) {
892 size_t elementsRead = 0;
894 // read in number of linear offsets
895 int32_t numLinearOffsets;
896 elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_parent->m_indexStream);
897 if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets);
899 // set up destination vector (if we're saving the data)
900 LinearOffsetVector linearOffsets;
901 if ( saveData ) linearOffsets.reserve(numLinearOffsets);
903 // iterate over linear offsets
904 uint64_t linearOffset;
905 for ( int i = 0; i < numLinearOffsets; ++i ) {
906 elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_parent->m_indexStream);
907 if ( m_isBigEndian ) SwapEndian_64(linearOffset);
908 if ( saveData ) linearOffsets.push_back(linearOffset);
911 // sort linear offsets
912 sort ( linearOffsets.begin(), linearOffsets.end() );
914 // save in reference index entry if desired
915 if ( saveData ) refEntry.Offsets = linearOffsets;
917 // return success/failure of load
918 return ( elementsRead == (size_t)(numLinearOffsets + 1) );
921 bool BamStandardIndex::BamStandardIndexPrivate::LoadFirstReference(bool saveData) {
922 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
923 return LoadReference((*indexBegin).first, saveData);
926 // load a single reference from file, return true if loaded OK
927 // @saveData - save data in memory if true, just read & discard if false
928 bool BamStandardIndex::BamStandardIndexPrivate::LoadReference(const int& refId, bool saveData) {
931 BamStandardIndexData::iterator indexIter = m_indexData.find(refId);
933 // if reference not previously loaded, create new entry
934 if ( indexIter == m_indexData.end() ) {
935 ReferenceIndex newEntry;
936 newEntry.HasAlignments = false;
937 m_indexData.insert( pair<int32_t, ReferenceIndex>(refId, newEntry) );
940 // load reference data
941 indexIter = m_indexData.find(refId);
942 ReferenceIndex& entry = (*indexIter).second;
943 bool loadedOk = true;
944 loadedOk &= LoadBins(entry, saveData);
945 loadedOk &= LoadLinearOffsets(entry, saveData);
949 // loads number of references, return true if loaded OK
950 bool BamStandardIndex::BamStandardIndexPrivate::LoadReferenceCount(int& numReferences) {
952 size_t elementsRead = 0;
954 // read reference count
955 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
956 if ( m_isBigEndian ) SwapEndian_32(numReferences);
958 // return success/failure of load
959 return ( elementsRead == 1 );
962 // merges 'alignment chunks' in BAM bin (used for index building)
963 void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) {
965 // iterate over reference enties
966 BamStandardIndexData::iterator indexIter = m_indexData.begin();
967 BamStandardIndexData::iterator indexEnd = m_indexData.end();
968 for ( ; indexIter != indexEnd; ++indexIter ) {
970 // get BAM bin map for this reference
971 ReferenceIndex& refIndex = (*indexIter).second;
972 BamBinMap& bamBinMap = refIndex.Bins;
974 // iterate over BAM bins
975 BamBinMap::iterator binIter = bamBinMap.begin();
976 BamBinMap::iterator binEnd = bamBinMap.end();
977 for ( ; binIter != binEnd; ++binIter ) {
979 // get chunk vector for this bin
980 ChunkVector& binChunks = (*binIter).second;
981 if ( binChunks.size() == 0 ) continue;
983 ChunkVector mergedChunks;
984 mergedChunks.push_back( binChunks[0] );
986 // iterate over chunks
988 ChunkVector::iterator chunkIter = binChunks.begin();
989 ChunkVector::iterator chunkEnd = binChunks.end();
990 for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
992 // get 'currentChunk' based on numeric index
993 Chunk& currentChunk = mergedChunks[i];
995 // get iteratorChunk based on vector iterator
996 Chunk& iteratorChunk = (*chunkIter);
998 // if chunk ends where (iterator) chunk starts, then merge
999 if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 )
1000 currentChunk.Stop = iteratorChunk.Stop;
1004 // set currentChunk + 1 to iteratorChunk
1005 mergedChunks.push_back(iteratorChunk);
1010 // saved merged chunk vector
1011 (*binIter).second = mergedChunks;
1016 // saves BAM bin entry for index
1017 void BamStandardIndex::BamStandardIndexPrivate::SaveBinEntry(BamBinMap& binMap,
1018 const uint32_t& saveBin,
1019 const uint64_t& saveOffset,
1020 const uint64_t& lastOffset)
1023 BamBinMap::iterator binIter = binMap.find(saveBin);
1026 Chunk newChunk(saveOffset, lastOffset);
1028 // if entry doesn't exist
1029 if ( binIter == binMap.end() ) {
1030 ChunkVector newChunks;
1031 newChunks.push_back(newChunk);
1032 binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
1037 ChunkVector& binChunks = (*binIter).second;
1038 binChunks.push_back( newChunk );
1042 // saves linear offset entry for index
1043 void BamStandardIndex::BamStandardIndexPrivate::SaveLinearOffset(LinearOffsetVector& offsets,
1044 const BamAlignment& bAlignment,
1045 const uint64_t& lastOffset)
1047 // get converted offsets
1048 int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
1049 int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
1051 // resize vector if necessary
1052 int oldSize = offsets.size();
1053 int newSize = endOffset + 1;
1054 if ( oldSize < newSize )
1055 offsets.resize(newSize, 0);
1058 for( int i = beginOffset + 1; i <= endOffset; ++i ) {
1059 if ( offsets[i] == 0 )
1060 offsets[i] = lastOffset;
1064 // initializes index data structure to hold @count references
1065 void BamStandardIndex::BamStandardIndexPrivate::SetReferenceCount(const int& count) {
1066 for ( int i = 0; i < count; ++i )
1067 m_indexData[i].HasAlignments = false;
1070 bool BamStandardIndex::BamStandardIndexPrivate::SkipToFirstReference(void) {
1071 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
1072 return SkipToReference( (*indexBegin).first );
1075 // position file pointer to desired reference begin, return true if skipped OK
1076 bool BamStandardIndex::BamStandardIndexPrivate::SkipToReference(const int& refId) {
1079 if ( !m_parent->Rewind() ) return false;
1081 // read in number of references
1082 uint32_t numReferences;
1083 size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1084 if ( elementsRead != 1 ) return false;
1085 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1087 // iterate over reference entries
1088 bool skippedOk = true;
1089 int currentRefId = 0;
1090 while (currentRefId != refId) {
1091 skippedOk &= LoadReference(currentRefId, false);
1099 // write header to new index file
1100 bool BamStandardIndex::BamStandardIndexPrivate::WriteHeader(void) {
1102 size_t elementsWritten = 0;
1104 // write magic number
1105 elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_parent->m_indexStream);
1107 // store offset of beginning of data
1108 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
1110 // return success/failure of write
1111 return (elementsWritten == 4);
1114 // write index data for all references to new index file
1115 bool BamStandardIndex::BamStandardIndexPrivate::WriteAllReferences(void) {
1117 size_t elementsWritten = 0;
1119 // write number of reference sequences
1120 int32_t numReferenceSeqs = m_indexData.size();
1121 if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs);
1122 elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_parent->m_indexStream);
1124 // iterate over reference sequences
1126 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
1127 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
1128 for ( ; indexIter != indexEnd; ++ indexIter )
1129 refsOk &= WriteReference( (*indexIter).second );
1131 // return success/failure of write
1132 return ( (elementsWritten == 1) && refsOk );
1135 // write index data for bin to new index file
1136 bool BamStandardIndex::BamStandardIndexPrivate::WriteBin(const uint32_t& binId, const ChunkVector& chunks) {
1138 size_t elementsWritten = 0;
1141 uint32_t binKey = binId;
1142 if ( m_isBigEndian ) SwapEndian_32(binKey);
1143 elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_parent->m_indexStream);
1146 bool chunksOk = WriteChunks(chunks);
1148 // return success/failure of write
1149 return ( (elementsWritten == 1) && chunksOk );
1152 // write index data for bins to new index file
1153 bool BamStandardIndex::BamStandardIndexPrivate::WriteBins(const BamBinMap& bins) {
1155 size_t elementsWritten = 0;
1157 // write number of bins
1158 int32_t binCount = bins.size();
1159 if ( m_isBigEndian ) SwapEndian_32(binCount);
1160 elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_parent->m_indexStream);
1162 // iterate over bins
1164 BamBinMap::const_iterator binIter = bins.begin();
1165 BamBinMap::const_iterator binEnd = bins.end();
1166 for ( ; binIter != binEnd; ++binIter )
1167 binsOk &= WriteBin( (*binIter).first, (*binIter).second );
1169 // return success/failure of write
1170 return ( (elementsWritten == 1) && binsOk );
1173 // write index data for chunk entry to new index file
1174 bool BamStandardIndex::BamStandardIndexPrivate::WriteChunk(const Chunk& chunk) {
1176 size_t elementsWritten = 0;
1178 // localize alignment chunk offsets
1179 uint64_t start = chunk.Start;
1180 uint64_t stop = chunk.Stop;
1182 // swap endian-ness if necessary
1183 if ( m_isBigEndian ) {
1184 SwapEndian_64(start);
1185 SwapEndian_64(stop);
1188 // write to index file
1189 elementsWritten += fwrite(&start, sizeof(start), 1, m_parent->m_indexStream);
1190 elementsWritten += fwrite(&stop, sizeof(stop), 1, m_parent->m_indexStream);
1192 // return success/failure of write
1193 return ( elementsWritten == 2 );
1196 // write index data for chunk entry to new index file
1197 bool BamStandardIndex::BamStandardIndexPrivate::WriteChunks(const ChunkVector& chunks) {
1199 size_t elementsWritten = 0;
1202 int32_t chunkCount = chunks.size();
1203 if ( m_isBigEndian ) SwapEndian_32(chunkCount);
1204 elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_parent->m_indexStream);
1206 // iterate over chunks
1207 bool chunksOk = true;
1208 ChunkVector::const_iterator chunkIter = chunks.begin();
1209 ChunkVector::const_iterator chunkEnd = chunks.end();
1210 for ( ; chunkIter != chunkEnd; ++chunkIter )
1211 chunksOk &= WriteChunk( (*chunkIter) );
1213 // return success/failure of write
1214 return ( (elementsWritten == 1) && chunksOk );
1217 // write index data for linear offsets entry to new index file
1218 bool BamStandardIndex::BamStandardIndexPrivate::WriteLinearOffsets(const LinearOffsetVector& offsets) {
1220 size_t elementsWritten = 0;
1222 // write number of linear offsets
1223 int32_t offsetCount = offsets.size();
1224 if ( m_isBigEndian ) SwapEndian_32(offsetCount);
1225 elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_parent->m_indexStream);
1227 // iterate over linear offsets
1228 LinearOffsetVector::const_iterator offsetIter = offsets.begin();
1229 LinearOffsetVector::const_iterator offsetEnd = offsets.end();
1230 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
1232 // write linear offset
1233 uint64_t linearOffset = (*offsetIter);
1234 if ( m_isBigEndian ) SwapEndian_64(linearOffset);
1235 elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_parent->m_indexStream);
1238 // return success/failure of write
1239 return ( elementsWritten == (size_t)(offsetCount + 1) );
1242 // write index data for a single reference to new index file
1243 bool BamStandardIndex::BamStandardIndexPrivate::WriteReference(const ReferenceIndex& refEntry) {
1245 refOk &= WriteBins(refEntry.Bins);
1246 refOk &= WriteLinearOffsets(refEntry.Offsets);
1250 // ---------------------------------------------------------------
1251 // BamStandardIndex implementation
1253 BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader)
1254 : BamIndex(bgzf, reader)
1256 d = new BamStandardIndexPrivate(this);
1259 BamStandardIndex::~BamStandardIndex(void) {
1264 // BamIndex interface implementation
1265 bool BamStandardIndex::Build(void) { return d->Build(); }
1266 void BamStandardIndex::ClearAllData(void) { d->ClearAllData(); }
1267 const off_t BamStandardIndex::DataBeginOffset(void) const { return d->DataBeginOffset(); }
1268 bool BamStandardIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); }
1269 bool BamStandardIndex::HasFullDataCache(void) const { return d->HasFullDataCache(); }
1270 bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
1271 void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) { d->KeepOnlyFirstReferenceOffsets(); }
1272 bool BamStandardIndex::LoadAllReferences(bool saveData) { return d->LoadAllReferences(saveData); }
1273 bool BamStandardIndex::LoadFirstReference(bool saveData) { return d->LoadFirstReference(saveData); }
1274 bool BamStandardIndex::LoadHeader(void) { return d->LoadHeader(); }
1275 bool BamStandardIndex::SkipToFirstReference(void) { return d->SkipToFirstReference(); }
1276 bool BamStandardIndex::WriteAllReferences(void) { return d->WriteAllReferences(); }
1277 bool BamStandardIndex::WriteHeader(void) { return d->WriteHeader(); }
1279 // #########################################################################################
1280 // #########################################################################################
1282 // ---------------------------------------------------
1283 // BamToolsIndex structs & typedefs
1285 namespace BamTools {
1287 // individual index offset entry
1288 struct BamToolsIndexEntry {
1291 int32_t MaxEndPosition;
1292 int64_t StartOffset;
1293 int32_t StartPosition;
1296 BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
1297 const int64_t& startOffset = 0,
1298 const int32_t& startPosition = 0)
1299 : MaxEndPosition(maxEndPosition)
1300 , StartOffset(startOffset)
1301 , StartPosition(startPosition)
1305 // reference index entry
1306 struct BamToolsReferenceEntry {
1310 vector<BamToolsIndexEntry> Offsets;
1313 BamToolsReferenceEntry(void)
1314 : HasAlignments(false)
1318 // the actual index data structure
1319 typedef map<int, BamToolsReferenceEntry> BamToolsIndexData;
1321 } // namespace BamTools
1323 // ---------------------------------------------------
1324 // BamToolsIndexPrivate implementation
1326 struct BamToolsIndex::BamToolsIndexPrivate {
1328 // keep a list of any supported versions here
1329 // (might be useful later to handle any 'legacy' versions if the format changes)
1330 // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
1332 // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
1334 // if ( indexVersion >= BTI_1_2 )
1338 enum Version { BTI_1_0 = 1
1344 BamToolsIndex* m_parent;
1347 int32_t m_blockSize;
1348 BamToolsIndexData m_indexData;
1349 off_t m_dataBeginOffset;
1350 bool m_hasFullDataCache;
1352 int32_t m_inputVersion; // Version is serialized as int
1353 Version m_outputVersion;
1356 BamToolsIndexPrivate(BamToolsIndex* parent);
1357 ~BamToolsIndexPrivate(void);
1359 // parent interface methods
1362 // creates index data (in-memory) from current reader data
1364 // clear all current index offset data in memory
1365 void ClearAllData(void);
1366 // return file position after header metadata
1367 const off_t DataBeginOffset(void) const;
1368 // returns whether reference has alignments or no
1369 bool HasAlignments(const int& referenceID) const;
1370 // return true if all index data is cached
1371 bool HasFullDataCache(void) const;
1372 // attempts to use index to jump to region; returns success/fail
1373 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
1374 // clears index data from all references except the first
1375 void KeepOnlyFirstReferenceOffsets(void);
1376 // load index data for all references, return true if loaded OK
1377 // @saveData - save data in memory if true, just read & discard if false
1378 bool LoadAllReferences(bool saveData = true);
1379 // load first reference from file, return true if loaded OK
1380 // @saveData - save data in memory if true, just read & discard if false
1381 bool LoadFirstReference(bool saveData = true);
1382 // load header data from index file, return true if loaded OK
1383 bool LoadHeader(void);
1384 // position file pointer to desired reference begin, return true if skipped OK
1385 bool SkipToFirstReference(void);
1386 // write header to new index file
1387 bool WriteHeader(void);
1388 // write index data for all references to new index file
1389 bool WriteAllReferences(void);
1394 // -----------------------
1395 // index file operations
1397 // check index file magic number, return true if OK
1398 bool CheckMagicNumber(void);
1399 // check index file version, return true if OK
1400 bool CheckVersion(void);
1401 // return true if FILE* is open
1402 bool IsOpen(void) const;
1403 // load a single index entry from file, return true if loaded OK
1404 // @saveData - save data in memory if true, just read & discard if false
1405 bool LoadIndexEntry(const int& refId, bool saveData = true);
1406 // load a single reference from file, return true if loaded OK
1407 // @saveData - save data in memory if true, just read & discard if false
1408 bool LoadReference(const int& refId, bool saveData = true);
1409 // loads number of references, return true if loaded OK
1410 bool LoadReferenceCount(int& numReferences);
1411 // position file pointer to desired reference begin, return true if skipped OK
1412 bool SkipToReference(const int& refId);
1413 // write current reference index data to new index file
1414 bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry);
1415 // write current index offset entry to new index file
1416 bool WriteIndexEntry(const BamToolsIndexEntry& entry);
1418 // -----------------------
1419 // index data operations
1421 // clear all index offset data for desired reference
1422 void ClearReferenceOffsets(const int& refId);
1423 // calculate BAM file offset for desired region
1424 // return true if no error (*NOT* equivalent to "has alignments or valid offset")
1425 // check @hasAlignmentsInRegion to determine this status
1426 // @region - target region
1427 // @offset - resulting seek target
1428 // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
1429 bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
1430 // returns true if index cache has data for desired reference
1431 bool IsDataLoaded(const int& refId) const;
1432 // clears index data from all references except the one specified
1433 void KeepOnlyReferenceOffsets(const int& refId);
1434 // saves an index offset entry in memory
1435 void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry);
1436 // pre-allocates size for offset vector
1437 void SetOffsetCount(const int& refId, const int& offsetCount);
1438 // initializes index data structure to hold @count references
1439 void SetReferenceCount(const int& count);
1443 BamToolsIndex::BamToolsIndexPrivate::BamToolsIndexPrivate(BamToolsIndex* parent)
1446 , m_dataBeginOffset(0)
1447 , m_hasFullDataCache(false)
1449 , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
1451 m_isBigEndian = BamTools::SystemIsBigEndian();
1455 BamToolsIndex::BamToolsIndexPrivate::~BamToolsIndexPrivate(void) {
1459 // creates index data (in-memory) from current reader data
1460 bool BamToolsIndex::BamToolsIndexPrivate::Build(void) {
1462 // localize parent data
1463 if ( m_parent == 0 ) return false;
1464 BamReader* reader = m_parent->m_reader;
1465 BgzfData* mBGZF = m_parent->m_BGZF;
1467 // be sure reader & BGZF file are valid & open for reading
1468 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
1471 // move file pointer to beginning of alignments
1472 if ( !reader->Rewind() ) return false;
1474 // initialize index data structure with space for all references
1475 const int numReferences = (int)m_parent->m_references.size();
1476 m_indexData.clear();
1477 m_hasFullDataCache = false;
1478 SetReferenceCount(numReferences);
1480 // set up counters and markers
1481 int32_t currentBlockCount = 0;
1482 int64_t currentAlignmentOffset = mBGZF->Tell();
1483 int32_t blockRefId = 0;
1484 int32_t blockMaxEndPosition = 0;
1485 int64_t blockStartOffset = currentAlignmentOffset;
1486 int32_t blockStartPosition = -1;
1488 // plow through alignments, storing index entries
1490 while ( reader->GetNextAlignmentCore(al) ) {
1492 // if block contains data (not the first time through) AND alignment is on a new reference
1493 if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
1495 // store previous data
1496 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
1497 SaveOffsetEntry(blockRefId, entry);
1499 // intialize new block for current alignment's reference
1500 currentBlockCount = 0;
1501 blockMaxEndPosition = al.GetEndPosition();
1502 blockStartOffset = currentAlignmentOffset;
1505 // if beginning of block, save first alignment's refID & position
1506 if ( currentBlockCount == 0 ) {
1507 blockRefId = al.RefID;
1508 blockStartPosition = al.Position;
1511 // increment block counter
1512 ++currentBlockCount;
1514 // check end position
1515 int32_t alignmentEndPosition = al.GetEndPosition();
1516 if ( alignmentEndPosition > blockMaxEndPosition )
1517 blockMaxEndPosition = alignmentEndPosition;
1519 // if block is full, get offset for next block, reset currentBlockCount
1520 if ( currentBlockCount == m_blockSize ) {
1521 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
1522 SaveOffsetEntry(blockRefId, entry);
1523 blockStartOffset = mBGZF->Tell();
1524 currentBlockCount = 0;
1527 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
1528 // necessary because we won't know if this next alignment is on a new reference until we actually read it
1529 currentAlignmentOffset = mBGZF->Tell();
1532 // store final block with data
1533 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
1534 SaveOffsetEntry(blockRefId, entry);
1537 m_hasFullDataCache = true;
1539 // return success/failure of rewind
1540 return reader->Rewind();
1543 // check index file magic number, return true if OK
1544 bool BamToolsIndex::BamToolsIndexPrivate::CheckMagicNumber(void) {
1546 // see if index is valid BAM index
1548 size_t elementsRead = fread(magic, 1, 4, m_parent->m_indexStream);
1549 if ( elementsRead != 4 ) return false;
1550 if ( strncmp(magic, "BTI\1", 4) != 0 ) {
1551 fprintf(stderr, "Problem with index file - invalid format.\n");
1559 // check index file version, return true if OK
1560 bool BamToolsIndex::BamToolsIndexPrivate::CheckVersion(void) {
1562 // read version from file
1563 size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_parent->m_indexStream);
1564 if ( elementsRead != 1 ) return false;
1565 if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
1567 // if version is negative, or zero
1568 if ( m_inputVersion <= 0 ) {
1569 fprintf(stderr, "Problem with index file - invalid version.\n");
1573 // if version is newer than can be supported by this version of bamtools
1574 else if ( m_inputVersion > m_outputVersion ) {
1575 fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n");
1576 fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n");
1580 // ------------------------------------------------------------------
1581 // check for deprecated, unsupported versions
1582 // (typically whose format did not accomodate a particular bug fix)
1584 else if ( (Version)m_inputVersion == BTI_1_0 ) {
1585 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n");
1586 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
1590 else if ( (Version)m_inputVersion == BTI_1_1 ) {
1591 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n");
1592 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
1600 // clear all current index offset data in memory
1601 void BamToolsIndex::BamToolsIndexPrivate::ClearAllData(void) {
1602 BamToolsIndexData::const_iterator indexIter = m_indexData.begin();
1603 BamToolsIndexData::const_iterator indexEnd = m_indexData.end();
1604 for ( ; indexIter != indexEnd; ++indexIter ) {
1605 const int& refId = (*indexIter).first;
1606 ClearReferenceOffsets(refId);
1610 // clear all index offset data for desired reference
1611 void BamToolsIndex::BamToolsIndexPrivate::ClearReferenceOffsets(const int& refId) {
1612 if ( m_indexData.find(refId) == m_indexData.end() ) return;
1613 vector<BamToolsIndexEntry>& offsets = m_indexData[refId].Offsets;
1615 m_hasFullDataCache = false;
1618 // return file position after header metadata
1619 const off_t BamToolsIndex::BamToolsIndexPrivate::DataBeginOffset(void) const {
1620 return m_dataBeginOffset;
1623 // calculate BAM file offset for desired region
1624 // return true if no error (*NOT* equivalent to "has alignments or valid offset")
1625 // check @hasAlignmentsInRegion to determine this status
1626 // @region - target region
1627 // @offset - resulting seek target
1628 // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
1629 // N.B. - ignores isRightBoundSpecified
1630 bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
1632 // return false if leftBound refID is not found in index data
1633 BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID);
1634 if ( indexIter == m_indexData.end()) return false;
1636 // load index data for region if not already cached
1637 if ( !IsDataLoaded(region.LeftRefID) ) {
1638 bool loadedOk = true;
1639 loadedOk &= SkipToReference(region.LeftRefID);
1640 loadedOk &= LoadReference(region.LeftRefID);
1641 if ( !loadedOk ) return false;
1644 // localize index data for this reference (& sanity check that data actually exists)
1645 indexIter = m_indexData.find(region.LeftRefID);
1646 if ( indexIter == m_indexData.end()) return false;
1647 const vector<BamToolsIndexEntry>& referenceOffsets = (*indexIter).second.Offsets;
1648 if ( referenceOffsets.empty() ) return false;
1650 // -------------------------------------------------------
1651 // calculate nearest index to jump to
1653 // save first offset
1654 offset = (*referenceOffsets.begin()).StartOffset;
1656 // iterate over offsets entries on this reference
1657 vector<BamToolsIndexEntry>::const_iterator offsetIter = referenceOffsets.begin();
1658 vector<BamToolsIndexEntry>::const_iterator offsetEnd = referenceOffsets.end();
1659 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
1660 const BamToolsIndexEntry& entry = (*offsetIter);
1661 // break if alignment 'entry' overlaps region
1662 if ( entry.MaxEndPosition >= region.LeftPosition ) break;
1663 offset = (*offsetIter).StartOffset;
1666 // set flag based on whether an index entry was found for this region
1667 *hasAlignmentsInRegion = ( offsetIter != offsetEnd );
1669 // if cache mode set to none, dump the data we just loaded
1670 if (m_parent->m_cacheMode == BamIndex::NoIndexCaching )
1671 ClearReferenceOffsets(region.LeftRefID);
1677 // returns whether reference has alignments or no
1678 bool BamToolsIndex::BamToolsIndexPrivate::HasAlignments(const int& refId) const {
1680 BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId);
1681 if ( indexIter == m_indexData.end()) return false;
1682 const BamToolsReferenceEntry& refEntry = (*indexIter).second;
1683 return refEntry.HasAlignments;
1686 // return true if all index data is cached
1687 bool BamToolsIndex::BamToolsIndexPrivate::HasFullDataCache(void) const {
1688 return m_hasFullDataCache;
1691 // returns true if index cache has data for desired reference
1692 bool BamToolsIndex::BamToolsIndexPrivate::IsDataLoaded(const int& refId) const {
1694 BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId);
1695 if ( indexIter == m_indexData.end()) return false;
1696 const BamToolsReferenceEntry& refEntry = (*indexIter).second;
1698 if ( !refEntry.HasAlignments ) return true; // no data period
1700 // return whether offsets list contains data
1701 return !refEntry.Offsets.empty();
1704 // attempts to use index to jump to region; returns success/fail
1705 bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
1708 *hasAlignmentsInRegion = false;
1710 // localize parent data
1711 if ( m_parent == 0 ) return false;
1712 BamReader* reader = m_parent->m_reader;
1713 BgzfData* mBGZF = m_parent->m_BGZF;
1714 RefVector& references = m_parent->m_references;
1716 // check valid BamReader state
1717 if ( reader == 0 || mBGZF == 0 || !reader->IsOpen() ) {
1718 fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
1722 // make sure left-bound position is valid
1723 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
1726 // calculate nearest offset to jump to
1728 if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
1729 fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
1733 // return success/failure of seek
1734 return mBGZF->Seek(offset);
1737 // clears index data from all references except the first
1738 void BamToolsIndex::BamToolsIndexPrivate::KeepOnlyFirstReferenceOffsets(void) {
1739 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
1740 KeepOnlyReferenceOffsets( (*indexBegin).first );
1743 // clears index data from all references except the one specified
1744 void BamToolsIndex::BamToolsIndexPrivate::KeepOnlyReferenceOffsets(const int& refId) {
1745 BamToolsIndexData::iterator mapIter = m_indexData.begin();
1746 BamToolsIndexData::iterator mapEnd = m_indexData.end();
1747 for ( ; mapIter != mapEnd; ++mapIter ) {
1748 const int entryRefId = (*mapIter).first;
1749 if ( entryRefId != refId )
1750 ClearReferenceOffsets(entryRefId);
1754 // load index data for all references, return true if loaded OK
1755 bool BamToolsIndex::BamToolsIndexPrivate::LoadAllReferences(bool saveData) {
1757 // skip if data already loaded
1758 if ( m_hasFullDataCache ) return true;
1760 // read in number of references
1761 int32_t numReferences;
1762 if ( !LoadReferenceCount(numReferences) ) return false;
1763 //SetReferenceCount(numReferences);
1765 // iterate over reference entries
1766 bool loadedOk = true;
1767 for ( int i = 0; i < numReferences; ++i )
1768 loadedOk &= LoadReference(i, saveData);
1771 if ( loadedOk && saveData )
1772 m_hasFullDataCache = true;
1774 // return success/failure of load
1778 // load header data from index file, return true if loaded OK
1779 bool BamToolsIndex::BamToolsIndexPrivate::LoadHeader(void) {
1781 // check magic number
1782 if ( !CheckMagicNumber() ) return false;
1784 // check BTI version
1785 if ( !CheckVersion() ) return false;
1787 // read in block size
1788 size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_parent->m_indexStream);
1789 if ( elementsRead != 1 ) return false;
1790 if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
1792 // store offset of beginning of data
1793 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
1795 // return success/failure of load
1796 return (elementsRead == 1);
1799 // load a single index entry from file, return true if loaded OK
1800 // @saveData - save data in memory if true, just read & discard if false
1801 bool BamToolsIndex::BamToolsIndexPrivate::LoadIndexEntry(const int& refId, bool saveData) {
1803 // read in index entry data members
1804 size_t elementsRead = 0;
1805 BamToolsIndexEntry entry;
1806 elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_parent->m_indexStream);
1807 elementsRead += fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_parent->m_indexStream);
1808 elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_parent->m_indexStream);
1809 if ( elementsRead != 3 ) {
1810 cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl;
1814 // swap endian-ness if necessary
1815 if ( m_isBigEndian ) {
1816 SwapEndian_32(entry.MaxEndPosition);
1817 SwapEndian_64(entry.StartOffset);
1818 SwapEndian_32(entry.StartPosition);
1823 SaveOffsetEntry(refId, entry);
1825 // return success/failure of load
1829 // load a single reference from file, return true if loaded OK
1830 // @saveData - save data in memory if true, just read & discard if false
1831 bool BamToolsIndex::BamToolsIndexPrivate::LoadFirstReference(bool saveData) {
1832 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
1833 return LoadReference( (*indexBegin).first, saveData );
1836 // load a single reference from file, return true if loaded OK
1837 // @saveData - save data in memory if true, just read & discard if false
1838 bool BamToolsIndex::BamToolsIndexPrivate::LoadReference(const int& refId, bool saveData) {
1840 // read in number of offsets for this reference
1841 uint32_t numOffsets;
1842 size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_parent->m_indexStream);
1843 if ( elementsRead != 1 ) return false;
1844 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
1846 // initialize offsets container for this reference
1847 SetOffsetCount(refId, (int)numOffsets);
1849 // iterate over offset entries
1850 for ( unsigned int j = 0; j < numOffsets; ++j )
1851 LoadIndexEntry(refId, saveData);
1853 // return success/failure of load
1857 // loads number of references, return true if loaded OK
1858 bool BamToolsIndex::BamToolsIndexPrivate::LoadReferenceCount(int& numReferences) {
1860 size_t elementsRead = 0;
1862 // read reference count
1863 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1864 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1866 // return success/failure of load
1867 return ( elementsRead == 1 );
1870 // saves an index offset entry in memory
1871 void BamToolsIndex::BamToolsIndexPrivate::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) {
1872 BamToolsReferenceEntry& refEntry = m_indexData[refId];
1873 refEntry.HasAlignments = true;
1874 refEntry.Offsets.push_back(entry);
1877 // pre-allocates size for offset vector
1878 void BamToolsIndex::BamToolsIndexPrivate::SetOffsetCount(const int& refId, const int& offsetCount) {
1879 BamToolsReferenceEntry& refEntry = m_indexData[refId];
1880 refEntry.Offsets.reserve(offsetCount);
1881 refEntry.HasAlignments = ( offsetCount > 0);
1884 // initializes index data structure to hold @count references
1885 void BamToolsIndex::BamToolsIndexPrivate::SetReferenceCount(const int& count) {
1886 for ( int i = 0; i < count; ++i )
1887 m_indexData[i].HasAlignments = false;
1890 // position file pointer to first reference begin, return true if skipped OK
1891 bool BamToolsIndex::BamToolsIndexPrivate::SkipToFirstReference(void) {
1892 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
1893 return SkipToReference( (*indexBegin).first );
1896 // position file pointer to desired reference begin, return true if skipped OK
1897 bool BamToolsIndex::BamToolsIndexPrivate::SkipToReference(const int& refId) {
1900 if ( !m_parent->Rewind() ) return false;
1902 // read in number of references
1903 int32_t numReferences;
1904 size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1905 if ( elementsRead != 1 ) return false;
1906 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1908 // iterate over reference entries
1909 bool skippedOk = true;
1910 int currentRefId = 0;
1911 while (currentRefId != refId) {
1912 skippedOk &= LoadReference(currentRefId, false);
1916 // return success/failure of skip
1920 // write header to new index file
1921 bool BamToolsIndex::BamToolsIndexPrivate::WriteHeader(void) {
1923 size_t elementsWritten = 0;
1925 // write BTI index format 'magic number'
1926 elementsWritten += fwrite("BTI\1", 1, 4, m_parent->m_indexStream);
1928 // write BTI index format version
1929 int32_t currentVersion = (int32_t)m_outputVersion;
1930 if ( m_isBigEndian ) SwapEndian_32(currentVersion);
1931 elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_parent->m_indexStream);
1934 int32_t blockSize = m_blockSize;
1935 if ( m_isBigEndian ) SwapEndian_32(blockSize);
1936 elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_parent->m_indexStream);
1938 // store offset of beginning of data
1939 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
1941 // return success/failure of write
1942 return ( elementsWritten == 6 );
1945 // write index data for all references to new index file
1946 bool BamToolsIndex::BamToolsIndexPrivate::WriteAllReferences(void) {
1948 size_t elementsWritten = 0;
1950 // write number of references
1951 int32_t numReferences = (int32_t)m_indexData.size();
1952 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1953 elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1955 // iterate through references in index
1957 BamToolsIndexData::const_iterator refIter = m_indexData.begin();
1958 BamToolsIndexData::const_iterator refEnd = m_indexData.end();
1959 for ( ; refIter != refEnd; ++refIter )
1960 refOk &= WriteReferenceEntry( (*refIter).second );
1962 return ( (elementsWritten == 1) && refOk );
1965 // write current reference index data to new index file
1966 bool BamToolsIndex::BamToolsIndexPrivate::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) {
1968 size_t elementsWritten = 0;
1970 // write number of offsets listed for this reference
1971 uint32_t numOffsets = refEntry.Offsets.size();
1972 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
1973 elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_parent->m_indexStream);
1975 // iterate over offset entries
1976 bool entriesOk = true;
1977 vector<BamToolsIndexEntry>::const_iterator offsetIter = refEntry.Offsets.begin();
1978 vector<BamToolsIndexEntry>::const_iterator offsetEnd = refEntry.Offsets.end();
1979 for ( ; offsetIter != offsetEnd; ++offsetIter )
1980 entriesOk &= WriteIndexEntry( (*offsetIter) );
1982 return ( (elementsWritten == 1) && entriesOk );
1985 // write current index offset entry to new index file
1986 bool BamToolsIndex::BamToolsIndexPrivate::WriteIndexEntry(const BamToolsIndexEntry& entry) {
1989 int32_t maxEndPosition = entry.MaxEndPosition;
1990 int64_t startOffset = entry.StartOffset;
1991 int32_t startPosition = entry.StartPosition;
1993 // swap endian-ness if necessary
1994 if ( m_isBigEndian ) {
1995 SwapEndian_32(maxEndPosition);
1996 SwapEndian_64(startOffset);
1997 SwapEndian_32(startPosition);
2000 // write the reference index entry
2001 size_t elementsWritten = 0;
2002 elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_parent->m_indexStream);
2003 elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_parent->m_indexStream);
2004 elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_parent->m_indexStream);
2005 return ( elementsWritten == 3 );
2008 // ---------------------------------------------------
2009 // BamToolsIndex implementation
2011 BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader)
2012 : BamIndex(bgzf, reader)
2014 d = new BamToolsIndexPrivate(this);
2017 BamToolsIndex::~BamToolsIndex(void) {
2022 // BamIndex interface implementation
2023 bool BamToolsIndex::Build(void) { return d->Build(); }
2024 void BamToolsIndex::ClearAllData(void) { d->ClearAllData(); }
2025 const off_t BamToolsIndex::DataBeginOffset(void) const { return d->DataBeginOffset(); }
2026 bool BamToolsIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); }
2027 bool BamToolsIndex::HasFullDataCache(void) const { return d->HasFullDataCache(); }
2028 bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
2029 void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) { d->KeepOnlyFirstReferenceOffsets(); }
2030 bool BamToolsIndex::LoadAllReferences(bool saveData) { return d->LoadAllReferences(saveData); }
2031 bool BamToolsIndex::LoadFirstReference(bool saveData) { return d->LoadFirstReference(saveData); }
2032 bool BamToolsIndex::LoadHeader(void) { return d->LoadHeader(); }
2033 bool BamToolsIndex::SkipToFirstReference(void) { return d->SkipToFirstReference(); }
2034 bool BamToolsIndex::WriteAllReferences(void) { return d->WriteAllReferences(); }
2035 bool BamToolsIndex::WriteHeader(void) { return d->WriteHeader(); }