1 // ***************************************************************************
2 // BamIndex.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 21 October 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index functionality - both for the default (standardized) BAM
9 // index format (.bai) as well as a BamTools-specific (nonstandard) index
11 // ***************************************************************************
19 #include "BamReader.h"
22 using namespace BamTools;
24 // -------------------------------
25 // BamIndex implementation
28 BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader)
31 , m_cacheMode(BamIndex::LimitedIndexCaching)
34 if ( m_reader && m_reader->IsOpen() )
35 m_references = m_reader->GetReferenceData();
39 BamIndex::~BamIndex(void) {
41 fclose(m_indexStream);
44 // return true if FILE* is open
45 bool BamIndex::IsOpen(void) const {
46 return ( m_indexStream != 0 );
49 // loads existing data from file into memory
50 bool BamIndex::Load(const string& filename) {
52 // open index file, abort on error
53 if ( !OpenIndexFile(filename, "rb") ) {
54 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
59 if ( !LoadHeader() ) {
60 fclose(m_indexStream);
64 // load reference data (but only keep in memory if full caching requested)
65 bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching );
66 if ( !LoadAllReferences(saveInitialLoad) ) {
67 fclose(m_indexStream);
71 // update index cache based on selected mode
78 // opens index file for reading/writing, return true if opened OK
79 bool BamIndex::OpenIndexFile(const string& filename, const string& mode) {
80 m_indexStream = fopen(filename.c_str(), mode.c_str());
81 return ( m_indexStream != 0 );
84 // rewind index file to beginning of index data, return true if rewound OK
85 bool BamIndex::Rewind(void) {
86 return ( fseek64(m_indexStream, DataBeginOffset(), SEEK_SET) == 0 );
89 // change the index caching behavior
90 void BamIndex::SetCacheMode(const BamIndexCacheMode mode) {
91 if ( mode != m_cacheMode ) {
97 // updates in-memory cache of index data, depending on current cache mode
98 void BamIndex::UpdateCache(void) {
100 // skip if file not open
101 if ( !IsOpen() ) return;
103 // reflect requested cache mode behavior
104 switch ( m_cacheMode ) {
106 case (BamIndex::FullIndexCaching) :
108 LoadAllReferences(true);
111 case (BamIndex::LimitedIndexCaching) :
112 if ( HasFullDataCache() )
113 KeepOnlyFirstReferenceOffsets();
116 SkipToFirstReference();
117 LoadFirstReference(true);
120 case(BamIndex::NoIndexCaching) :
129 // writes in-memory index data out to file
130 bool BamIndex::Write(const string& bamFilename) {
132 // open index file for writing
133 string indexFilename = bamFilename + Extension();
134 if ( !OpenIndexFile(indexFilename, "wb") ) {
135 fprintf(stderr, "ERROR: Could not open file to save index.\n");
139 // write index header data
140 if ( !WriteHeader() ) {
141 fprintf(stderr, "ERROR: There was a problem writing index metadata to new index file.\n");
142 fflush(m_indexStream);
143 fclose(m_indexStream);
147 // write main index data
148 if ( !WriteAllReferences() ) {
149 fprintf(stderr, "ERROR: There was a problem writing index data to new index file.\n");
150 fflush(m_indexStream);
151 fclose(m_indexStream);
155 // flush any remaining output, rewind file, and return success
156 fflush(m_indexStream);
157 fclose(m_indexStream);
159 // re-open index file for later reading
160 if ( !OpenIndexFile(indexFilename, "rb") ) {
161 fprintf(stderr, "ERROR: Could not open newly created index file for reading.\n");
165 // return success/failure of write
169 // #########################################################################################
170 // #########################################################################################
172 // -------------------------------
173 // BamStandardIndex structs & typedefs
177 // BAM index constants
178 const int MAX_BIN = 37450; // =(8^6-1)/7+1
179 const int BAM_LIDX_SHIFT = 14;
181 // --------------------------------------------------
182 // BamStandardIndex data structures & typedefs
190 Chunk(const uint64_t& start = 0,
191 const uint64_t& stop = 0)
197 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
198 return lhs.Start < rhs.Start;
201 typedef vector<Chunk> ChunkVector;
202 typedef map<uint32_t, ChunkVector> BamBinMap;
203 typedef vector<uint64_t> LinearOffsetVector;
205 struct ReferenceIndex {
209 LinearOffsetVector Offsets;
213 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
214 const LinearOffsetVector& offsets = LinearOffsetVector(),
215 const bool hasAlignments = false)
218 , HasAlignments(hasAlignments)
222 typedef map<int32_t, ReferenceIndex> BamStandardIndexData;
224 } // namespace BamTools
226 // -------------------------------
227 // BamStandardIndexPrivate implementation
229 struct BamStandardIndex::BamStandardIndexPrivate {
232 BamStandardIndex* m_parent;
235 BamStandardIndexData m_indexData;
236 off_t m_dataBeginOffset;
237 bool m_hasFullDataCache;
241 BamStandardIndexPrivate(BamStandardIndex* parent);
242 ~BamStandardIndexPrivate(void);
244 // parent interface methods
247 // creates index data (in-memory) from current reader data
249 // clear all current index offset data in memory
250 void ClearAllData(void);
251 // return file position after header metadata
252 const off_t DataBeginOffset(void) const;
253 // returns whether reference has alignments or no
254 bool HasAlignments(const int& referenceID) const;
255 // return true if all index data is cached
256 bool HasFullDataCache(void) const;
257 // attempts to use index to jump to region; returns success/fail
258 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
259 // clears index data from all references except the first reference
260 void KeepOnlyFirstReferenceOffsets(void);
261 // load index data for all references, return true if loaded OK
262 // @saveData - save data in memory if true, just read & discard if false
263 bool LoadAllReferences(bool saveData = true);
264 // load first reference from file, return true if loaded OK
265 // @saveData - save data in memory if true, just read & discard if false
266 bool LoadFirstReference(bool saveData = true);
267 // load header data from index file, return true if loaded OK
268 bool LoadHeader(void);
269 // position file pointer to first reference begin, return true if skipped OK
270 bool SkipToFirstReference(void);
271 // write header to new index file
272 bool WriteHeader(void);
273 // write index data for all references to new index file
274 bool WriteAllReferences(void);
279 // -----------------------
280 // index file operations
282 // check index file magic number, return true if OK
283 bool CheckMagicNumber(void);
284 // check index file version, return true if OK
285 bool CheckVersion(void);
286 // load a single index bin entry from file, return true if loaded OK
287 // @saveData - save data in memory if true, just read & discard if false
288 bool LoadBin(ReferenceIndex& refEntry, bool saveData = true);
289 bool LoadBins(ReferenceIndex& refEntry, bool saveData = true);
290 // load a single index bin entry from file, return true if loaded OK
291 // @saveData - save data in memory if true, just read & discard if false
292 bool LoadChunk(ChunkVector& chunks, bool saveData = true);
293 bool LoadChunks(ChunkVector& chunks, bool saveData = true);
294 // load a single index linear offset entry from file, return true if loaded OK
295 // @saveData - save data in memory if true, just read & discard if false
296 bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true);
297 // load a single reference from file, return true if loaded OK
298 // @saveData - save data in memory if true, just read & discard if false
299 bool LoadReference(const int& refId, bool saveData = true);
300 // loads number of references, return true if loaded OK
301 bool LoadReferenceCount(int& numReferences);
302 // position file pointer to desired reference begin, return true if skipped OK
303 bool SkipToReference(const int& refId);
304 // write index data for bin to new index file
305 bool WriteBin(const uint32_t& binId, const ChunkVector& chunks);
306 // write index data for bins to new index file
307 bool WriteBins(const BamBinMap& bins);
308 // write index data for chunk entry to new index file
309 bool WriteChunk(const Chunk& chunk);
310 // write index data for chunk entry to new index file
311 bool WriteChunks(const ChunkVector& chunks);
312 // write index data for linear offsets entry to new index file
313 bool WriteLinearOffsets(const LinearOffsetVector& offsets);
314 // write index data single reference to new index file
315 bool WriteReference(const ReferenceIndex& refEntry);
317 // -----------------------
318 // index data operations
320 // calculate bins that overlap region
321 int BinsFromRegion(const BamRegion& region,
322 const bool isRightBoundSpecified,
323 uint16_t bins[BamTools::MAX_BIN]);
324 // clear all index offset data for desired reference
325 void ClearReferenceOffsets(const int& refId);
326 // calculates offset(s) for a given region
327 bool GetOffsets(const BamRegion& region,
328 const bool isRightBoundSpecified,
329 vector<int64_t>& offsets,
330 bool* hasAlignmentsInRegion);
331 // returns true if index cache has data for desired reference
332 bool IsDataLoaded(const int& refId) const;
333 // clears index data from all references except the one specified
334 void KeepOnlyReferenceOffsets(const int& refId);
335 // simplifies index by merging 'chunks'
336 void MergeChunks(void);
337 // saves BAM bin entry for index
338 void SaveBinEntry(BamBinMap& binMap,
339 const uint32_t& saveBin,
340 const uint64_t& saveOffset,
341 const uint64_t& lastOffset);
342 // saves linear offset entry for index
343 void SaveLinearOffset(LinearOffsetVector& offsets,
344 const BamAlignment& bAlignment,
345 const uint64_t& lastOffset);
346 // initializes index data structure to hold @count references
347 void SetReferenceCount(const int& count);
351 BamStandardIndex::BamStandardIndexPrivate::BamStandardIndexPrivate(BamStandardIndex* parent)
353 , m_dataBeginOffset(0)
354 , m_hasFullDataCache(false)
356 m_isBigEndian = BamTools::SystemIsBigEndian();
359 BamStandardIndex::BamStandardIndexPrivate::~BamStandardIndexPrivate(void) {
363 // calculate bins that overlap region
364 int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region,
365 const bool isRightBoundSpecified,
366 uint16_t bins[MAX_BIN])
368 // get region boundaries
369 uint32_t begin = (unsigned int)region.LeftPosition;
372 // if right bound specified AND left&right bounds are on same reference
373 // OK to use right bound position
374 if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
375 end = (unsigned int)region.RightPosition;
377 // otherwise, use end of left bound reference as cutoff
379 end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
381 // initialize list, bin '0' always a valid bin
385 // get rest of bins that contain this region
387 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; }
388 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; }
389 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; }
390 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; }
391 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
393 // return number of bins stored
397 // creates index data (in-memory) from current reader data
398 bool BamStandardIndex::BamStandardIndexPrivate::Build(void) {
400 // localize parent data
401 if ( m_parent == 0 ) return false;
402 BamReader* reader = m_parent->m_reader;
403 BgzfData* mBGZF = m_parent->m_BGZF;
405 // be sure reader & BGZF file are valid & open for reading
406 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
409 // move file pointer to beginning of alignments
412 // get reference count, reserve index space
413 const int numReferences = (int)m_parent->m_references.size();
415 m_hasFullDataCache = false;
416 SetReferenceCount(numReferences);
418 // sets default constant for bin, ID, offset, coordinate variables
419 const uint32_t defaultValue = 0xffffffffu;
422 uint32_t saveBin(defaultValue);
423 uint32_t lastBin(defaultValue);
426 int32_t saveRefID(defaultValue);
427 int32_t lastRefID(defaultValue);
430 uint64_t saveOffset = mBGZF->Tell();
431 uint64_t lastOffset = saveOffset;
434 int32_t lastCoordinate = defaultValue;
436 BamAlignment bAlignment;
437 while ( reader->GetNextAlignmentCore(bAlignment) ) {
439 // change of chromosome, save ID, reset bin
440 if ( lastRefID != bAlignment.RefID ) {
441 lastRefID = bAlignment.RefID;
442 lastBin = defaultValue;
445 // if lastCoordinate greater than BAM position - file not sorted properly
446 else if ( lastCoordinate > bAlignment.Position ) {
447 fprintf(stderr, "BAM file not properly sorted:\n");
448 fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(),
449 lastCoordinate, bAlignment.Position, bAlignment.RefID);
453 // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
454 if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
456 // save linear offset entry (matched to BAM entry refID)
457 BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID);
458 if ( indexIter == m_indexData.end() ) return false; // error
459 ReferenceIndex& refIndex = (*indexIter).second;
460 LinearOffsetVector& offsets = refIndex.Offsets;
461 SaveLinearOffset(offsets, bAlignment, lastOffset);
464 // if current BamAlignment bin != lastBin, "then possibly write the binning index"
465 if ( bAlignment.Bin != lastBin ) {
467 // if not first time through
468 if ( saveBin != defaultValue ) {
470 // save Bam bin entry
471 BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID);
472 if ( indexIter == m_indexData.end() ) return false; // error
473 ReferenceIndex& refIndex = (*indexIter).second;
474 BamBinMap& binMap = refIndex.Bins;
475 SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
479 saveOffset = lastOffset;
482 saveBin = bAlignment.Bin;
483 lastBin = bAlignment.Bin;
486 saveRefID = bAlignment.RefID;
488 // if invalid RefID, break out
489 if ( saveRefID < 0 ) break;
492 // make sure that current file pointer is beyond lastOffset
493 if ( mBGZF->Tell() <= (int64_t)lastOffset ) {
494 fprintf(stderr, "Error in BGZF offsets.\n");
499 lastOffset = mBGZF->Tell();
501 // update lastCoordinate
502 lastCoordinate = bAlignment.Position;
505 // save any leftover BAM data (as long as refID is valid)
506 if ( saveRefID >= 0 ) {
507 // save Bam bin entry
508 BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID);
509 if ( indexIter == m_indexData.end() ) return false; // error
510 ReferenceIndex& refIndex = (*indexIter).second;
511 BamBinMap& binMap = refIndex.Bins;
512 SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
515 // simplify index by merging chunks
518 // iterate through references in index
519 // sort offsets in linear offset vector
520 BamStandardIndexData::iterator indexIter = m_indexData.begin();
521 BamStandardIndexData::iterator indexEnd = m_indexData.end();
522 for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
524 // get reference index data
525 ReferenceIndex& refIndex = (*indexIter).second;
526 LinearOffsetVector& offsets = refIndex.Offsets;
528 // sort linear offsets
529 sort(offsets.begin(), offsets.end());
532 // rewind file pointer to beginning of alignments, return success/fail
533 return reader->Rewind();
536 // check index file magic number, return true if OK
537 bool BamStandardIndex::BamStandardIndexPrivate::CheckMagicNumber(void) {
539 // read in magic number
541 size_t elementsRead = fread(magic, sizeof(char), 4, m_parent->m_indexStream);
543 // compare to expected value
544 if ( strncmp(magic, "BAI\1", 4) != 0 ) {
545 fprintf(stderr, "Problem with index file - invalid format.\n");
546 fclose(m_parent->m_indexStream);
550 // return success/failure of load
551 return (elementsRead == 4);
554 // clear all current index offset data in memory
555 void BamStandardIndex::BamStandardIndexPrivate::ClearAllData(void) {
556 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
557 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
558 for ( ; indexIter != indexEnd; ++indexIter ) {
559 const int& refId = (*indexIter).first;
560 ClearReferenceOffsets(refId);
564 // clear all index offset data for desired reference
565 void BamStandardIndex::BamStandardIndexPrivate::ClearReferenceOffsets(const int& refId) {
567 // look up refId, skip if not found
568 BamStandardIndexData::iterator indexIter = m_indexData.find(refId);
569 if ( indexIter == m_indexData.end() ) return ;
571 // clear reference data
572 ReferenceIndex& refEntry = (*indexIter).second;
573 refEntry.Bins.clear();
574 refEntry.Offsets.clear();
577 m_hasFullDataCache = false;
580 // return file position after header metadata
581 const off_t BamStandardIndex::BamStandardIndexPrivate::DataBeginOffset(void) const {
582 return m_dataBeginOffset;
585 // calculates offset(s) for a given region
586 bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region,
587 const bool isRightBoundSpecified,
588 vector<int64_t>& offsets,
589 bool* hasAlignmentsInRegion)
591 // return false if leftBound refID is not found in index data
592 if ( m_indexData.find(region.LeftRefID) == m_indexData.end() )
595 // load index data for region if not already cached
596 if ( !IsDataLoaded(region.LeftRefID) ) {
597 bool loadedOk = true;
598 loadedOk &= SkipToReference(region.LeftRefID);
599 loadedOk &= LoadReference(region.LeftRefID);
600 if ( !loadedOk ) return false;
603 // calculate which bins overlap this region
604 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
605 int numBins = BinsFromRegion(region, isRightBoundSpecified, bins);
607 // get bins for this reference
608 BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID);
609 if ( indexIter == m_indexData.end() ) return false; // error
610 const ReferenceIndex& refIndex = (*indexIter).second;
611 const BamBinMap& binMap = refIndex.Bins;
613 // get minimum offset to consider
614 const LinearOffsetVector& linearOffsets = refIndex.Offsets;
615 const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() )
616 ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
618 // store all alignment 'chunk' starts (file offsets) for bins in this region
619 for ( int i = 0; i < numBins; ++i ) {
621 const uint16_t binKey = bins[i];
622 map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
623 if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
625 // iterate over chunks
626 const ChunkVector& chunks = (*binIter).second;
627 std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
628 std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
629 for ( ; chunksIter != chunksEnd; ++chunksIter) {
631 // if valid chunk found, store its file offset
632 const Chunk& chunk = (*chunksIter);
633 if ( chunk.Stop > minOffset )
634 offsets.push_back( chunk.Start );
642 // sort the offsets before returning
643 sort(offsets.begin(), offsets.end());
645 // set flag & return success
646 *hasAlignmentsInRegion = (offsets.size() != 0 );
648 // if cache mode set to none, dump the data we just loaded
649 if (m_parent->m_cacheMode == BamIndex::NoIndexCaching )
650 ClearReferenceOffsets(region.LeftRefID);
656 // returns whether reference has alignments or no
657 bool BamStandardIndex::BamStandardIndexPrivate::HasAlignments(const int& refId) const {
658 BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId);
659 if ( indexIter == m_indexData.end() ) return false; // error
660 const ReferenceIndex& refEntry = (*indexIter).second;
661 return refEntry.HasAlignments;
664 // return true if all index data is cached
665 bool BamStandardIndex::BamStandardIndexPrivate::HasFullDataCache(void) const {
666 return m_hasFullDataCache;
669 // returns true if index cache has data for desired reference
670 bool BamStandardIndex::BamStandardIndexPrivate::IsDataLoaded(const int& refId) const {
672 // look up refId, return false if not found
673 BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId);
674 if ( indexIter == m_indexData.end() ) return false;
676 // see if reference has alignments
677 // if not, it's not a problem to have no offset data
678 const ReferenceIndex& refEntry = (*indexIter).second;
679 if ( !refEntry.HasAlignments ) return true;
681 // return whether bin map contains data
682 return ( !refEntry.Bins.empty() );
685 // attempts to use index to jump to region; returns success/fail
686 bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
688 // localize parent data
689 if ( m_parent == 0 ) return false;
690 BamReader* reader = m_parent->m_reader;
691 BgzfData* mBGZF = m_parent->m_BGZF;
692 RefVector& references = m_parent->m_references;
694 // be sure reader & BGZF file are valid & open for reading
695 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
698 // make sure left-bound position is valid
699 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
702 // calculate offsets for this region
703 // if failed, print message, set flag, and return failure
704 vector<int64_t> offsets;
705 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) {
706 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n");
707 *hasAlignmentsInRegion = false;
711 // iterate through offsets
712 BamAlignment bAlignment;
714 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
716 // attempt seek & load first available alignment
717 // set flag to true if data exists
718 result &= mBGZF->Seek(*o);
719 *hasAlignmentsInRegion = reader->GetNextAlignmentCore(bAlignment);
721 // if this alignment corresponds to desired position
722 // return success of seeking back to the offset before the 'current offset' (to cover overlaps)
723 if ( ((bAlignment.RefID == region.LeftRefID) &&
724 ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) ||
725 (bAlignment.RefID > region.LeftRefID) )
727 if ( o != offsets.begin() ) --o;
728 return mBGZF->Seek(*o);
732 // if error in jumping, print message & set flag
734 fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n");
735 *hasAlignmentsInRegion = false;
738 // return success/failure
742 // clears index data from all references except the first
743 void BamStandardIndex::BamStandardIndexPrivate::KeepOnlyFirstReferenceOffsets(void) {
744 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
745 KeepOnlyReferenceOffsets((*indexBegin).first);
748 // clears index data from all references except the one specified
749 void BamStandardIndex::BamStandardIndexPrivate::KeepOnlyReferenceOffsets(const int& refId) {
750 BamStandardIndexData::iterator mapIter = m_indexData.begin();
751 BamStandardIndexData::iterator mapEnd = m_indexData.end();
752 for ( ; mapIter != mapEnd; ++mapIter ) {
753 const int entryRefId = (*mapIter).first;
754 if ( entryRefId != refId )
755 ClearReferenceOffsets(entryRefId);
759 bool BamStandardIndex::BamStandardIndexPrivate::LoadAllReferences(bool saveData) {
761 // skip if data already loaded
762 if ( m_hasFullDataCache ) return true;
764 // get number of reference sequences
765 uint32_t numReferences;
766 if ( !LoadReferenceCount((int&)numReferences) )
769 // iterate over reference entries
770 bool loadedOk = true;
771 for ( int i = 0; i < (int)numReferences; ++i )
772 loadedOk &= LoadReference(i, saveData);
775 if ( loadedOk && saveData )
776 m_hasFullDataCache = true;
778 // return success/failure of loading references
782 // load header data from index file, return true if loaded OK
783 bool BamStandardIndex::BamStandardIndexPrivate::LoadHeader(void) {
785 bool loadedOk = CheckMagicNumber();
787 // store offset of beginning of data
788 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
790 // return success/failure of load
794 // load a single index bin entry from file, return true if loaded OK
795 // @saveData - save data in memory if true, just read & discard if false
796 bool BamStandardIndex::BamStandardIndexPrivate::LoadBin(ReferenceIndex& refEntry, bool saveData) {
798 size_t elementsRead = 0;
802 elementsRead += fread(&binId, sizeof(binId), 1, m_parent->m_indexStream);
803 if ( m_isBigEndian ) SwapEndian_32(binId);
805 // load alignment chunks for this bin
807 bool chunksOk = LoadChunks(chunks, saveData);
810 if ( chunksOk && saveData )
811 refEntry.Bins.insert(pair<uint32_t, ChunkVector>(binId, chunks));
813 // return success/failure of load
814 return ( (elementsRead == 1) && chunksOk );
817 bool BamStandardIndex::BamStandardIndexPrivate::LoadBins(ReferenceIndex& refEntry, bool saveData) {
819 size_t elementsRead = 0;
821 // get number of bins
823 elementsRead += fread(&numBins, sizeof(numBins), 1, m_parent->m_indexStream);
824 if ( m_isBigEndian ) SwapEndian_32(numBins);
827 refEntry.HasAlignments = ( numBins != 0 );
831 for ( int i = 0; i < numBins; ++i )
832 binsOk &= LoadBin(refEntry, saveData);
834 // return success/failure of load
835 return ( (elementsRead == 1) && binsOk );
838 // load a single index bin entry from file, return true if loaded OK
839 // @saveData - save data in memory if true, just read & discard if false
840 bool BamStandardIndex::BamStandardIndexPrivate::LoadChunk(ChunkVector& chunks, bool saveData) {
842 size_t elementsRead = 0;
844 // read in chunk data
847 elementsRead += fread(&start, sizeof(start), 1, m_parent->m_indexStream);
848 elementsRead += fread(&stop, sizeof(stop), 1, m_parent->m_indexStream);
850 // swap endian-ness if necessary
851 if ( m_isBigEndian ) {
852 SwapEndian_64(start);
856 // save data if requested
857 if ( saveData ) chunks.push_back( Chunk(start, stop) );
859 // return success/failure of load
860 return ( elementsRead == 2 );
863 bool BamStandardIndex::BamStandardIndexPrivate::LoadChunks(ChunkVector& chunks, bool saveData) {
865 size_t elementsRead = 0;
867 // read in number of chunks
869 elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_parent->m_indexStream);
870 if ( m_isBigEndian ) SwapEndian_32(numChunks);
872 // initialize space for chunks if we're storing this data
873 if ( saveData ) chunks.reserve(numChunks);
875 // iterate over chunks
876 bool chunksOk = true;
877 for ( int i = 0; i < (int)numChunks; ++i )
878 chunksOk &= LoadChunk(chunks, saveData);
881 sort( chunks.begin(), chunks.end(), ChunkLessThan );
883 // return success/failure of load
884 return ( (elementsRead == 1) && chunksOk );
887 // load a single index linear offset entry from file, return true if loaded OK
888 // @saveData - save data in memory if true, just read & discard if false
889 bool BamStandardIndex::BamStandardIndexPrivate::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) {
891 size_t elementsRead = 0;
893 // read in number of linear offsets
894 int32_t numLinearOffsets;
895 elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_parent->m_indexStream);
896 if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets);
898 // set up destination vector (if we're saving the data)
899 LinearOffsetVector linearOffsets;
900 if ( saveData ) linearOffsets.reserve(numLinearOffsets);
902 // iterate over linear offsets
903 uint64_t linearOffset;
904 for ( int i = 0; i < numLinearOffsets; ++i ) {
905 elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_parent->m_indexStream);
906 if ( m_isBigEndian ) SwapEndian_64(linearOffset);
907 if ( saveData ) linearOffsets.push_back(linearOffset);
910 // sort linear offsets
911 sort ( linearOffsets.begin(), linearOffsets.end() );
913 // save in reference index entry if desired
914 if ( saveData ) refEntry.Offsets = linearOffsets;
916 // return success/failure of load
917 return ( elementsRead == (size_t)(numLinearOffsets + 1) );
920 bool BamStandardIndex::BamStandardIndexPrivate::LoadFirstReference(bool saveData) {
921 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
922 return LoadReference((*indexBegin).first, saveData);
925 // load a single reference from file, return true if loaded OK
926 // @saveData - save data in memory if true, just read & discard if false
927 bool BamStandardIndex::BamStandardIndexPrivate::LoadReference(const int& refId, bool saveData) {
930 BamStandardIndexData::iterator indexIter = m_indexData.find(refId);
932 // if reference not previously loaded, create new entry
933 if ( indexIter == m_indexData.end() ) {
934 ReferenceIndex newEntry;
935 newEntry.HasAlignments = false;
936 m_indexData.insert( pair<int32_t, ReferenceIndex>(refId, newEntry) );
939 // load reference data
940 indexIter = m_indexData.find(refId);
941 ReferenceIndex& entry = (*indexIter).second;
942 bool loadedOk = true;
943 loadedOk &= LoadBins(entry, saveData);
944 loadedOk &= LoadLinearOffsets(entry, saveData);
948 // loads number of references, return true if loaded OK
949 bool BamStandardIndex::BamStandardIndexPrivate::LoadReferenceCount(int& numReferences) {
951 size_t elementsRead = 0;
953 // read reference count
954 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
955 if ( m_isBigEndian ) SwapEndian_32(numReferences);
957 // return success/failure of load
958 return ( elementsRead == 1 );
961 // merges 'alignment chunks' in BAM bin (used for index building)
962 void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) {
964 // iterate over reference enties
965 BamStandardIndexData::iterator indexIter = m_indexData.begin();
966 BamStandardIndexData::iterator indexEnd = m_indexData.end();
967 for ( ; indexIter != indexEnd; ++indexIter ) {
969 // get BAM bin map for this reference
970 ReferenceIndex& refIndex = (*indexIter).second;
971 BamBinMap& bamBinMap = refIndex.Bins;
973 // iterate over BAM bins
974 BamBinMap::iterator binIter = bamBinMap.begin();
975 BamBinMap::iterator binEnd = bamBinMap.end();
976 for ( ; binIter != binEnd; ++binIter ) {
978 // get chunk vector for this bin
979 ChunkVector& binChunks = (*binIter).second;
980 if ( binChunks.size() == 0 ) continue;
982 ChunkVector mergedChunks;
983 mergedChunks.push_back( binChunks[0] );
985 // iterate over chunks
987 ChunkVector::iterator chunkIter = binChunks.begin();
988 ChunkVector::iterator chunkEnd = binChunks.end();
989 for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
991 // get 'currentChunk' based on numeric index
992 Chunk& currentChunk = mergedChunks[i];
994 // get iteratorChunk based on vector iterator
995 Chunk& iteratorChunk = (*chunkIter);
997 // if chunk ends where (iterator) chunk starts, then merge
998 if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 )
999 currentChunk.Stop = iteratorChunk.Stop;
1003 // set currentChunk + 1 to iteratorChunk
1004 mergedChunks.push_back(iteratorChunk);
1009 // saved merged chunk vector
1010 (*binIter).second = mergedChunks;
1015 // saves BAM bin entry for index
1016 void BamStandardIndex::BamStandardIndexPrivate::SaveBinEntry(BamBinMap& binMap,
1017 const uint32_t& saveBin,
1018 const uint64_t& saveOffset,
1019 const uint64_t& lastOffset)
1022 BamBinMap::iterator binIter = binMap.find(saveBin);
1025 Chunk newChunk(saveOffset, lastOffset);
1027 // if entry doesn't exist
1028 if ( binIter == binMap.end() ) {
1029 ChunkVector newChunks;
1030 newChunks.push_back(newChunk);
1031 binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
1036 ChunkVector& binChunks = (*binIter).second;
1037 binChunks.push_back( newChunk );
1041 // saves linear offset entry for index
1042 void BamStandardIndex::BamStandardIndexPrivate::SaveLinearOffset(LinearOffsetVector& offsets,
1043 const BamAlignment& bAlignment,
1044 const uint64_t& lastOffset)
1046 // get converted offsets
1047 int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
1048 int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
1050 // resize vector if necessary
1051 int oldSize = offsets.size();
1052 int newSize = endOffset + 1;
1053 if ( oldSize < newSize )
1054 offsets.resize(newSize, 0);
1057 for( int i = beginOffset + 1; i <= endOffset; ++i ) {
1058 if ( offsets[i] == 0 )
1059 offsets[i] = lastOffset;
1063 // initializes index data structure to hold @count references
1064 void BamStandardIndex::BamStandardIndexPrivate::SetReferenceCount(const int& count) {
1065 for ( int i = 0; i < count; ++i )
1066 m_indexData[i].HasAlignments = false;
1069 bool BamStandardIndex::BamStandardIndexPrivate::SkipToFirstReference(void) {
1070 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
1071 return SkipToReference( (*indexBegin).first );
1074 // position file pointer to desired reference begin, return true if skipped OK
1075 bool BamStandardIndex::BamStandardIndexPrivate::SkipToReference(const int& refId) {
1078 if ( !m_parent->Rewind() ) return false;
1080 // read in number of references
1081 uint32_t numReferences;
1082 size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1083 if ( elementsRead != 1 ) return false;
1084 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1086 // iterate over reference entries
1087 bool skippedOk = true;
1088 int currentRefId = 0;
1089 while (currentRefId != refId) {
1090 skippedOk &= LoadReference(currentRefId, false);
1098 // write header to new index file
1099 bool BamStandardIndex::BamStandardIndexPrivate::WriteHeader(void) {
1101 size_t elementsWritten = 0;
1103 // write magic number
1104 elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_parent->m_indexStream);
1106 // store offset of beginning of data
1107 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
1109 // return success/failure of write
1110 return (elementsWritten == 4);
1113 // write index data for all references to new index file
1114 bool BamStandardIndex::BamStandardIndexPrivate::WriteAllReferences(void) {
1116 size_t elementsWritten = 0;
1118 // write number of reference sequences
1119 int32_t numReferenceSeqs = m_indexData.size();
1120 if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs);
1121 elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_parent->m_indexStream);
1123 // iterate over reference sequences
1125 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
1126 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
1127 for ( ; indexIter != indexEnd; ++ indexIter )
1128 refsOk &= WriteReference( (*indexIter).second );
1130 // return success/failure of write
1131 return ( (elementsWritten == 1) && refsOk );
1134 // write index data for bin to new index file
1135 bool BamStandardIndex::BamStandardIndexPrivate::WriteBin(const uint32_t& binId, const ChunkVector& chunks) {
1137 size_t elementsWritten = 0;
1140 uint32_t binKey = binId;
1141 if ( m_isBigEndian ) SwapEndian_32(binKey);
1142 elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_parent->m_indexStream);
1145 bool chunksOk = WriteChunks(chunks);
1147 // return success/failure of write
1148 return ( (elementsWritten == 1) && chunksOk );
1151 // write index data for bins to new index file
1152 bool BamStandardIndex::BamStandardIndexPrivate::WriteBins(const BamBinMap& bins) {
1154 size_t elementsWritten = 0;
1156 // write number of bins
1157 int32_t binCount = bins.size();
1158 if ( m_isBigEndian ) SwapEndian_32(binCount);
1159 elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_parent->m_indexStream);
1161 // iterate over bins
1163 BamBinMap::const_iterator binIter = bins.begin();
1164 BamBinMap::const_iterator binEnd = bins.end();
1165 for ( ; binIter != binEnd; ++binIter )
1166 binsOk &= WriteBin( (*binIter).first, (*binIter).second );
1168 // return success/failure of write
1169 return ( (elementsWritten == 1) && binsOk );
1172 // write index data for chunk entry to new index file
1173 bool BamStandardIndex::BamStandardIndexPrivate::WriteChunk(const Chunk& chunk) {
1175 size_t elementsWritten = 0;
1177 // localize alignment chunk offsets
1178 uint64_t start = chunk.Start;
1179 uint64_t stop = chunk.Stop;
1181 // swap endian-ness if necessary
1182 if ( m_isBigEndian ) {
1183 SwapEndian_64(start);
1184 SwapEndian_64(stop);
1187 // write to index file
1188 elementsWritten += fwrite(&start, sizeof(start), 1, m_parent->m_indexStream);
1189 elementsWritten += fwrite(&stop, sizeof(stop), 1, m_parent->m_indexStream);
1191 // return success/failure of write
1192 return ( elementsWritten == 2 );
1195 // write index data for chunk entry to new index file
1196 bool BamStandardIndex::BamStandardIndexPrivate::WriteChunks(const ChunkVector& chunks) {
1198 size_t elementsWritten = 0;
1201 int32_t chunkCount = chunks.size();
1202 if ( m_isBigEndian ) SwapEndian_32(chunkCount);
1203 elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_parent->m_indexStream);
1205 // iterate over chunks
1206 bool chunksOk = true;
1207 ChunkVector::const_iterator chunkIter = chunks.begin();
1208 ChunkVector::const_iterator chunkEnd = chunks.end();
1209 for ( ; chunkIter != chunkEnd; ++chunkIter )
1210 chunksOk &= WriteChunk( (*chunkIter) );
1212 // return success/failure of write
1213 return ( (elementsWritten == 1) && chunksOk );
1216 // write index data for linear offsets entry to new index file
1217 bool BamStandardIndex::BamStandardIndexPrivate::WriteLinearOffsets(const LinearOffsetVector& offsets) {
1219 size_t elementsWritten = 0;
1221 // write number of linear offsets
1222 int32_t offsetCount = offsets.size();
1223 if ( m_isBigEndian ) SwapEndian_32(offsetCount);
1224 elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_parent->m_indexStream);
1226 // iterate over linear offsets
1227 LinearOffsetVector::const_iterator offsetIter = offsets.begin();
1228 LinearOffsetVector::const_iterator offsetEnd = offsets.end();
1229 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
1231 // write linear offset
1232 uint64_t linearOffset = (*offsetIter);
1233 if ( m_isBigEndian ) SwapEndian_64(linearOffset);
1234 elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_parent->m_indexStream);
1237 // return success/failure of write
1238 return ( elementsWritten == (size_t)(offsetCount + 1) );
1241 // write index data for a single reference to new index file
1242 bool BamStandardIndex::BamStandardIndexPrivate::WriteReference(const ReferenceIndex& refEntry) {
1244 refOk &= WriteBins(refEntry.Bins);
1245 refOk &= WriteLinearOffsets(refEntry.Offsets);
1249 // ---------------------------------------------------------------
1250 // BamStandardIndex implementation
1252 BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader)
1253 : BamIndex(bgzf, reader)
1255 d = new BamStandardIndexPrivate(this);
1258 BamStandardIndex::~BamStandardIndex(void) {
1263 // BamIndex interface implementation
1264 bool BamStandardIndex::Build(void) { return d->Build(); }
1265 void BamStandardIndex::ClearAllData(void) { d->ClearAllData(); }
1266 const off_t BamStandardIndex::DataBeginOffset(void) const { return d->DataBeginOffset(); }
1267 bool BamStandardIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); }
1268 bool BamStandardIndex::HasFullDataCache(void) const { return d->HasFullDataCache(); }
1269 bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
1270 void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) { d->KeepOnlyFirstReferenceOffsets(); }
1271 bool BamStandardIndex::LoadAllReferences(bool saveData) { return d->LoadAllReferences(saveData); }
1272 bool BamStandardIndex::LoadFirstReference(bool saveData) { return d->LoadFirstReference(saveData); }
1273 bool BamStandardIndex::LoadHeader(void) { return d->LoadHeader(); }
1274 bool BamStandardIndex::SkipToFirstReference(void) { return d->SkipToFirstReference(); }
1275 bool BamStandardIndex::WriteAllReferences(void) { return d->WriteAllReferences(); }
1276 bool BamStandardIndex::WriteHeader(void) { return d->WriteHeader(); }
1278 // #########################################################################################
1279 // #########################################################################################
1281 // ---------------------------------------------------
1282 // BamToolsIndex structs & typedefs
1284 namespace BamTools {
1286 // individual index offset entry
1287 struct BamToolsIndexEntry {
1290 int32_t MaxEndPosition;
1291 int64_t StartOffset;
1292 int32_t StartPosition;
1295 BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
1296 const int64_t& startOffset = 0,
1297 const int32_t& startPosition = 0)
1298 : MaxEndPosition(maxEndPosition)
1299 , StartOffset(startOffset)
1300 , StartPosition(startPosition)
1304 // reference index entry
1305 struct BamToolsReferenceEntry {
1309 vector<BamToolsIndexEntry> Offsets;
1312 BamToolsReferenceEntry(void)
1313 : HasAlignments(false)
1317 // the actual index data structure
1318 typedef map<int, BamToolsReferenceEntry> BamToolsIndexData;
1320 } // namespace BamTools
1322 // ---------------------------------------------------
1323 // BamToolsIndexPrivate implementation
1325 struct BamToolsIndex::BamToolsIndexPrivate {
1327 // keep a list of any supported versions here
1328 // (might be useful later to handle any 'legacy' versions if the format changes)
1329 // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
1331 // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
1333 // if ( indexVersion >= BTI_1_2 )
1337 enum Version { BTI_1_0 = 1
1343 BamToolsIndex* m_parent;
1346 int32_t m_blockSize;
1347 BamToolsIndexData m_indexData;
1348 off_t m_dataBeginOffset;
1349 bool m_hasFullDataCache;
1351 int32_t m_inputVersion; // Version is serialized as int
1352 Version m_outputVersion;
1355 BamToolsIndexPrivate(BamToolsIndex* parent);
1356 ~BamToolsIndexPrivate(void);
1358 // parent interface methods
1361 // creates index data (in-memory) from current reader data
1363 // clear all current index offset data in memory
1364 void ClearAllData(void);
1365 // return file position after header metadata
1366 const off_t DataBeginOffset(void) const;
1367 // returns whether reference has alignments or no
1368 bool HasAlignments(const int& referenceID) const;
1369 // return true if all index data is cached
1370 bool HasFullDataCache(void) const;
1371 // attempts to use index to jump to region; returns success/fail
1372 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
1373 // clears index data from all references except the first
1374 void KeepOnlyFirstReferenceOffsets(void);
1375 // load index data for all references, return true if loaded OK
1376 // @saveData - save data in memory if true, just read & discard if false
1377 bool LoadAllReferences(bool saveData = true);
1378 // load first reference from file, return true if loaded OK
1379 // @saveData - save data in memory if true, just read & discard if false
1380 bool LoadFirstReference(bool saveData = true);
1381 // load header data from index file, return true if loaded OK
1382 bool LoadHeader(void);
1383 // position file pointer to desired reference begin, return true if skipped OK
1384 bool SkipToFirstReference(void);
1385 // write header to new index file
1386 bool WriteHeader(void);
1387 // write index data for all references to new index file
1388 bool WriteAllReferences(void);
1393 // -----------------------
1394 // index file operations
1396 // check index file magic number, return true if OK
1397 bool CheckMagicNumber(void);
1398 // check index file version, return true if OK
1399 bool CheckVersion(void);
1400 // return true if FILE* is open
1401 bool IsOpen(void) const;
1402 // load a single index entry from file, return true if loaded OK
1403 // @saveData - save data in memory if true, just read & discard if false
1404 bool LoadIndexEntry(const int& refId, bool saveData = true);
1405 // load a single reference from file, return true if loaded OK
1406 // @saveData - save data in memory if true, just read & discard if false
1407 bool LoadReference(const int& refId, bool saveData = true);
1408 // loads number of references, return true if loaded OK
1409 bool LoadReferenceCount(int& numReferences);
1410 // position file pointer to desired reference begin, return true if skipped OK
1411 bool SkipToReference(const int& refId);
1412 // write current reference index data to new index file
1413 bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry);
1414 // write current index offset entry to new index file
1415 bool WriteIndexEntry(const BamToolsIndexEntry& entry);
1417 // -----------------------
1418 // index data operations
1420 // clear all index offset data for desired reference
1421 void ClearReferenceOffsets(const int& refId);
1422 // calculate BAM file offset for desired region
1423 // return true if no error (*NOT* equivalent to "has alignments or valid offset")
1424 // check @hasAlignmentsInRegion to determine this status
1425 // @region - target region
1426 // @offset - resulting seek target
1427 // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
1428 bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
1429 // returns true if index cache has data for desired reference
1430 bool IsDataLoaded(const int& refId) const;
1431 // clears index data from all references except the one specified
1432 void KeepOnlyReferenceOffsets(const int& refId);
1433 // saves an index offset entry in memory
1434 void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry);
1435 // pre-allocates size for offset vector
1436 void SetOffsetCount(const int& refId, const int& offsetCount);
1437 // initializes index data structure to hold @count references
1438 void SetReferenceCount(const int& count);
1442 BamToolsIndex::BamToolsIndexPrivate::BamToolsIndexPrivate(BamToolsIndex* parent)
1445 , m_dataBeginOffset(0)
1446 , m_hasFullDataCache(false)
1448 , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
1450 m_isBigEndian = BamTools::SystemIsBigEndian();
1454 BamToolsIndex::BamToolsIndexPrivate::~BamToolsIndexPrivate(void) {
1458 // creates index data (in-memory) from current reader data
1459 bool BamToolsIndex::BamToolsIndexPrivate::Build(void) {
1461 // localize parent data
1462 if ( m_parent == 0 ) return false;
1463 BamReader* reader = m_parent->m_reader;
1464 BgzfData* mBGZF = m_parent->m_BGZF;
1466 // be sure reader & BGZF file are valid & open for reading
1467 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
1470 // move file pointer to beginning of alignments
1471 if ( !reader->Rewind() ) return false;
1473 // initialize index data structure with space for all references
1474 const int numReferences = (int)m_parent->m_references.size();
1475 m_indexData.clear();
1476 m_hasFullDataCache = false;
1477 SetReferenceCount(numReferences);
1479 // set up counters and markers
1480 int32_t currentBlockCount = 0;
1481 int64_t currentAlignmentOffset = mBGZF->Tell();
1482 int32_t blockRefId = 0;
1483 int32_t blockMaxEndPosition = 0;
1484 int64_t blockStartOffset = currentAlignmentOffset;
1485 int32_t blockStartPosition = -1;
1487 // plow through alignments, storing index entries
1489 while ( reader->GetNextAlignmentCore(al) ) {
1491 // if block contains data (not the first time through) AND alignment is on a new reference
1492 if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
1494 // store previous data
1495 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
1496 SaveOffsetEntry(blockRefId, entry);
1498 // intialize new block for current alignment's reference
1499 currentBlockCount = 0;
1500 blockMaxEndPosition = al.GetEndPosition();
1501 blockStartOffset = currentAlignmentOffset;
1504 // if beginning of block, save first alignment's refID & position
1505 if ( currentBlockCount == 0 ) {
1506 blockRefId = al.RefID;
1507 blockStartPosition = al.Position;
1510 // increment block counter
1511 ++currentBlockCount;
1513 // check end position
1514 int32_t alignmentEndPosition = al.GetEndPosition();
1515 if ( alignmentEndPosition > blockMaxEndPosition )
1516 blockMaxEndPosition = alignmentEndPosition;
1518 // if block is full, get offset for next block, reset currentBlockCount
1519 if ( currentBlockCount == m_blockSize ) {
1520 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
1521 SaveOffsetEntry(blockRefId, entry);
1522 blockStartOffset = mBGZF->Tell();
1523 currentBlockCount = 0;
1526 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
1527 // necessary because we won't know if this next alignment is on a new reference until we actually read it
1528 currentAlignmentOffset = mBGZF->Tell();
1531 // store final block with data
1532 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
1533 SaveOffsetEntry(blockRefId, entry);
1536 m_hasFullDataCache = true;
1538 // return success/failure of rewind
1539 return reader->Rewind();
1542 // check index file magic number, return true if OK
1543 bool BamToolsIndex::BamToolsIndexPrivate::CheckMagicNumber(void) {
1545 // see if index is valid BAM index
1547 size_t elementsRead = fread(magic, 1, 4, m_parent->m_indexStream);
1548 if ( elementsRead != 4 ) return false;
1549 if ( strncmp(magic, "BTI\1", 4) != 0 ) {
1550 fprintf(stderr, "Problem with index file - invalid format.\n");
1558 // check index file version, return true if OK
1559 bool BamToolsIndex::BamToolsIndexPrivate::CheckVersion(void) {
1561 // read version from file
1562 size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_parent->m_indexStream);
1563 if ( elementsRead != 1 ) return false;
1564 if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
1566 // if version is negative, or zero
1567 if ( m_inputVersion <= 0 ) {
1568 fprintf(stderr, "Problem with index file - invalid version.\n");
1572 // if version is newer than can be supported by this version of bamtools
1573 else if ( m_inputVersion > m_outputVersion ) {
1574 fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n");
1575 fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n");
1579 // ------------------------------------------------------------------
1580 // check for deprecated, unsupported versions
1581 // (typically whose format did not accomodate a particular bug fix)
1583 else if ( (Version)m_inputVersion == BTI_1_0 ) {
1584 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n");
1585 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
1589 else if ( (Version)m_inputVersion == BTI_1_1 ) {
1590 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n");
1591 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
1599 // clear all current index offset data in memory
1600 void BamToolsIndex::BamToolsIndexPrivate::ClearAllData(void) {
1601 BamToolsIndexData::const_iterator indexIter = m_indexData.begin();
1602 BamToolsIndexData::const_iterator indexEnd = m_indexData.end();
1603 for ( ; indexIter != indexEnd; ++indexIter ) {
1604 const int& refId = (*indexIter).first;
1605 ClearReferenceOffsets(refId);
1609 // clear all index offset data for desired reference
1610 void BamToolsIndex::BamToolsIndexPrivate::ClearReferenceOffsets(const int& refId) {
1611 if ( m_indexData.find(refId) == m_indexData.end() ) return;
1612 vector<BamToolsIndexEntry>& offsets = m_indexData[refId].Offsets;
1614 m_hasFullDataCache = false;
1617 // return file position after header metadata
1618 const off_t BamToolsIndex::BamToolsIndexPrivate::DataBeginOffset(void) const {
1619 return m_dataBeginOffset;
1622 // calculate BAM file offset for desired region
1623 // return true if no error (*NOT* equivalent to "has alignments or valid offset")
1624 // check @hasAlignmentsInRegion to determine this status
1625 // @region - target region
1626 // @offset - resulting seek target
1627 // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
1628 // N.B. - ignores isRightBoundSpecified
1629 bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
1631 // return false if leftBound refID is not found in index data
1632 BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID);
1633 if ( indexIter == m_indexData.end()) return false;
1635 // load index data for region if not already cached
1636 if ( !IsDataLoaded(region.LeftRefID) ) {
1637 bool loadedOk = true;
1638 loadedOk &= SkipToReference(region.LeftRefID);
1639 loadedOk &= LoadReference(region.LeftRefID);
1640 if ( !loadedOk ) return false;
1643 // localize index data for this reference (& sanity check that data actually exists)
1644 indexIter = m_indexData.find(region.LeftRefID);
1645 if ( indexIter == m_indexData.end()) return false;
1646 const vector<BamToolsIndexEntry>& referenceOffsets = (*indexIter).second.Offsets;
1647 if ( referenceOffsets.empty() ) return false;
1649 // -------------------------------------------------------
1650 // calculate nearest index to jump to
1652 // save first offset
1653 offset = (*referenceOffsets.begin()).StartOffset;
1655 // iterate over offsets entries on this reference
1656 vector<BamToolsIndexEntry>::const_iterator offsetIter = referenceOffsets.begin();
1657 vector<BamToolsIndexEntry>::const_iterator offsetEnd = referenceOffsets.end();
1658 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
1659 const BamToolsIndexEntry& entry = (*offsetIter);
1660 // break if alignment 'entry' overlaps region
1661 if ( entry.MaxEndPosition >= region.LeftPosition ) break;
1662 offset = (*offsetIter).StartOffset;
1665 // set flag based on whether an index entry was found for this region
1666 *hasAlignmentsInRegion = ( offsetIter != offsetEnd );
1668 // if cache mode set to none, dump the data we just loaded
1669 if (m_parent->m_cacheMode == BamIndex::NoIndexCaching )
1670 ClearReferenceOffsets(region.LeftRefID);
1676 // returns whether reference has alignments or no
1677 bool BamToolsIndex::BamToolsIndexPrivate::HasAlignments(const int& refId) const {
1679 BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId);
1680 if ( indexIter == m_indexData.end()) return false;
1681 const BamToolsReferenceEntry& refEntry = (*indexIter).second;
1682 return refEntry.HasAlignments;
1685 // return true if all index data is cached
1686 bool BamToolsIndex::BamToolsIndexPrivate::HasFullDataCache(void) const {
1687 return m_hasFullDataCache;
1690 // returns true if index cache has data for desired reference
1691 bool BamToolsIndex::BamToolsIndexPrivate::IsDataLoaded(const int& refId) const {
1693 BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId);
1694 if ( indexIter == m_indexData.end()) return false;
1695 const BamToolsReferenceEntry& refEntry = (*indexIter).second;
1697 if ( !refEntry.HasAlignments ) return true; // no data period
1699 // return whether offsets list contains data
1700 return !refEntry.Offsets.empty();
1703 // attempts to use index to jump to region; returns success/fail
1704 bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
1707 *hasAlignmentsInRegion = false;
1709 // localize parent data
1710 if ( m_parent == 0 ) return false;
1711 BamReader* reader = m_parent->m_reader;
1712 BgzfData* mBGZF = m_parent->m_BGZF;
1713 RefVector& references = m_parent->m_references;
1715 // check valid BamReader state
1716 if ( reader == 0 || mBGZF == 0 || !reader->IsOpen() ) {
1717 fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
1721 // make sure left-bound position is valid
1722 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
1725 // calculate nearest offset to jump to
1727 if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
1728 fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
1732 // return success/failure of seek
1733 return mBGZF->Seek(offset);
1736 // clears index data from all references except the first
1737 void BamToolsIndex::BamToolsIndexPrivate::KeepOnlyFirstReferenceOffsets(void) {
1738 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
1739 KeepOnlyReferenceOffsets( (*indexBegin).first );
1742 // clears index data from all references except the one specified
1743 void BamToolsIndex::BamToolsIndexPrivate::KeepOnlyReferenceOffsets(const int& refId) {
1744 BamToolsIndexData::iterator mapIter = m_indexData.begin();
1745 BamToolsIndexData::iterator mapEnd = m_indexData.end();
1746 for ( ; mapIter != mapEnd; ++mapIter ) {
1747 const int entryRefId = (*mapIter).first;
1748 if ( entryRefId != refId )
1749 ClearReferenceOffsets(entryRefId);
1753 // load index data for all references, return true if loaded OK
1754 bool BamToolsIndex::BamToolsIndexPrivate::LoadAllReferences(bool saveData) {
1756 // skip if data already loaded
1757 if ( m_hasFullDataCache ) return true;
1759 // read in number of references
1760 int32_t numReferences;
1761 if ( !LoadReferenceCount(numReferences) ) return false;
1762 //SetReferenceCount(numReferences);
1764 // iterate over reference entries
1765 bool loadedOk = true;
1766 for ( int i = 0; i < numReferences; ++i )
1767 loadedOk &= LoadReference(i, saveData);
1770 if ( loadedOk && saveData )
1771 m_hasFullDataCache = true;
1773 // return success/failure of load
1777 // load header data from index file, return true if loaded OK
1778 bool BamToolsIndex::BamToolsIndexPrivate::LoadHeader(void) {
1780 // check magic number
1781 if ( !CheckMagicNumber() ) return false;
1783 // check BTI version
1784 if ( !CheckVersion() ) return false;
1786 // read in block size
1787 size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_parent->m_indexStream);
1788 if ( elementsRead != 1 ) return false;
1789 if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
1791 // store offset of beginning of data
1792 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
1794 // return success/failure of load
1795 return (elementsRead == 1);
1798 // load a single index entry from file, return true if loaded OK
1799 // @saveData - save data in memory if true, just read & discard if false
1800 bool BamToolsIndex::BamToolsIndexPrivate::LoadIndexEntry(const int& refId, bool saveData) {
1802 // read in index entry data members
1803 size_t elementsRead = 0;
1804 BamToolsIndexEntry entry;
1805 elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_parent->m_indexStream);
1806 elementsRead += fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_parent->m_indexStream);
1807 elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_parent->m_indexStream);
1808 if ( elementsRead != 3 ) {
1809 cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl;
1813 // swap endian-ness if necessary
1814 if ( m_isBigEndian ) {
1815 SwapEndian_32(entry.MaxEndPosition);
1816 SwapEndian_64(entry.StartOffset);
1817 SwapEndian_32(entry.StartPosition);
1822 SaveOffsetEntry(refId, entry);
1824 // return success/failure of load
1828 // load a single reference from file, return true if loaded OK
1829 // @saveData - save data in memory if true, just read & discard if false
1830 bool BamToolsIndex::BamToolsIndexPrivate::LoadFirstReference(bool saveData) {
1831 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
1832 return LoadReference( (*indexBegin).first, saveData );
1835 // load a single reference from file, return true if loaded OK
1836 // @saveData - save data in memory if true, just read & discard if false
1837 bool BamToolsIndex::BamToolsIndexPrivate::LoadReference(const int& refId, bool saveData) {
1839 // read in number of offsets for this reference
1840 uint32_t numOffsets;
1841 size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_parent->m_indexStream);
1842 if ( elementsRead != 1 ) return false;
1843 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
1845 // initialize offsets container for this reference
1846 SetOffsetCount(refId, (int)numOffsets);
1848 // iterate over offset entries
1849 for ( unsigned int j = 0; j < numOffsets; ++j )
1850 LoadIndexEntry(refId, saveData);
1852 // return success/failure of load
1856 // loads number of references, return true if loaded OK
1857 bool BamToolsIndex::BamToolsIndexPrivate::LoadReferenceCount(int& numReferences) {
1859 size_t elementsRead = 0;
1861 // read reference count
1862 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1863 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1865 // return success/failure of load
1866 return ( elementsRead == 1 );
1869 // saves an index offset entry in memory
1870 void BamToolsIndex::BamToolsIndexPrivate::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) {
1871 BamToolsReferenceEntry& refEntry = m_indexData[refId];
1872 refEntry.HasAlignments = true;
1873 refEntry.Offsets.push_back(entry);
1876 // pre-allocates size for offset vector
1877 void BamToolsIndex::BamToolsIndexPrivate::SetOffsetCount(const int& refId, const int& offsetCount) {
1878 BamToolsReferenceEntry& refEntry = m_indexData[refId];
1879 refEntry.Offsets.reserve(offsetCount);
1880 refEntry.HasAlignments = ( offsetCount > 0);
1883 // initializes index data structure to hold @count references
1884 void BamToolsIndex::BamToolsIndexPrivate::SetReferenceCount(const int& count) {
1885 for ( int i = 0; i < count; ++i )
1886 m_indexData[i].HasAlignments = false;
1889 // position file pointer to first reference begin, return true if skipped OK
1890 bool BamToolsIndex::BamToolsIndexPrivate::SkipToFirstReference(void) {
1891 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
1892 return SkipToReference( (*indexBegin).first );
1895 // position file pointer to desired reference begin, return true if skipped OK
1896 bool BamToolsIndex::BamToolsIndexPrivate::SkipToReference(const int& refId) {
1899 if ( !m_parent->Rewind() ) return false;
1901 // read in number of references
1902 int32_t numReferences;
1903 size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1904 if ( elementsRead != 1 ) return false;
1905 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1907 // iterate over reference entries
1908 bool skippedOk = true;
1909 int currentRefId = 0;
1910 while (currentRefId != refId) {
1911 skippedOk &= LoadReference(currentRefId, false);
1915 // return success/failure of skip
1919 // write header to new index file
1920 bool BamToolsIndex::BamToolsIndexPrivate::WriteHeader(void) {
1922 size_t elementsWritten = 0;
1924 // write BTI index format 'magic number'
1925 elementsWritten += fwrite("BTI\1", 1, 4, m_parent->m_indexStream);
1927 // write BTI index format version
1928 int32_t currentVersion = (int32_t)m_outputVersion;
1929 if ( m_isBigEndian ) SwapEndian_32(currentVersion);
1930 elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_parent->m_indexStream);
1933 int32_t blockSize = m_blockSize;
1934 if ( m_isBigEndian ) SwapEndian_32(blockSize);
1935 elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_parent->m_indexStream);
1937 // store offset of beginning of data
1938 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
1940 // return success/failure of write
1941 return ( elementsWritten == 6 );
1944 // write index data for all references to new index file
1945 bool BamToolsIndex::BamToolsIndexPrivate::WriteAllReferences(void) {
1947 size_t elementsWritten = 0;
1949 // write number of references
1950 int32_t numReferences = (int32_t)m_indexData.size();
1951 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1952 elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1954 // iterate through references in index
1956 BamToolsIndexData::const_iterator refIter = m_indexData.begin();
1957 BamToolsIndexData::const_iterator refEnd = m_indexData.end();
1958 for ( ; refIter != refEnd; ++refIter )
1959 refOk &= WriteReferenceEntry( (*refIter).second );
1961 return ( (elementsWritten == 1) && refOk );
1964 // write current reference index data to new index file
1965 bool BamToolsIndex::BamToolsIndexPrivate::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) {
1967 size_t elementsWritten = 0;
1969 // write number of offsets listed for this reference
1970 uint32_t numOffsets = refEntry.Offsets.size();
1971 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
1972 elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_parent->m_indexStream);
1974 // iterate over offset entries
1975 bool entriesOk = true;
1976 vector<BamToolsIndexEntry>::const_iterator offsetIter = refEntry.Offsets.begin();
1977 vector<BamToolsIndexEntry>::const_iterator offsetEnd = refEntry.Offsets.end();
1978 for ( ; offsetIter != offsetEnd; ++offsetIter )
1979 entriesOk &= WriteIndexEntry( (*offsetIter) );
1981 return ( (elementsWritten == 1) && entriesOk );
1984 // write current index offset entry to new index file
1985 bool BamToolsIndex::BamToolsIndexPrivate::WriteIndexEntry(const BamToolsIndexEntry& entry) {
1988 int32_t maxEndPosition = entry.MaxEndPosition;
1989 int64_t startOffset = entry.StartOffset;
1990 int32_t startPosition = entry.StartPosition;
1992 // swap endian-ness if necessary
1993 if ( m_isBigEndian ) {
1994 SwapEndian_32(maxEndPosition);
1995 SwapEndian_64(startOffset);
1996 SwapEndian_32(startPosition);
1999 // write the reference index entry
2000 size_t elementsWritten = 0;
2001 elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_parent->m_indexStream);
2002 elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_parent->m_indexStream);
2003 elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_parent->m_indexStream);
2004 return ( elementsWritten == 3 );
2007 // ---------------------------------------------------
2008 // BamToolsIndex implementation
2010 BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader)
2011 : BamIndex(bgzf, reader)
2013 d = new BamToolsIndexPrivate(this);
2016 BamToolsIndex::~BamToolsIndex(void) {
2021 // BamIndex interface implementation
2022 bool BamToolsIndex::Build(void) { return d->Build(); }
2023 void BamToolsIndex::ClearAllData(void) { d->ClearAllData(); }
2024 const off_t BamToolsIndex::DataBeginOffset(void) const { return d->DataBeginOffset(); }
2025 bool BamToolsIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); }
2026 bool BamToolsIndex::HasFullDataCache(void) const { return d->HasFullDataCache(); }
2027 bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
2028 void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) { d->KeepOnlyFirstReferenceOffsets(); }
2029 bool BamToolsIndex::LoadAllReferences(bool saveData) { return d->LoadAllReferences(saveData); }
2030 bool BamToolsIndex::LoadFirstReference(bool saveData) { return d->LoadFirstReference(saveData); }
2031 bool BamToolsIndex::LoadHeader(void) { return d->LoadHeader(); }
2032 bool BamToolsIndex::SkipToFirstReference(void) { return d->SkipToFirstReference(); }
2033 bool BamToolsIndex::WriteAllReferences(void) { return d->WriteAllReferences(); }
2034 bool BamToolsIndex::WriteHeader(void) { return d->WriteHeader(); }