1 // ***************************************************************************
2 // BamIndex.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 9 September 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index functionality - both for the default (standardized) BAM
9 // index format (.bai) as well as a BamTools-specific (nonstandard) index
11 // ***************************************************************************
19 #include "BamReader.h"
22 using namespace BamTools;
24 // -------------------------------
25 // BamIndex implementation
27 BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian)
30 , m_isBigEndian(isBigEndian)
32 if ( m_reader && m_reader->IsOpen() )
33 m_references = m_reader->GetReferenceData();
36 bool BamIndex::HasAlignments(const int& referenceID) {
38 // return false if invalid ID
39 if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) )
42 // else return status of reference (has alignments?)
44 return m_references.at(referenceID).RefHasAlignments;
47 // #########################################################################################
48 // #########################################################################################
50 // -------------------------------
51 // BamStandardIndex structs & typedefs
55 // --------------------------------------------------
56 // BamStandardIndex data structures & typedefs
64 Chunk(const uint64_t& start = 0,
65 const uint64_t& stop = 0)
71 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
72 return lhs.Start < rhs.Start;
75 typedef vector<Chunk> ChunkVector;
76 typedef map<uint32_t, ChunkVector> BamBinMap;
77 typedef vector<uint64_t> LinearOffsetVector;
79 struct ReferenceIndex {
83 LinearOffsetVector Offsets;
86 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
87 const LinearOffsetVector& offsets = LinearOffsetVector())
93 typedef vector<ReferenceIndex> BamStandardIndexData;
95 } // namespace BamTools
97 // -------------------------------
98 // BamStandardIndex implementation
100 struct BamStandardIndex::BamStandardIndexPrivate {
102 // -------------------------
105 BamStandardIndexData m_indexData;
106 BamStandardIndex* m_parent;
108 // -------------------------
111 BamStandardIndexPrivate(BamStandardIndex* parent) : m_parent(parent) { }
112 ~BamStandardIndexPrivate(void) { m_indexData.clear(); }
114 // -------------------------
117 // calculate bins that overlap region
118 int BinsFromRegion(const BamTools::BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]);
119 // saves BAM bin entry for index
120 void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);
121 // saves linear offset entry for index
122 void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);
123 // simplifies index by merging 'chunks'
124 void MergeChunks(void);
128 BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
129 : BamIndex(bgzf, reader, isBigEndian)
131 d = new BamStandardIndexPrivate(this);
134 BamStandardIndex::~BamStandardIndex(void) {
139 // calculate bins that overlap region
140 int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) {
142 // get region boundaries
143 uint32_t begin = (unsigned int)region.LeftPosition;
146 // if right bound specified AND left&right bounds are on same reference
147 // OK to use right bound position
148 if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
149 end = (unsigned int)region.RightPosition;
151 // otherwise, use end of left bound reference as cutoff
153 end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
155 // initialize list, bin '0' always a valid bin
159 // get rest of bins that contain this region
161 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; }
162 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; }
163 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; }
164 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; }
165 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
167 // return number of bins stored
171 bool BamStandardIndex::Build(void) {
173 // be sure reader & BGZF file are valid & open for reading
174 if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
177 // move file pointer to beginning of alignments
180 // get reference count, reserve index space
181 int numReferences = (int)m_references.size();
182 for ( int i = 0; i < numReferences; ++i ) {
183 d->m_indexData.push_back(ReferenceIndex());
186 // sets default constant for bin, ID, offset, coordinate variables
187 const uint32_t defaultValue = 0xffffffffu;
190 uint32_t saveBin(defaultValue);
191 uint32_t lastBin(defaultValue);
194 int32_t saveRefID(defaultValue);
195 int32_t lastRefID(defaultValue);
198 uint64_t saveOffset = m_BGZF->Tell();
199 uint64_t lastOffset = saveOffset;
202 int32_t lastCoordinate = defaultValue;
204 BamAlignment bAlignment;
205 while ( m_reader->GetNextAlignmentCore(bAlignment) ) {
207 // change of chromosome, save ID, reset bin
208 if ( lastRefID != bAlignment.RefID ) {
209 lastRefID = bAlignment.RefID;
210 lastBin = defaultValue;
213 // if lastCoordinate greater than BAM position - file not sorted properly
214 else if ( lastCoordinate > bAlignment.Position ) {
215 fprintf(stderr, "BAM file not properly sorted:\n");
216 fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);
220 // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
221 if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
223 // save linear offset entry (matched to BAM entry refID)
224 ReferenceIndex& refIndex = d->m_indexData.at(bAlignment.RefID);
225 LinearOffsetVector& offsets = refIndex.Offsets;
226 d->InsertLinearOffset(offsets, bAlignment, lastOffset);
229 // if current BamAlignment bin != lastBin, "then possibly write the binning index"
230 if ( bAlignment.Bin != lastBin ) {
232 // if not first time through
233 if ( saveBin != defaultValue ) {
235 // save Bam bin entry
236 ReferenceIndex& refIndex = d->m_indexData.at(saveRefID);
237 BamBinMap& binMap = refIndex.Bins;
238 d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
242 saveOffset = lastOffset;
245 saveBin = bAlignment.Bin;
246 lastBin = bAlignment.Bin;
249 saveRefID = bAlignment.RefID;
251 // if invalid RefID, break out
252 if ( saveRefID < 0 ) break;
255 // make sure that current file pointer is beyond lastOffset
256 if ( m_BGZF->Tell() <= (int64_t)lastOffset ) {
257 fprintf(stderr, "Error in BGZF offsets.\n");
262 lastOffset = m_BGZF->Tell();
264 // update lastCoordinate
265 lastCoordinate = bAlignment.Position;
268 // save any leftover BAM data (as long as refID is valid)
269 if ( saveRefID >= 0 ) {
270 // save Bam bin entry
271 ReferenceIndex& refIndex = d->m_indexData.at(saveRefID);
272 BamBinMap& binMap = refIndex.Bins;
273 d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
276 // simplify index by merging chunks
279 // iterate through references in index
280 // store whether reference has data &
281 // sort offsets in linear offset vector
282 BamStandardIndexData::iterator indexIter = d->m_indexData.begin();
283 BamStandardIndexData::iterator indexEnd = d->m_indexData.end();
284 for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
286 // get reference index data
287 ReferenceIndex& refIndex = (*indexIter);
288 BamBinMap& binMap = refIndex.Bins;
289 LinearOffsetVector& offsets = refIndex.Offsets;
291 // store whether reference has alignments or no
292 m_references[i].RefHasAlignments = ( binMap.size() > 0 );
294 // sort linear offsets
295 sort(offsets.begin(), offsets.end());
298 // rewind file pointer to beginning of alignments, return success/fail
299 return m_reader->Rewind();
302 bool BamStandardIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
304 // calculate which bins overlap this region
305 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
306 int numBins = d->BinsFromRegion(region, isRightBoundSpecified, bins);
308 // get bins for this reference
309 const ReferenceIndex& refIndex = d->m_indexData.at(region.LeftRefID);
310 const BamBinMap& binMap = refIndex.Bins;
312 // get minimum offset to consider
313 const LinearOffsetVector& linearOffsets = refIndex.Offsets;
314 uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
316 // store all alignment 'chunk' starts (file offsets) for bins in this region
317 for ( int i = 0; i < numBins; ++i ) {
319 const uint16_t binKey = bins[i];
320 map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
321 if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
323 // iterate over chunks
324 const ChunkVector& chunks = (*binIter).second;
325 std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
326 std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
327 for ( ; chunksIter != chunksEnd; ++chunksIter) {
329 // if valid chunk found, store its file offset
330 const Chunk& chunk = (*chunksIter);
331 if ( chunk.Stop > minOffset )
332 offsets.push_back( chunk.Start );
340 // sort the offsets before returning
341 sort(offsets.begin(), offsets.end());
343 // return whether any offsets were found
344 return ( offsets.size() != 0 );
347 // saves BAM bin entry for index
348 void BamStandardIndex::BamStandardIndexPrivate::InsertBinEntry(BamBinMap& binMap,
349 const uint32_t& saveBin,
350 const uint64_t& saveOffset,
351 const uint64_t& lastOffset)
354 BamBinMap::iterator binIter = binMap.find(saveBin);
357 Chunk newChunk(saveOffset, lastOffset);
359 // if entry doesn't exist
360 if ( binIter == binMap.end() ) {
361 ChunkVector newChunks;
362 newChunks.push_back(newChunk);
363 binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
368 ChunkVector& binChunks = (*binIter).second;
369 binChunks.push_back( newChunk );
373 // saves linear offset entry for index
374 void BamStandardIndex::BamStandardIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets,
375 const BamAlignment& bAlignment,
376 const uint64_t& lastOffset)
378 // get converted offsets
379 int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
380 int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
382 // resize vector if necessary
383 int oldSize = offsets.size();
384 int newSize = endOffset + 1;
385 if ( oldSize < newSize )
386 offsets.resize(newSize, 0);
389 for( int i = beginOffset + 1; i <= endOffset; ++i ) {
390 if ( offsets[i] == 0 )
391 offsets[i] = lastOffset;
395 bool BamStandardIndex::Jump(const BamRegion& region) {
397 cerr << "Jumping in BAI" << endl;
399 if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) {
400 fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
404 // see if left-bound reference of region has alignments
405 if ( !HasAlignments(region.LeftRefID) ) return false;
407 // make sure left-bound position is valid
408 if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) return false;
410 vector<int64_t> offsets;
411 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) {
412 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
416 // iterate through offsets
417 BamAlignment bAlignment;
419 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
421 // attempt seek & load first available alignment
422 result &= m_BGZF->Seek(*o);
423 m_reader->GetNextAlignmentCore(bAlignment);
425 // if this alignment corresponds to desired position
426 // return success of seeking back to 'current offset'
427 if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) {
428 if ( o != offsets.begin() ) --o;
429 return m_BGZF->Seek(*o);
433 if ( !result ) fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
437 bool BamStandardIndex::Load(const string& filename) {
439 // open index file, abort on error
440 FILE* indexStream = fopen(filename.c_str(), "rb");
442 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
446 // set placeholder to receive input byte count (suppresses compiler warnings)
447 size_t elementsRead = 0;
449 // see if index is valid BAM index
451 elementsRead = fread(magic, 1, 4, indexStream);
452 if ( strncmp(magic, "BAI\1", 4) ) {
453 fprintf(stderr, "Problem with index file - invalid format.\n");
458 // get number of reference sequences
460 elementsRead = fread(&numRefSeqs, 4, 1, indexStream);
461 if ( m_isBigEndian ) SwapEndian_32(numRefSeqs);
463 // intialize space for BamStandardIndexData data structure
464 d->m_indexData.reserve(numRefSeqs);
466 // iterate over reference sequences
467 for ( unsigned int i = 0; i < numRefSeqs; ++i ) {
469 // get number of bins for this reference sequence
471 elementsRead = fread(&numBins, 4, 1, indexStream);
472 if ( m_isBigEndian ) SwapEndian_32(numBins);
475 RefData& refEntry = m_references[i];
476 refEntry.RefHasAlignments = true;
479 // intialize BinVector
482 // iterate over bins for that reference sequence
483 for ( int j = 0; j < numBins; ++j ) {
487 elementsRead = fread(&binID, 4, 1, indexStream);
489 // get number of regionChunks in this bin
491 elementsRead = fread(&numChunks, 4, 1, indexStream);
493 if ( m_isBigEndian ) {
494 SwapEndian_32(binID);
495 SwapEndian_32(numChunks);
498 // intialize ChunkVector
499 ChunkVector regionChunks;
500 regionChunks.reserve(numChunks);
502 // iterate over regionChunks in this bin
503 for ( unsigned int k = 0; k < numChunks; ++k ) {
505 // get chunk boundaries (left, right)
508 elementsRead = fread(&left, 8, 1, indexStream);
509 elementsRead = fread(&right, 8, 1, indexStream);
511 if ( m_isBigEndian ) {
513 SwapEndian_64(right);
517 regionChunks.push_back( Chunk(left, right) );
520 // sort chunks for this bin
521 sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );
523 // save binID, chunkVector for this bin
524 binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );
527 // -----------------------------------------------------
528 // load linear index for this reference sequence
530 // get number of linear offsets
531 int32_t numLinearOffsets;
532 elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);
533 if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets);
535 // intialize LinearOffsetVector
536 LinearOffsetVector offsets;
537 offsets.reserve(numLinearOffsets);
539 // iterate over linear offsets for this reference sequeence
540 uint64_t linearOffset;
541 for ( int j = 0; j < numLinearOffsets; ++j ) {
542 // read a linear offset & store
543 elementsRead = fread(&linearOffset, 8, 1, indexStream);
544 if ( m_isBigEndian ) SwapEndian_64(linearOffset);
545 offsets.push_back(linearOffset);
548 // sort linear offsets
549 sort( offsets.begin(), offsets.end() );
551 // store index data for that reference sequence
552 d->m_indexData.push_back( ReferenceIndex(binMap, offsets) );
555 // close index file (.bai) and return
560 // merges 'alignment chunks' in BAM bin (used for index building)
561 void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) {
563 // iterate over reference enties
564 BamStandardIndexData::iterator indexIter = m_indexData.begin();
565 BamStandardIndexData::iterator indexEnd = m_indexData.end();
566 for ( ; indexIter != indexEnd; ++indexIter ) {
568 // get BAM bin map for this reference
569 ReferenceIndex& refIndex = (*indexIter);
570 BamBinMap& bamBinMap = refIndex.Bins;
572 // iterate over BAM bins
573 BamBinMap::iterator binIter = bamBinMap.begin();
574 BamBinMap::iterator binEnd = bamBinMap.end();
575 for ( ; binIter != binEnd; ++binIter ) {
577 // get chunk vector for this bin
578 ChunkVector& binChunks = (*binIter).second;
579 if ( binChunks.size() == 0 ) continue;
581 ChunkVector mergedChunks;
582 mergedChunks.push_back( binChunks[0] );
584 // iterate over chunks
586 ChunkVector::iterator chunkIter = binChunks.begin();
587 ChunkVector::iterator chunkEnd = binChunks.end();
588 for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
590 // get 'currentChunk' based on numeric index
591 Chunk& currentChunk = mergedChunks[i];
593 // get iteratorChunk based on vector iterator
594 Chunk& iteratorChunk = (*chunkIter);
596 // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)
597 if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {
599 // set currentChunk.Stop to iteratorChunk.Stop
600 currentChunk.Stop = iteratorChunk.Stop;
605 // set currentChunk + 1 to iteratorChunk
606 mergedChunks.push_back(iteratorChunk);
611 // saved merged chunk vector
612 (*binIter).second = mergedChunks;
617 // writes in-memory index data out to file
618 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
619 bool BamStandardIndex::Write(const std::string& bamFilename) {
621 string indexFilename = bamFilename + ".bai";
622 FILE* indexStream = fopen(indexFilename.c_str(), "wb");
623 if ( indexStream == 0 ) {
624 fprintf(stderr, "ERROR: Could not open file to save index.\n");
628 // write BAM index header
629 fwrite("BAI\1", 1, 4, indexStream);
631 // write number of reference sequences
632 int32_t numReferenceSeqs = d->m_indexData.size();
633 if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs);
634 fwrite(&numReferenceSeqs, 4, 1, indexStream);
636 // iterate over reference sequences
637 BamStandardIndexData::const_iterator indexIter = d->m_indexData.begin();
638 BamStandardIndexData::const_iterator indexEnd = d->m_indexData.end();
639 for ( ; indexIter != indexEnd; ++ indexIter ) {
641 // get reference index data
642 const ReferenceIndex& refIndex = (*indexIter);
643 const BamBinMap& binMap = refIndex.Bins;
644 const LinearOffsetVector& offsets = refIndex.Offsets;
646 // write number of bins
647 int32_t binCount = binMap.size();
648 if ( m_isBigEndian ) SwapEndian_32(binCount);
649 fwrite(&binCount, 4, 1, indexStream);
652 BamBinMap::const_iterator binIter = binMap.begin();
653 BamBinMap::const_iterator binEnd = binMap.end();
654 for ( ; binIter != binEnd; ++binIter ) {
656 // get bin data (key and chunk vector)
657 uint32_t binKey = (*binIter).first;
658 const ChunkVector& binChunks = (*binIter).second;
661 if ( m_isBigEndian ) SwapEndian_32(binKey);
662 fwrite(&binKey, 4, 1, indexStream);
665 int32_t chunkCount = binChunks.size();
666 if ( m_isBigEndian ) SwapEndian_32(chunkCount);
667 fwrite(&chunkCount, 4, 1, indexStream);
669 // iterate over chunks
670 ChunkVector::const_iterator chunkIter = binChunks.begin();
671 ChunkVector::const_iterator chunkEnd = binChunks.end();
672 for ( ; chunkIter != chunkEnd; ++chunkIter ) {
674 // get current chunk data
675 const Chunk& chunk = (*chunkIter);
676 uint64_t start = chunk.Start;
677 uint64_t stop = chunk.Stop;
679 if ( m_isBigEndian ) {
680 SwapEndian_64(start);
684 // save chunk offsets
685 fwrite(&start, 8, 1, indexStream);
686 fwrite(&stop, 8, 1, indexStream);
690 // write linear offsets size
691 int32_t offsetSize = offsets.size();
692 if ( m_isBigEndian ) SwapEndian_32(offsetSize);
693 fwrite(&offsetSize, 4, 1, indexStream);
695 // iterate over linear offsets
696 LinearOffsetVector::const_iterator offsetIter = offsets.begin();
697 LinearOffsetVector::const_iterator offsetEnd = offsets.end();
698 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
700 // write linear offset value
701 uint64_t linearOffset = (*offsetIter);
702 if ( m_isBigEndian ) SwapEndian_64(linearOffset);
703 fwrite(&linearOffset, 8, 1, indexStream);
707 // flush buffer, close file, and return success
713 // #########################################################################################
714 // #########################################################################################
716 // -------------------------------------
717 // BamToolsIndex implementation
721 struct BamToolsIndexEntry {
726 int32_t StartPosition;
729 BamToolsIndexEntry(const int64_t& offset = 0,
730 const int32_t& id = -1,
731 const int32_t& position = -1)
734 , StartPosition(position)
738 typedef vector<BamToolsIndexEntry> BamToolsIndexData;
740 } // namespace BamTools
742 struct BamToolsIndex::BamToolsIndexPrivate {
744 // -------------------------
746 BamToolsIndexData m_indexData;
747 BamToolsIndex* m_parent;
750 // -------------------------
753 BamToolsIndexPrivate(BamToolsIndex* parent)
758 ~BamToolsIndexPrivate(void) { }
760 // -------------------------
764 BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
765 : BamIndex(bgzf, reader, isBigEndian)
767 d = new BamToolsIndexPrivate(this);
770 BamToolsIndex::~BamToolsIndex(void) {
775 bool BamToolsIndex::Build(void) {
777 // be sure reader & BGZF file are valid & open for reading
778 if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
781 // move file pointer to beginning of alignments
784 // plow through alignments, store block offsets
785 int32_t currentBlockCount = 0;
786 int64_t blockStartOffset = m_BGZF->Tell();
787 int32_t blockStartId = -1;
788 int32_t blockStartPosition = -1;
790 while ( m_reader->GetNextAlignmentCore(al) ) {
792 // set reference flag
793 m_references[al.RefID].RefHasAlignments = true;
795 // if beginning of block, save first alignment's refID & position
796 if ( currentBlockCount == 0 ) {
797 blockStartId = al.RefID;
798 blockStartPosition = al.Position;
801 // increment block counter
804 // if block is full, get offset for next block, reset currentBlockCount
805 if ( currentBlockCount == d->m_blockSize ) {
806 d->m_indexData.push_back( BamToolsIndexEntry(blockStartOffset, blockStartId, blockStartPosition) );
807 blockStartOffset = m_BGZF->Tell();
808 currentBlockCount = 0;
812 return m_reader->Rewind();
815 // N.B. - ignores isRightBoundSpecified
816 bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
818 // return false if no index data present
819 if ( d->m_indexData.empty() ) return false;
821 // clear any prior data
825 /* bool found = false;
826 int64_t previousOffset = -1;
827 BamToolsIndexData::const_iterator indexIter = d->m_indexData.end() - 1;
828 BamToolsIndexData::const_iterator indexBegin = d->m_indexData.begin();
829 for ( ; indexIter >= indexBegin; --indexIter ) {
831 const BamToolsIndexEntry& entry = (*indexIter);
833 cerr << "Considering entry at " << entry.RefID << ":" << entry.Position << " => " << entry.Offset << endl;
835 if ( entry.RefID < region.LeftRefID) {
840 if ( (entry.RefID == region.LeftRefID) && (entry.Position < region.LeftPosition) ) {
845 previousOffset = entry.Offset;
848 if ( !found || previousOffset == -1 ) return false;
850 // store offset & return success
852 cerr << "Using offset: " << previousOffset << endl;
855 offsets.push_back(previousOffset);
859 // cerr << "--------------------------------------------------" << endl;
860 // cerr << "Looking for " << region.LeftRefID << ":" << region.LeftPosition << endl;
863 // calculate nearest index to jump to
864 int64_t previousOffset = -1;
865 BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin();
866 BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end();
867 for ( ; indexIter != indexEnd; ++indexIter ) {
869 const BamToolsIndexEntry& entry = (*indexIter);
871 // cerr << "Considering entry at " << entry.RefID << ":" << entry.StartPosition << " => " << entry.Offset << endl;
873 // check if we are 'past' beginning of desired region
874 // if so, we will break out & use previously stored offset
875 if ( entry.RefID > region.LeftRefID ) break;
876 if ( (entry.RefID == region.LeftRefID) && (entry.StartPosition >= region.LeftPosition) ) break;
878 // not past desired region, so store current entry offset in previousOffset
879 previousOffset = entry.Offset;
883 if ( indexIter == indexEnd ) return false;
885 // first offset matches (so previousOffset was never set)
886 if ( previousOffset == -1 ) previousOffset = (*indexIter).Offset;
888 // store offset & return success
890 // cerr << "Using offset: " << previousOffset << endl;
892 offsets.push_back(previousOffset);
896 bool BamToolsIndex::Jump(const BamRegion& region) {
898 if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) {
899 fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
903 // see if left-bound reference of region has alignments
904 if ( !HasAlignments(region.LeftRefID) ) return false;
906 // make sure left-bound position is valid
907 if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) return false;
909 vector<int64_t> offsets;
910 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) {
911 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
915 // iterate through offsets
916 BamAlignment bAlignment;
918 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
920 // attempt seek & load first available alignment
921 result &= m_BGZF->Seek(*o);
922 m_reader->GetNextAlignmentCore(bAlignment);
924 // if this alignment corresponds to desired position
925 // return success of seeking back to 'current offset'
926 if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) {
927 if ( o != offsets.begin() ) --o;
928 return m_BGZF->Seek(*o);
932 if ( !result ) fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
936 bool BamToolsIndex::Load(const string& filename) {
938 // open index file, abort on error
939 FILE* indexStream = fopen(filename.c_str(), "rb");
941 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
945 // set placeholder to receive input byte count (suppresses compiler warnings)
946 size_t elementsRead = 0;
948 // see if index is valid BAM index
950 elementsRead = fread(magic, 1, 4, indexStream);
951 if ( strncmp(magic, "BTI\1", 4) ) {
952 fprintf(stderr, "Problem with index file - invalid format.\n");
957 // read in block size
958 elementsRead = fread(&d->m_blockSize, sizeof(d->m_blockSize), 1, indexStream);
959 if ( m_isBigEndian ) SwapEndian_32(d->m_blockSize);
961 // read in number of offsets
963 elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream);
964 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
966 // reserve space for index data
967 d->m_indexData.reserve(numOffsets);
969 // iterate over index entries
970 for ( unsigned int i = 0; i < numOffsets; ++i ) {
977 elementsRead = fread(&offset, sizeof(offset), 1, indexStream);
978 elementsRead = fread(&id, sizeof(id), 1, indexStream);
979 elementsRead = fread(&position, sizeof(position), 1, indexStream);
981 // swap endian-ness if necessary
982 if ( m_isBigEndian ) {
983 SwapEndian_64(offset);
985 SwapEndian_32(position);
988 // save reference index entry
989 d->m_indexData.push_back( BamToolsIndexEntry(offset, id, position) );
991 // set reference flag
992 m_references[id].RefHasAlignments = true; // what about sparse references? wont be able to set flag?
995 // close index file and return
1000 // writes in-memory index data out to file
1001 // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
1002 bool BamToolsIndex::Write(const std::string& bamFilename) {
1004 // open index file for writing
1005 string indexFilename = bamFilename + ".bti";
1006 FILE* indexStream = fopen(indexFilename.c_str(), "wb");
1007 if ( indexStream == 0 ) {
1008 fprintf(stderr, "ERROR: Could not open file to save index.\n");
1012 // write BAM index header
1013 fwrite("BTI\1", 1, 4, indexStream);
1016 int32_t blockSize = d->m_blockSize;
1017 if ( m_isBigEndian ) SwapEndian_32(blockSize);
1018 fwrite(&blockSize, sizeof(blockSize), 1, indexStream);
1020 // write number of offset entries
1021 uint32_t numOffsets = d->m_indexData.size();
1022 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
1023 fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream);
1025 // iterate over offset entries
1026 BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin();
1027 BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end();
1028 for ( ; indexIter != indexEnd; ++ indexIter ) {
1030 // get reference index data
1031 const BamToolsIndexEntry& entry = (*indexIter);
1034 int64_t offset = entry.Offset;
1035 int32_t id = entry.RefID;
1036 int32_t position = entry.StartPosition;
1038 // swap endian-ness if necessary
1039 if ( m_isBigEndian ) {
1040 SwapEndian_64(offset);
1042 SwapEndian_32(position);
1045 // write the reference index entry
1046 fwrite(&offset, sizeof(offset), 1, indexStream);
1047 fwrite(&id, sizeof(id), 1, indexStream);
1048 fwrite(&position, sizeof(position), 1, indexStream);
1051 // flush any remaining output, close file, and return success
1052 fflush(indexStream);
1053 fclose(indexStream);