1 // ***************************************************************************
\r
2 // BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 16 June 2010 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
\r
10 // ---------------------------------------------------------------------------
\r
11 // Provides the basic functionality for reading BAM files
\r
12 // ***************************************************************************
\r
15 #include <algorithm>
\r
20 // BamTools includes
\r
22 #include "BamReader.h"
\r
23 using namespace BamTools;
\r
24 using namespace std;
\r
26 struct BamReader::BamReaderPrivate {
\r
28 // -------------------------------
\r
30 // -------------------------------
\r
32 // general file data
\r
36 RefVector References;
\r
38 int64_t AlignmentsBeginOffset;
\r
40 string IndexFilename;
\r
45 // user-specified region values
\r
47 bool IsLeftBoundSpecified;
\r
48 bool IsRightBoundSpecified;
\r
50 bool IsRegionSpecified;
\r
54 // BAM character constants
\r
55 const char* DNA_LOOKUP;
\r
56 const char* CIGAR_LOOKUP;
\r
58 // -------------------------------
\r
59 // constructor & destructor
\r
60 // -------------------------------
\r
61 BamReaderPrivate(void);
\r
62 ~BamReaderPrivate(void);
\r
64 // -------------------------------
\r
65 // "public" interface
\r
66 // -------------------------------
\r
70 bool Jump(int refID, int position = 0);
\r
71 void Open(const string& filename, const string& indexFilename = "");
\r
73 bool SetRegion(const BamRegion& region);
\r
75 // access alignment data
\r
76 bool GetNextAlignment(BamAlignment& bAlignment);
\r
77 bool GetNextAlignmentCore(BamAlignment& bAlignment);
\r
79 // access auxiliary data
\r
80 int GetReferenceID(const string& refName) const;
\r
83 bool CreateIndex(void);
\r
85 // -------------------------------
\r
87 // -------------------------------
\r
89 // *** reading alignments and auxiliary data *** //
\r
91 // calculate bins that overlap region ( left to reference end for now )
\r
92 int BinsFromRegion(int refID, int left, uint16_t[MAX_BIN]);
\r
93 // fills out character data for BamAlignment data
\r
94 bool BuildCharData(BamAlignment& bAlignment);
\r
95 // calculate file offset for first alignment chunk overlapping 'left'
\r
96 int64_t GetOffset(int refID, int left);
\r
97 // checks to see if alignment overlaps current region
\r
98 bool IsOverlap(BamAlignment& bAlignment);
\r
99 // retrieves header text from BAM file
\r
100 void LoadHeaderData(void);
\r
101 // retrieves BAM alignment under file pointer
\r
102 bool LoadNextAlignment(BamAlignment& bAlignment);
\r
103 // builds reference data structure from BAM file
\r
104 void LoadReferenceData(void);
\r
106 // *** index file handling *** //
\r
108 // calculates index for BAM file
\r
109 bool BuildIndex(void);
\r
110 // clear out inernal index data structure
\r
111 void ClearIndex(void);
\r
112 // saves BAM bin entry for index
\r
113 void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);
\r
114 // saves linear offset entry for index
\r
115 void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);
\r
116 // loads index from BAM index file
\r
117 bool LoadIndex(void);
\r
118 // simplifies index by merging 'chunks'
\r
119 void MergeChunks(void);
\r
120 // saves index to BAM index file
\r
121 bool WriteIndex(void);
\r
124 // -----------------------------------------------------
\r
125 // BamReader implementation (wrapper around BRPrivate)
\r
126 // -----------------------------------------------------
\r
129 BamReader::BamReader(void) {
\r
130 d = new BamReaderPrivate;
\r
134 BamReader::~BamReader(void) {
\r
140 void BamReader::Close(void) { d->Close(); }
\r
141 bool BamReader::Jump(int refID, int position) {
\r
142 d->Region.LeftRefID = refID;
\r
143 d->Region.LeftPosition = position;
\r
144 d->IsLeftBoundSpecified = true;
\r
145 d->IsRightBoundSpecified = false;
\r
146 return d->Jump(refID, position);
\r
148 void BamReader::Open(const string& filename, const string& indexFilename) { d->Open(filename, indexFilename); }
\r
149 bool BamReader::Rewind(void) { return d->Rewind(); }
\r
150 bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); }
\r
151 bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) {
\r
152 return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) );
\r
155 // access alignment data
\r
156 bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); }
\r
157 bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); }
\r
159 // access auxiliary data
\r
160 const string BamReader::GetHeaderText(void) const { return d->HeaderText; }
\r
161 int BamReader::GetReferenceCount(void) const { return d->References.size(); }
\r
162 const RefVector BamReader::GetReferenceData(void) const { return d->References; }
\r
163 int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }
\r
164 const std::string BamReader::GetFilename(void) const { return d->Filename; }
\r
166 // index operations
\r
167 bool BamReader::CreateIndex(void) { return d->CreateIndex(); }
\r
169 // -----------------------------------------------------
\r
170 // BamReaderPrivate implementation
\r
171 // -----------------------------------------------------
\r
174 BamReader::BamReaderPrivate::BamReaderPrivate(void)
\r
175 : IsIndexLoaded(false)
\r
176 , AlignmentsBeginOffset(0)
\r
177 , IsLeftBoundSpecified(false)
\r
178 , IsRightBoundSpecified(false)
\r
179 , IsRegionSpecified(false)
\r
182 , DNA_LOOKUP("=ACMGRSVTWYHKDBN")
\r
183 , CIGAR_LOOKUP("MIDNSHP")
\r
185 IsBigEndian = SystemIsBigEndian();
\r
189 BamReader::BamReaderPrivate::~BamReaderPrivate(void) {
\r
193 // calculate bins that overlap region ( left to reference end for now )
\r
194 int BamReader::BamReaderPrivate::BinsFromRegion(int refID, int left, uint16_t list[MAX_BIN]) {
\r
196 // get region boundaries
\r
197 uint32_t begin = (unsigned int)left;
\r
198 uint32_t end = (unsigned int)References.at(refID).RefLength - 1;
\r
200 // initialize list, bin '0' always a valid bin
\r
204 // get rest of bins that contain this region
\r
206 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }
\r
207 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }
\r
208 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }
\r
209 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }
\r
210 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
\r
212 // return number of bins stored
\r
216 bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {
\r
218 // calculate character lengths/offsets
\r
219 const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;
\r
220 const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);
\r
221 const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;
\r
222 const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;
\r
223 const unsigned int tagDataLength = dataLength - tagDataOffset;
\r
225 // set up char buffers
\r
226 const char* allCharData = bAlignment.SupportData.AllCharData.data();
\r
227 const char* seqData = ((const char*)allCharData) + seqDataOffset;
\r
228 const char* qualData = ((const char*)allCharData) + qualDataOffset;
\r
229 char* tagData = ((char*)allCharData) + tagDataOffset;
\r
231 // save query sequence
\r
232 bAlignment.QueryBases.clear();
\r
233 bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);
\r
234 for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {
\r
235 char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
\r
236 bAlignment.QueryBases.append(1, singleBase);
\r
239 // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
\r
240 bAlignment.Qualities.clear();
\r
241 bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength);
\r
242 for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {
\r
243 char singleQuality = (char)(qualData[i]+33);
\r
244 bAlignment.Qualities.append(1, singleQuality);
\r
247 // parse CIGAR to build 'AlignedBases'
\r
248 bAlignment.AlignedBases.clear();
\r
249 bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength);
\r
252 vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();
\r
253 vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end();
\r
254 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
\r
256 const CigarOp& op = (*cigarIter);
\r
261 bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases
\r
265 k += op.Length; // for 'S' - soft clip, skip over query bases
\r
269 bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character
\r
273 bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character
\r
277 bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence
\r
282 break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op
\r
285 printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here
\r
290 // -----------------------
\r
291 // Added: 3-25-2010 DWB
\r
292 // Fixed: endian-correctness for tag data
\r
293 // -----------------------
\r
294 if ( IsBigEndian ) {
\r
296 while ( (unsigned int)i < tagDataLength ) {
\r
298 i += 2; // skip tag type (e.g. "RG", "NM", etc)
\r
299 uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
\r
300 ++i; // skip value type
\r
310 SwapEndian_16p(&tagData[i]);
\r
311 i+=2; // sizeof(uint16_t)
\r
316 SwapEndian_32p(&tagData[i]);
\r
317 i+=4; // sizeof(uint32_t)
\r
321 SwapEndian_64p(&tagData[i]);
\r
322 i+=8; // sizeof(uint64_t)
\r
327 while (tagData[i]) { ++i; }
\r
328 ++i; // increment one more for null terminator
\r
332 printf("ERROR: Invalid tag value type\n"); // shouldn't get here
\r
339 bAlignment.TagData.clear();
\r
340 bAlignment.TagData.resize(tagDataLength);
\r
341 memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);
\r
343 // set support data parsed flag
\r
344 bAlignment.SupportData.IsParsed = true;
\r
350 // populates BAM index data structure from BAM file data
\r
351 bool BamReader::BamReaderPrivate::BuildIndex(void) {
\r
353 // check to be sure file is open
\r
354 if (!mBGZF.IsOpen) { return false; }
\r
356 // move file pointer to beginning of alignments
\r
359 // get reference count, reserve index space
\r
360 int numReferences = References.size();
\r
361 for ( int i = 0; i < numReferences; ++i ) {
\r
362 Index.push_back(ReferenceIndex());
\r
365 // sets default constant for bin, ID, offset, coordinate variables
\r
366 const uint32_t defaultValue = 0xffffffffu;
\r
369 uint32_t saveBin(defaultValue);
\r
370 uint32_t lastBin(defaultValue);
\r
372 // reference ID data
\r
373 int32_t saveRefID(defaultValue);
\r
374 int32_t lastRefID(defaultValue);
\r
377 uint64_t saveOffset = mBGZF.Tell();
\r
378 uint64_t lastOffset = saveOffset;
\r
381 int32_t lastCoordinate = defaultValue;
\r
383 BamAlignment bAlignment;
\r
384 while( GetNextAlignment(bAlignment) ) {
\r
386 // change of chromosome, save ID, reset bin
\r
387 if ( lastRefID != bAlignment.RefID ) {
\r
388 lastRefID = bAlignment.RefID;
\r
389 lastBin = defaultValue;
\r
392 // if lastCoordinate greater than BAM position - file not sorted properly
\r
393 else if ( lastCoordinate > bAlignment.Position ) {
\r
394 printf("BAM file not properly sorted:\n");
\r
395 printf("Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);
\r
399 // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
\r
400 if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
\r
402 // save linear offset entry (matched to BAM entry refID)
\r
403 ReferenceIndex& refIndex = Index.at(bAlignment.RefID);
\r
404 LinearOffsetVector& offsets = refIndex.Offsets;
\r
405 InsertLinearOffset(offsets, bAlignment, lastOffset);
\r
408 // if current BamAlignment bin != lastBin, "then possibly write the binning index"
\r
409 if ( bAlignment.Bin != lastBin ) {
\r
411 // if not first time through
\r
412 if ( saveBin != defaultValue ) {
\r
414 // save Bam bin entry
\r
415 ReferenceIndex& refIndex = Index.at(saveRefID);
\r
416 BamBinMap& binMap = refIndex.Bins;
\r
417 InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
\r
420 // update saveOffset
\r
421 saveOffset = lastOffset;
\r
423 // update bin values
\r
424 saveBin = bAlignment.Bin;
\r
425 lastBin = bAlignment.Bin;
\r
427 // update saveRefID
\r
428 saveRefID = bAlignment.RefID;
\r
430 // if invalid RefID, break out (why?)
\r
431 if ( saveRefID < 0 ) { break; }
\r
434 // make sure that current file pointer is beyond lastOffset
\r
435 if ( mBGZF.Tell() <= (int64_t)lastOffset ) {
\r
436 printf("Error in BGZF offsets.\n");
\r
440 // update lastOffset
\r
441 lastOffset = mBGZF.Tell();
\r
443 // update lastCoordinate
\r
444 lastCoordinate = bAlignment.Position;
\r
447 // save any leftover BAM data (as long as refID is valid)
\r
448 if ( saveRefID >= 0 ) {
\r
449 // save Bam bin entry
\r
450 ReferenceIndex& refIndex = Index.at(saveRefID);
\r
451 BamBinMap& binMap = refIndex.Bins;
\r
452 InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
\r
455 // simplify index by merging chunks
\r
458 // iterate over references
\r
459 BamIndex::iterator indexIter = Index.begin();
\r
460 BamIndex::iterator indexEnd = Index.end();
\r
461 for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
\r
463 // get reference index data
\r
464 ReferenceIndex& refIndex = (*indexIter);
\r
465 BamBinMap& binMap = refIndex.Bins;
\r
466 LinearOffsetVector& offsets = refIndex.Offsets;
\r
468 // store whether reference has alignments or no
\r
469 References[i].RefHasAlignments = ( binMap.size() > 0 );
\r
471 // sort linear offsets
\r
472 sort(offsets.begin(), offsets.end());
\r
476 // rewind file pointer to beginning of alignments, return success/fail
\r
481 // clear index data structure
\r
482 void BamReader::BamReaderPrivate::ClearIndex(void) {
\r
483 Index.clear(); // sufficient ??
\r
486 // closes the BAM file
\r
487 void BamReader::BamReaderPrivate::Close(void) {
\r
490 HeaderText.clear();
\r
491 IsLeftBoundSpecified = false;
\r
492 IsRightBoundSpecified = false;
\r
493 IsRegionSpecified = false;
\r
496 // create BAM index from BAM file (keep structure in memory) and write to default index output file
\r
497 bool BamReader::BamReaderPrivate::CreateIndex(void) {
\r
502 // build (& save) index from BAM file
\r
504 ok &= BuildIndex();
\r
505 ok &= WriteIndex();
\r
507 // return success/fail
\r
511 // returns RefID for given RefName (returns References.size() if not found)
\r
512 int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {
\r
514 // retrieve names from reference data
\r
515 vector<string> refNames;
\r
516 RefVector::const_iterator refIter = References.begin();
\r
517 RefVector::const_iterator refEnd = References.end();
\r
518 for ( ; refIter != refEnd; ++refIter) {
\r
519 refNames.push_back( (*refIter).RefName );
\r
522 // return 'index-of' refName ( if not found, returns refNames.size() )
\r
523 return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));
\r
526 // get next alignment (from specified region, if given)
\r
527 bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {
\r
529 // if valid alignment found, attempt to parse char data, and return success/failure
\r
530 if ( GetNextAlignmentCore(bAlignment) )
\r
531 return BuildCharData(bAlignment);
\r
533 // no valid alignment found
\r
538 // retrieves next available alignment core data (returns success/fail)
\r
539 // ** DOES NOT parse any character data (bases, qualities, tag data)
\r
540 // these can be accessed, if necessary, from the supportData
\r
541 // useful for operations requiring ONLY positional or other alignment-related information
\r
542 bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {
\r
544 // if valid alignment available
\r
545 if ( LoadNextAlignment(bAlignment) ) {
\r
547 // if region not specified, return success
\r
548 if ( !IsLeftBoundSpecified ) return true;
\r
550 // load next alignment until region overlap is found
\r
551 while ( !IsOverlap(bAlignment) ) {
\r
552 // if no valid alignment available (likely EOF) return failure
\r
553 if ( !LoadNextAlignment(bAlignment) ) return false;
\r
556 // return success (alignment found that overlaps region)
\r
560 // no valid alignment
\r
565 // calculate closest indexed file offset for region specified
\r
566 int64_t BamReader::BamReaderPrivate::GetOffset(int refID, int left) {
\r
568 // calculate which bins overlap this region
\r
569 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
\r
570 int numBins = BinsFromRegion(refID, left, bins);
\r
572 // get bins for this reference
\r
573 const ReferenceIndex& refIndex = Index.at(refID);
\r
574 const BamBinMap& binMap = refIndex.Bins;
\r
576 // get minimum offset to consider
\r
577 const LinearOffsetVector& offsets = refIndex.Offsets;
\r
578 uint64_t minOffset = ( (unsigned int)(left>>BAM_LIDX_SHIFT) >= offsets.size() ) ? 0 : offsets.at(left>>BAM_LIDX_SHIFT);
\r
580 // store offsets to beginning of alignment 'chunks'
\r
581 std::vector<int64_t> chunkStarts;
\r
583 // store all alignment 'chunk' starts for bins in this region
\r
584 for (int i = 0; i < numBins; ++i ) {
\r
585 uint16_t binKey = bins[i];
\r
587 map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
\r
588 if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
\r
590 const ChunkVector& chunks = (*binIter).second;
\r
591 std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
\r
592 std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
\r
593 for ( ; chunksIter != chunksEnd; ++chunksIter) {
\r
594 const Chunk& chunk = (*chunksIter);
\r
595 if ( chunk.Stop > minOffset ) {
\r
596 chunkStarts.push_back( chunk.Start );
\r
605 // if no alignments found, else return smallest offset for alignment starts
\r
606 if ( chunkStarts.size() == 0 ) { return -1; }
\r
607 else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
\r
610 // saves BAM bin entry for index
\r
611 void BamReader::BamReaderPrivate::InsertBinEntry(BamBinMap& binMap,
\r
612 const uint32_t& saveBin,
\r
613 const uint64_t& saveOffset,
\r
614 const uint64_t& lastOffset)
\r
617 BamBinMap::iterator binIter = binMap.find(saveBin);
\r
619 // create new chunk
\r
620 Chunk newChunk(saveOffset, lastOffset);
\r
622 // if entry doesn't exist
\r
623 if ( binIter == binMap.end() ) {
\r
624 ChunkVector newChunks;
\r
625 newChunks.push_back(newChunk);
\r
626 binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
\r
631 ChunkVector& binChunks = (*binIter).second;
\r
632 binChunks.push_back( newChunk );
\r
636 // saves linear offset entry for index
\r
637 void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets,
\r
638 const BamAlignment& bAlignment,
\r
639 const uint64_t& lastOffset)
\r
641 // get converted offsets
\r
642 int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
\r
643 int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
\r
645 // resize vector if necessary
\r
646 int oldSize = offsets.size();
\r
647 int newSize = endOffset + 1;
\r
648 if ( oldSize < newSize ) { offsets.resize(newSize, 0); }
\r
651 for(int i = beginOffset + 1; i <= endOffset ; ++i) {
\r
652 if ( offsets[i] == 0) {
\r
653 offsets[i] = lastOffset;
\r
658 // returns whether alignment overlaps currently specified region
\r
659 // this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true
\r
660 bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {
\r
662 // --------------------------------------------------
\r
663 // check alignment start against right bound cutoff
\r
665 // if full region of interest was given
\r
666 if ( IsRightBoundSpecified ) {
\r
668 // read starts on right bound reference, but AFTER right bound position
\r
669 if ( bAlignment.RefID == Region.RightRefID && bAlignment.Position > Region.RightPosition )
\r
672 // if read starts on reference AFTER right bound, return false
\r
673 if ( bAlignment.RefID > Region.RightRefID )
\r
677 // --------------------------------------------------------
\r
678 // no right bound given OR read starts before right bound
\r
679 // so, check if it overlaps left bound
\r
681 // if read starts on left bound reference AND after left boundary, return success
\r
682 if ( bAlignment.RefID == Region.LeftRefID && bAlignment.Position >= Region.LeftPosition)
\r
685 // if read is on any reference sequence before left bound, return false
\r
686 if ( bAlignment.RefID < Region.LeftRefID )
\r
689 // read is on left bound reference, but starts before left bound position
\r
690 // calculate the alignment end to see if it overlaps
\r
691 return ( bAlignment.GetEndPosition() >= Region.LeftPosition );
\r
694 // jumps to specified region(refID, leftBound) in BAM file, returns success/fail
\r
695 bool BamReader::BamReaderPrivate::Jump(int refID, int position) {
\r
697 // if data exists for this reference and position is valid
\r
698 if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {
\r
700 // calculate offset
\r
701 int64_t offset = GetOffset(Region.LeftRefID, Region.LeftPosition);
\r
703 // if in valid offset, return failure
\r
704 // otherwise return success of seek operation
\r
705 if ( offset == -1 )
\r
708 return mBGZF.Seek(offset);
\r
711 // invalid jump request parameters, return failure
\r
715 // load BAM header data
\r
716 void BamReader::BamReaderPrivate::LoadHeaderData(void) {
\r
718 // check to see if proper BAM header
\r
720 if (mBGZF.Read(buffer, 4) != 4) {
\r
721 printf("Could not read header type\n");
\r
725 if (strncmp(buffer, "BAM\001", 4)) {
\r
726 printf("wrong header type!\n");
\r
730 // get BAM header text length
\r
731 mBGZF.Read(buffer, 4);
\r
732 unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);
\r
733 if ( IsBigEndian ) { SwapEndian_32(headerTextLength); }
\r
735 // get BAM header text
\r
736 char* headerText = (char*)calloc(headerTextLength + 1, 1);
\r
737 mBGZF.Read(headerText, headerTextLength);
\r
738 HeaderText = (string)((const char*)headerText);
\r
740 // clean up calloc-ed temp variable
\r
744 // load existing index data from BAM index file (".bai"), return success/fail
\r
745 bool BamReader::BamReaderPrivate::LoadIndex(void) {
\r
747 // clear out index data
\r
750 // skip if index file empty
\r
751 if ( IndexFilename.empty() ) { return false; }
\r
753 // open index file, abort on error
\r
754 FILE* indexStream = fopen(IndexFilename.c_str(), "rb");
\r
756 printf("ERROR: Unable to open the BAM index file %s for reading.\n", IndexFilename.c_str() );
\r
760 size_t elementsRead = 0;
\r
762 // see if index is valid BAM index
\r
764 elementsRead = fread(magic, 1, 4, indexStream);
\r
765 if (strncmp(magic, "BAI\1", 4)) {
\r
766 printf("Problem with index file - invalid format.\n");
\r
767 fclose(indexStream);
\r
771 // get number of reference sequences
\r
772 uint32_t numRefSeqs;
\r
773 elementsRead = fread(&numRefSeqs, 4, 1, indexStream);
\r
774 if ( IsBigEndian ) { SwapEndian_32(numRefSeqs); }
\r
776 // intialize space for BamIndex data structure
\r
777 Index.reserve(numRefSeqs);
\r
779 // iterate over reference sequences
\r
780 for (unsigned int i = 0; i < numRefSeqs; ++i) {
\r
782 // get number of bins for this reference sequence
\r
784 elementsRead = fread(&numBins, 4, 1, indexStream);
\r
785 if ( IsBigEndian ) { SwapEndian_32(numBins); }
\r
788 RefData& refEntry = References[i];
\r
789 refEntry.RefHasAlignments = true;
\r
792 // intialize BinVector
\r
795 // iterate over bins for that reference sequence
\r
796 for (int j = 0; j < numBins; ++j) {
\r
800 elementsRead = fread(&binID, 4, 1, indexStream);
\r
802 // get number of regionChunks in this bin
\r
803 uint32_t numChunks;
\r
804 elementsRead = fread(&numChunks, 4, 1, indexStream);
\r
806 if ( IsBigEndian ) {
\r
807 SwapEndian_32(binID);
\r
808 SwapEndian_32(numChunks);
\r
811 // intialize ChunkVector
\r
812 ChunkVector regionChunks;
\r
813 regionChunks.reserve(numChunks);
\r
815 // iterate over regionChunks in this bin
\r
816 for (unsigned int k = 0; k < numChunks; ++k) {
\r
818 // get chunk boundaries (left, right)
\r
821 elementsRead = fread(&left, 8, 1, indexStream);
\r
822 elementsRead = fread(&right, 8, 1, indexStream);
\r
824 if ( IsBigEndian ) {
\r
825 SwapEndian_64(left);
\r
826 SwapEndian_64(right);
\r
830 regionChunks.push_back( Chunk(left, right) );
\r
833 // sort chunks for this bin
\r
834 sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );
\r
836 // save binID, chunkVector for this bin
\r
837 binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );
\r
840 // load linear index for this reference sequence
\r
842 // get number of linear offsets
\r
843 int32_t numLinearOffsets;
\r
844 elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);
\r
845 if ( IsBigEndian ) { SwapEndian_32(numLinearOffsets); }
\r
847 // intialize LinearOffsetVector
\r
848 LinearOffsetVector offsets;
\r
849 offsets.reserve(numLinearOffsets);
\r
851 // iterate over linear offsets for this reference sequeence
\r
852 uint64_t linearOffset;
\r
853 for (int j = 0; j < numLinearOffsets; ++j) {
\r
854 // read a linear offset & store
\r
855 elementsRead = fread(&linearOffset, 8, 1, indexStream);
\r
856 if ( IsBigEndian ) { SwapEndian_64(linearOffset); }
\r
857 offsets.push_back(linearOffset);
\r
860 // sort linear offsets
\r
861 sort( offsets.begin(), offsets.end() );
\r
863 // store index data for that reference sequence
\r
864 Index.push_back( ReferenceIndex(binMap, offsets) );
\r
867 // close index file (.bai) and return
\r
868 fclose(indexStream);
\r
872 // populates BamAlignment with alignment data under file pointer, returns success/fail
\r
873 bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {
\r
875 // read in the 'block length' value, make sure it's not zero
\r
877 mBGZF.Read(buffer, 4);
\r
878 bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);
\r
879 if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }
\r
880 if ( bAlignment.SupportData.BlockLength == 0 ) { return false; }
\r
882 // read in core alignment data, make sure the right size of data was read
\r
883 char x[BAM_CORE_SIZE];
\r
884 if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
\r
886 if ( IsBigEndian ) {
\r
887 for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) {
\r
888 SwapEndian_32p(&x[i]);
\r
892 // set BamAlignment 'core' and 'support' data
\r
893 bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]);
\r
894 bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);
\r
896 unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);
\r
897 bAlignment.Bin = tempValue >> 16;
\r
898 bAlignment.MapQuality = tempValue >> 8 & 0xff;
\r
899 bAlignment.SupportData.QueryNameLength = tempValue & 0xff;
\r
901 tempValue = BgzfData::UnpackUnsignedInt(&x[12]);
\r
902 bAlignment.AlignmentFlag = tempValue >> 16;
\r
903 bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;
\r
905 bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);
\r
906 bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);
\r
907 bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);
\r
908 bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);
\r
910 // store 'all char data' and cigar ops
\r
911 const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;
\r
912 const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;
\r
914 char* allCharData = (char*)calloc(sizeof(char), dataLength);
\r
915 uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
\r
917 // read in character data - make sure proper data size was read
\r
918 if ( mBGZF.Read(allCharData, dataLength) != (signed int)dataLength) { return false; }
\r
921 // store alignment name and length
\r
922 bAlignment.Name.assign((const char*)(allCharData));
\r
923 bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;
\r
925 // store remaining 'allCharData' in supportData structure
\r
926 bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);
\r
928 // save CigarOps for BamAlignment
\r
929 bAlignment.CigarData.clear();
\r
930 for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {
\r
932 // swap if necessary
\r
933 if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }
\r
935 // build CigarOp structure
\r
937 op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
\r
938 op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
\r
941 bAlignment.CigarData.push_back(op);
\r
949 // loads reference data from BAM file
\r
950 void BamReader::BamReaderPrivate::LoadReferenceData(void) {
\r
952 // get number of reference sequences
\r
954 mBGZF.Read(buffer, 4);
\r
955 unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);
\r
956 if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); }
\r
957 if (numberRefSeqs == 0) { return; }
\r
958 References.reserve((int)numberRefSeqs);
\r
960 // iterate over all references in header
\r
961 for (unsigned int i = 0; i != numberRefSeqs; ++i) {
\r
963 // get length of reference name
\r
964 mBGZF.Read(buffer, 4);
\r
965 unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);
\r
966 if ( IsBigEndian ) { SwapEndian_32(refNameLength); }
\r
967 char* refName = (char*)calloc(refNameLength, 1);
\r
969 // get reference name and reference sequence length
\r
970 mBGZF.Read(refName, refNameLength);
\r
971 mBGZF.Read(buffer, 4);
\r
972 int refLength = BgzfData::UnpackSignedInt(buffer);
\r
973 if ( IsBigEndian ) { SwapEndian_32(refLength); }
\r
975 // store data for reference
\r
976 RefData aReference;
\r
977 aReference.RefName = (string)((const char*)refName);
\r
978 aReference.RefLength = refLength;
\r
979 References.push_back(aReference);
\r
981 // clean up calloc-ed temp variable
\r
986 // merges 'alignment chunks' in BAM bin (used for index building)
\r
987 void BamReader::BamReaderPrivate::MergeChunks(void) {
\r
989 // iterate over reference enties
\r
990 BamIndex::iterator indexIter = Index.begin();
\r
991 BamIndex::iterator indexEnd = Index.end();
\r
992 for ( ; indexIter != indexEnd; ++indexIter ) {
\r
994 // get BAM bin map for this reference
\r
995 ReferenceIndex& refIndex = (*indexIter);
\r
996 BamBinMap& bamBinMap = refIndex.Bins;
\r
998 // iterate over BAM bins
\r
999 BamBinMap::iterator binIter = bamBinMap.begin();
\r
1000 BamBinMap::iterator binEnd = bamBinMap.end();
\r
1001 for ( ; binIter != binEnd; ++binIter ) {
\r
1003 // get chunk vector for this bin
\r
1004 ChunkVector& binChunks = (*binIter).second;
\r
1005 if ( binChunks.size() == 0 ) { continue; }
\r
1007 ChunkVector mergedChunks;
\r
1008 mergedChunks.push_back( binChunks[0] );
\r
1010 // iterate over chunks
\r
1012 ChunkVector::iterator chunkIter = binChunks.begin();
\r
1013 ChunkVector::iterator chunkEnd = binChunks.end();
\r
1014 for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
\r
1016 // get 'currentChunk' based on numeric index
\r
1017 Chunk& currentChunk = mergedChunks[i];
\r
1019 // get iteratorChunk based on vector iterator
\r
1020 Chunk& iteratorChunk = (*chunkIter);
\r
1022 // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)
\r
1023 if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {
\r
1025 // set currentChunk.Stop to iteratorChunk.Stop
\r
1026 currentChunk.Stop = iteratorChunk.Stop;
\r
1031 // set currentChunk + 1 to iteratorChunk
\r
1032 mergedChunks.push_back(iteratorChunk);
\r
1037 // saved merged chunk vector
\r
1038 (*binIter).second = mergedChunks;
\r
1043 // opens BAM file (and index)
\r
1044 void BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) {
\r
1046 Filename = filename;
\r
1047 IndexFilename = indexFilename;
\r
1049 // open the BGZF file for reading, retrieve header text & reference data
\r
1050 mBGZF.Open(filename, "rb");
\r
1052 LoadReferenceData();
\r
1054 // store file offset of first alignment
\r
1055 AlignmentsBeginOffset = mBGZF.Tell();
\r
1057 // open index file & load index data (if exists)
\r
1058 if ( !IndexFilename.empty() ) {
\r
1063 // returns BAM file pointer to beginning of alignment data
\r
1064 bool BamReader::BamReaderPrivate::Rewind(void) {
\r
1066 // find first reference that has alignments in the BAM file
\r
1068 int refCount = References.size();
\r
1069 for ( ; refID < refCount; ++refID ) {
\r
1070 if ( References.at(refID).RefHasAlignments ) { break; }
\r
1073 // reset default region info
\r
1074 Region.LeftRefID = refID;
\r
1075 Region.LeftPosition = 0;
\r
1076 Region.RightRefID = -1;
\r
1077 Region.RightPosition = -1;
\r
1078 IsLeftBoundSpecified = false;
\r
1079 IsRightBoundSpecified = false;
\r
1081 // return success/failure of seek
\r
1082 return mBGZF.Seek(AlignmentsBeginOffset);
\r
1085 // sets a region of interest (with left & right bound reference/position)
\r
1086 // attempts a Jump() to left bound as well
\r
1087 // returns success/failure of Jump()
\r
1088 bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) {
\r
1090 // save region of interest
\r
1094 if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 )
\r
1095 IsLeftBoundSpecified = true;
\r
1096 if ( region.RightRefID >= 0 && region.RightPosition >= 0 )
\r
1097 IsRightBoundSpecified = true;
\r
1099 // attempt jump to beginning of region, return success/fail of Jump()
\r
1100 return Jump( Region.LeftRefID, Region.LeftPosition );
\r
1103 // saves index data to BAM index file (".bai"), returns success/fail
\r
1104 bool BamReader::BamReaderPrivate::WriteIndex(void) {
\r
1106 IndexFilename = Filename + ".bai";
\r
1107 FILE* indexStream = fopen(IndexFilename.c_str(), "wb");
\r
1108 if ( indexStream == 0 ) {
\r
1109 printf("ERROR: Could not open file to save index\n");
\r
1113 // write BAM index header
\r
1114 fwrite("BAI\1", 1, 4, indexStream);
\r
1116 // write number of reference sequences
\r
1117 int32_t numReferenceSeqs = Index.size();
\r
1118 if ( IsBigEndian ) { SwapEndian_32(numReferenceSeqs); }
\r
1119 fwrite(&numReferenceSeqs, 4, 1, indexStream);
\r
1121 // iterate over reference sequences
\r
1122 BamIndex::const_iterator indexIter = Index.begin();
\r
1123 BamIndex::const_iterator indexEnd = Index.end();
\r
1124 for ( ; indexIter != indexEnd; ++ indexIter ) {
\r
1126 // get reference index data
\r
1127 const ReferenceIndex& refIndex = (*indexIter);
\r
1128 const BamBinMap& binMap = refIndex.Bins;
\r
1129 const LinearOffsetVector& offsets = refIndex.Offsets;
\r
1131 // write number of bins
\r
1132 int32_t binCount = binMap.size();
\r
1133 if ( IsBigEndian ) { SwapEndian_32(binCount); }
\r
1134 fwrite(&binCount, 4, 1, indexStream);
\r
1136 // iterate over bins
\r
1137 BamBinMap::const_iterator binIter = binMap.begin();
\r
1138 BamBinMap::const_iterator binEnd = binMap.end();
\r
1139 for ( ; binIter != binEnd; ++binIter ) {
\r
1141 // get bin data (key and chunk vector)
\r
1142 uint32_t binKey = (*binIter).first;
\r
1143 const ChunkVector& binChunks = (*binIter).second;
\r
1145 // save BAM bin key
\r
1146 if ( IsBigEndian ) { SwapEndian_32(binKey); }
\r
1147 fwrite(&binKey, 4, 1, indexStream);
\r
1149 // save chunk count
\r
1150 int32_t chunkCount = binChunks.size();
\r
1151 if ( IsBigEndian ) { SwapEndian_32(chunkCount); }
\r
1152 fwrite(&chunkCount, 4, 1, indexStream);
\r
1154 // iterate over chunks
\r
1155 ChunkVector::const_iterator chunkIter = binChunks.begin();
\r
1156 ChunkVector::const_iterator chunkEnd = binChunks.end();
\r
1157 for ( ; chunkIter != chunkEnd; ++chunkIter ) {
\r
1159 // get current chunk data
\r
1160 const Chunk& chunk = (*chunkIter);
\r
1161 uint64_t start = chunk.Start;
\r
1162 uint64_t stop = chunk.Stop;
\r
1164 if ( IsBigEndian ) {
\r
1165 SwapEndian_64(start);
\r
1166 SwapEndian_64(stop);
\r
1169 // save chunk offsets
\r
1170 fwrite(&start, 8, 1, indexStream);
\r
1171 fwrite(&stop, 8, 1, indexStream);
\r
1175 // write linear offsets size
\r
1176 int32_t offsetSize = offsets.size();
\r
1177 if ( IsBigEndian ) { SwapEndian_32(offsetSize); }
\r
1178 fwrite(&offsetSize, 4, 1, indexStream);
\r
1180 // iterate over linear offsets
\r
1181 LinearOffsetVector::const_iterator offsetIter = offsets.begin();
\r
1182 LinearOffsetVector::const_iterator offsetEnd = offsets.end();
\r
1183 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
\r
1185 // write linear offset value
\r
1186 uint64_t linearOffset = (*offsetIter);
\r
1187 if ( IsBigEndian ) { SwapEndian_64(linearOffset); }
\r
1188 fwrite(&linearOffset, 8, 1, indexStream);
\r
1192 // flush buffer, close file, and return success
\r
1193 fflush(indexStream);
\r
1194 fclose(indexStream);
\r