4 // Marth Lab, Boston College
\r
5 // Last modified: 6 April 2009
\r
7 #include "BamReader.h"
\r
12 // static character constants
\r
13 const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN";
\r
14 const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
\r
16 BamReader::BamReader(const char* filename, const char* indexFilename)
\r
17 : m_filename( (char*)filename )
\r
18 , m_indexFilename( (char*)indexFilename )
\r
23 , m_isIndexLoaded(false)
\r
24 , m_isRegionSpecified(false)
\r
25 , m_isCalculateAlignedBases(true)
\r
28 , m_alignmentsBeginOffset(0)
\r
33 BamReader::~BamReader(void) {
\r
36 if ( m_isOpen ) { Close(); }
\r
40 bool BamReader::Open(void) {
\r
42 if (!m_isOpen && m_filename != NULL ) {
\r
45 m_file = bam_open(m_filename, "r");
\r
47 // get header info && index info
\r
48 if ( (m_file != NULL) && LoadHeader() ) {
\r
50 // save file offset where alignments start
\r
51 m_alignmentsBeginOffset = bam_tell(m_file);
\r
57 // try to open (and load) index data, if index file given
\r
58 if ( m_indexFilename != NULL ) {
\r
66 bool BamReader::OpenIndex(void) {
\r
68 if ( m_indexFilename && !m_isIndexLoaded ) {
\r
69 m_isIndexLoaded = LoadIndex();
\r
71 return m_isIndexLoaded;
\r
75 bool BamReader::Close(void) {
\r
80 int ret = bam_close(m_file);
\r
82 // delete index info
\r
83 if ( m_index != NULL) { delete m_index; }
\r
89 m_isIndexLoaded = false;
\r
91 // clear region flag
\r
92 m_isRegionSpecified = false;
\r
94 // return success/fail of bam_close
\r
101 // get BAM filename
\r
102 const char* BamReader::Filename(void) const {
\r
103 return (const char*)m_filename;
\r
106 // set BAM filename
\r
107 void BamReader::SetFilename(const char* filename) {
\r
108 m_filename = (char*)filename;
\r
111 // get BAM Index filename
\r
112 const char* BamReader::IndexFilename(void) const {
\r
113 return (const char*)m_indexFilename;
\r
116 // set BAM Index filename
\r
117 void BamReader::SetIndexFilename(const char* indexFilename) {
\r
118 m_indexFilename = (char*)indexFilename;
\r
121 // return full header text
\r
122 const string BamReader::GetHeaderText(void) const {
\r
123 return m_headerText;
\r
126 // return number of reference sequences in BAM file
\r
127 const int BamReader::GetReferenceCount(void) const {
\r
128 return m_references.size();
\r
131 // get RefID from reference name
\r
132 const int BamReader::GetRefID(string refName) const {
\r
134 vector<string> refNames;
\r
135 RefVector::const_iterator refIter = m_references.begin();
\r
136 RefVector::const_iterator refEnd = m_references.end();
\r
137 for ( ; refIter != refEnd; ++refIter) {
\r
138 refNames.push_back( (*refIter).RefName );
\r
141 // return 'index-of' refName (if not found, returns refNames.size())
\r
142 return Index( refNames.begin(), refNames.end(), refName );
\r
145 const RefVector BamReader::GetReferenceData(void) const {
\r
146 return m_references;
\r
149 bool BamReader::Jump(int refID, unsigned int left) {
\r
151 // if index available, and region is valid
\r
152 if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) {
\r
153 m_currentRefID = refID;
\r
154 m_currentLeft = left;
\r
155 m_isRegionSpecified = true;
\r
156 return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 );
\r
161 bool BamReader::Rewind(void) {
\r
164 int refCount = m_references.size();
\r
165 for ( ; refID < refCount; ++refID ) {
\r
166 if ( m_references.at(refID).RefHasAlignments ) { break; }
\r
169 m_currentRefID = refID;
\r
171 m_isRegionSpecified = false;
\r
173 return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );
\r
176 // get next alignment from specified region
\r
177 bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
\r
179 // try to load 'next' read
\r
180 if ( LoadNextAlignment(bAlignment) ) {
\r
182 // if specified region, check for region overlap
\r
183 if ( m_isRegionSpecified ) {
\r
185 // if overlap, return true
\r
186 if ( IsOverlap(bAlignment) ) { return true; }
\r
187 // if not, try the next alignment
\r
188 else { return GetNextAlignment(bAlignment); }
\r
191 // not using region, valid read detected, return success
\r
192 else { return true; }
\r
195 // no valid alignment to load
\r
199 void BamReader::SetCalculateAlignedBases(bool flag) {
\r
200 m_isCalculateAlignedBases = flag;
\r
203 int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
\r
205 // get region boundaries
\r
206 uint32_t begin = left;
\r
207 uint32_t end = m_references.at(refID).RefLength - 1;
\r
209 // initialize list, bin '0' always a valid bin
\r
213 // get rest of bins that contain this region
\r
215 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }
\r
216 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }
\r
217 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }
\r
218 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }
\r
219 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
\r
221 // return number of bins stored
\r
225 uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
\r
227 // initialize alignment end to starting position
\r
228 uint32_t alignEnd = position;
\r
230 // iterate over cigar operations
\r
231 vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
\r
232 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
\r
233 for ( ; cigarIter != cigarEnd; ++cigarIter) {
\r
234 if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {
\r
235 alignEnd += (*cigarIter).Length;
\r
241 uint64_t BamReader::GetOffset(int refID, unsigned int left) {
\r
243 // make space for bins
\r
244 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
\r
246 // returns number of bins overlapping (left, right)
\r
247 // stores indices of those bins in 'bins'
\r
248 int numBins = BinsFromRegion(refID, left, bins);
\r
250 // access index data for refID
\r
251 RefIndex* refIndex = m_index->at(refID);
\r
253 // get list of BAM bins for this reference sequence
\r
254 BinVector* refBins = refIndex->first;
\r
256 sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
\r
258 // declare ChunkVector
\r
259 ChunkVector regionChunks;
\r
261 // declaure LinearOffsetVector
\r
262 LinearOffsetVector* linearOffsets = refIndex->second;
\r
264 // some sort of linear offset vs bin index voodoo... not completely sure what's going here
\r
265 uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
\r
267 BinVector::iterator binBegin = refBins->begin();
\r
268 BinVector::iterator binEnd = refBins->end();
\r
270 // iterate over bins overlapping region, count chunks
\r
271 for (int i = 0; i < numBins; ++i) {
\r
273 // look for bin with ID=bin[i]
\r
274 BinVector::iterator binIter = binBegin;
\r
276 for ( ; binIter != binEnd; ++binIter ) {
\r
278 // if found, increment n_off by number of chunks for each bin
\r
279 if ( (*binIter).first == (uint32_t)bins[i] ) {
\r
281 // iterate over chunks in that bin
\r
282 ChunkVector* binChunks = (*binIter).second;
\r
284 ChunkVector::iterator chunkIter = binChunks->begin();
\r
285 ChunkVector::iterator chunkEnd = binChunks->end();
\r
286 for ( ; chunkIter != chunkEnd; ++chunkIter) {
\r
288 // if right bound of pair is greater than min_off (linear offset value), store pair
\r
289 if ( (*chunkIter).second > minOffset) {
\r
290 regionChunks.push_back( (*chunkIter) );
\r
297 // clean up temp array
\r
300 // there should be at least 1
\r
301 assert(regionChunks.size() > 0);
\r
303 // sort chunks by start position
\r
304 sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );
\r
306 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
\r
307 int numOffsets = regionChunks.size();
\r
308 for (int i = 1; i < numOffsets; ++i) {
\r
309 if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {
\r
310 regionChunks.at(i-1).second = regionChunks.at(i).first;
\r
314 // merge adjacent chunks
\r
316 for (int i = 1; i < numOffsets; ++i) {
\r
317 // if adjacent, expand boundaries of (merged) chunk
\r
318 if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {
\r
319 regionChunks.at(l).second = regionChunks.at(i).second;
\r
321 // else, move on to next (merged) chunk index
\r
322 else { regionChunks.at(++l) = regionChunks.at(i); }
\r
325 // return beginning file offset of first chunk for region
\r
326 return regionChunks.at(0).first;
\r
329 bool BamReader::IsOverlap(BamAlignment& bAlignment) {
\r
331 // if on different reference sequence, quit
\r
332 if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }
\r
334 // read starts after left boundary
\r
335 if ( bAlignment.Position >= m_currentLeft) { return true; }
\r
337 // get alignment end
\r
338 uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);
\r
340 // return whether alignment end overlaps left boundary
\r
341 return ( alignEnd >= m_currentLeft );
\r
344 bool BamReader::LoadHeader(void) {
\r
346 // check to see if proper BAM header
\r
348 if (bam_read(m_file, buf, 4) != 4) { return false; }
\r
349 if (strncmp(buf, "BAM\001", 4)) {
\r
350 fprintf(stderr, "wrong header type!\n");
\r
354 // get BAM header text length
\r
355 int32_t headerTextLength;
\r
356 bam_read(m_file, &headerTextLength, 4);
\r
358 // get BAM header text
\r
359 char* headerText = (char*)calloc(headerTextLength + 1, 1);
\r
360 bam_read(m_file, headerText, headerTextLength);
\r
361 m_headerText = (string)((const char*)headerText);
\r
363 // clean up calloc-ed temp variable
\r
366 // get number of reference sequences
\r
367 int32_t numberRefSeqs;
\r
368 bam_read(m_file, &numberRefSeqs, 4);
\r
369 if (numberRefSeqs == 0) { return false; }
\r
371 m_references.reserve((int)numberRefSeqs);
\r
373 // reference variables
\r
374 int32_t refNameLength;
\r
376 uint32_t refLength;
\r
378 // iterate over all references in header
\r
379 for (int i = 0; i != numberRefSeqs; ++i) {
\r
381 // get length of reference name
\r
382 bam_read(m_file, &refNameLength, 4);
\r
383 refName = (char*)calloc(refNameLength, 1);
\r
385 // get reference name and reference sequence length
\r
386 bam_read(m_file, refName, refNameLength);
\r
387 bam_read(m_file, &refLength, 4);
\r
389 // store data for reference
\r
390 RefData aReference;
\r
391 aReference.RefName = (string)((const char*)refName);
\r
392 aReference.RefLength = refLength;
\r
393 m_references.push_back(aReference);
\r
395 // clean up calloc-ed temp variable
\r
402 bool BamReader::LoadIndex(void) {
\r
404 // check to see if index file exists
\r
406 if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {
\r
407 fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");
\r
411 // see if index is valid BAM index
\r
413 fread(magic, 1, 4, indexFile);
\r
414 if (strncmp(magic, "BAI\1", 4)) {
\r
415 fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");
\r
420 // get number of reference sequences
\r
421 uint32_t numRefSeqs;
\r
422 fread(&numRefSeqs, 4, 1, indexFile);
\r
424 // intialize BamIndex data structure
\r
425 m_index = new BamIndex;
\r
426 m_index->reserve(numRefSeqs);
\r
428 // iterate over reference sequences
\r
429 for (unsigned int i = 0; i < numRefSeqs; ++i) {
\r
431 // get number of bins for this reference sequence
\r
433 fread(&numBins, 4, 1, indexFile);
\r
435 if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
\r
437 // intialize BinVector
\r
438 BinVector* bins = new BinVector;
\r
439 bins->reserve(numBins);
\r
441 // iterate over bins for that reference sequence
\r
442 for (int j = 0; j < numBins; ++j) {
\r
446 fread(&binID, 4, 1, indexFile);
\r
448 // get number of regionChunks in this bin
\r
449 uint32_t numChunks;
\r
450 fread(&numChunks, 4, 1, indexFile);
\r
452 // intialize ChunkVector
\r
453 ChunkVector* regionChunks = new ChunkVector;
\r
454 regionChunks->reserve(numChunks);
\r
456 // iterate over regionChunks in this bin
\r
457 for (unsigned int k = 0; k < numChunks; ++k) {
\r
459 // get chunk boundaries (left, right)
\r
462 fread(&left, 8, 1, indexFile);
\r
463 fread(&right, 8, 1, indexFile);
\r
466 regionChunks->push_back( ChunkPair(left, right) );
\r
469 // save binID, chunkVector for this bin
\r
470 bins->push_back( BamBin(binID, regionChunks) );
\r
473 // load linear index for this reference sequence
\r
475 // get number of linear offsets
\r
476 int32_t numLinearOffsets;
\r
477 fread(&numLinearOffsets, 4, 1, indexFile);
\r
479 // intialize LinearOffsetVector
\r
480 LinearOffsetVector* linearOffsets = new LinearOffsetVector;
\r
481 linearOffsets->reserve(numLinearOffsets);
\r
483 // iterate over linear offsets for this reference sequeence
\r
484 for (int j = 0; j < numLinearOffsets; ++j) {
\r
485 // get a linear offset
\r
486 uint64_t linearOffset;
\r
487 fread(&linearOffset, 8, 1, indexFile);
\r
488 // store linear offset
\r
489 linearOffsets->push_back(linearOffset);
\r
492 // store index data for that reference sequence
\r
493 m_index->push_back( new RefIndex(bins, linearOffsets) );
\r
496 // close index file (.bai) and return
\r
501 bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
\r
503 // check valid alignment block header data
\r
508 int32_t bytesRead = 0;
\r
510 // read in the 'block length' value, make sure it's not zero
\r
511 if ( (ret = bam_read(m_file, &block_len, 4)) == 0 ) { return false; }
\r
514 // read in core alignment data, make the right size of data was read
\r
515 if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
\r
516 bytesRead += BAM_CORE_SIZE;
\r
518 // set BamAlignment 'core' data
\r
519 bAlignment.RefID = x[0];
\r
520 bAlignment.Position = x[1];
\r
521 bAlignment.Bin = x[2]>>16;
\r
522 bAlignment.MapQuality = x[2]>>8&0xff;
\r
523 bAlignment.AlignmentFlag = x[3]>>16;
\r
524 bAlignment.MateRefID = x[5];
\r
525 bAlignment.MatePosition = x[6];
\r
526 bAlignment.InsertSize = x[7];
\r
528 // fetch & store often-used lengths for character data parsing
\r
529 unsigned int queryNameLength = x[2]&0xff;
\r
530 unsigned int numCigarOperations = x[3]&0xffff;
\r
531 unsigned int querySequenceLength = x[4];
\r
533 // get length of character data
\r
534 int dataLength = block_len - BAM_CORE_SIZE;
\r
536 // set up destination buffers for character data
\r
537 uint8_t* allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);
\r
538 uint32_t* cigarData = (uint32_t*)(allCharData+queryNameLength);
\r
540 // get character data - make sure proper data size was read
\r
541 if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }
\r
544 bytesRead += dataLength;
\r
546 // clear out bases, qualities, aligned bases, and CIGAR
\r
547 bAlignment.QueryBases.clear();
\r
548 bAlignment.Qualities.clear();
\r
549 bAlignment.AlignedBases.clear();
\r
550 bAlignment.CigarData.clear();
\r
553 bAlignment.Name = (string)((const char*)(allCharData));
\r
556 char singleBase[2];
\r
557 uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);
\r
558 for (unsigned int i = 0; i < querySequenceLength; ++i) {
\r
559 // numeric to char conversion
\r
560 sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );
\r
561 // append character data to Bases
\r
562 bAlignment.QueryBases.append( (const char*)singleBase );
\r
565 // save sequence length
\r
566 bAlignment.Length = bAlignment.QueryBases.length();
\r
569 char singleQuality[4];
\r
570 uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);
\r
571 for (unsigned int i = 0; i < querySequenceLength; ++i) {
\r
572 // numeric to char conversion
\r
573 sprintf( singleQuality, "%c", ( t[i]+33 ) );
\r
574 // append character data to Qualities
\r
575 bAlignment.Qualities.append( (const char*)singleQuality );
\r
578 // save CIGAR-related data;
\r
580 for (unsigned int i = 0; i < numCigarOperations; ++i) {
\r
582 // build CigarOp struct
\r
584 op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
\r
585 op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
\r
588 bAlignment.CigarData.push_back(op);
\r
590 // can skip this step if user wants to ignore
\r
591 if (m_isCalculateAlignedBases) {
\r
593 // build AlignedBases string
\r
597 case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases
\r
598 case ('S') : k += op.Length; // for 'S' - skip over query bases
\r
602 case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding;
\r
605 case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
\r
609 case ('H') : break; // for 'H' - do nothing, move to next op
\r
611 default : assert(false); // shouldn't get here
\r
620 // (optional) read tag parsing data
\r
625 // still data to read (auxiliary tags)
\r
626 while ( bytesRead < block_len ) {
\r
628 if ( bam_read(m_file, &data, 1) == 1 ) {
\r
632 if (bytesRead == block_len && data != '\0') {
\r
633 fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");
\r
637 tag.append(1, data);
\r
638 if ( data == '\0' ) {
\r
639 bAlignment.Tags.push_back(tag);
\r
643 if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }
\r
647 fprintf(stderr, "ERROR: Could not read tag info.\n");
\r