]> git.donarmstrong.com Git - bamtools.git/blob - BamReader.cpp
f1c70611f3cc7c900b5301db92a43020ab518b97
[bamtools.git] / BamReader.cpp
1 // BamReader.cpp
2
3 // Derek Barnett
4 // Marth Lab, Boston College
5 // Last modified: 23 April 2009
6
7 #include "BamReader.h"
8 #include <iostream>
9 using std::cerr;
10 using std::endl;
11
12 // static character constants
13 const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";
14 const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
15
16 BamReader::BamReader(const char* filename, const char* indexFilename) 
17         : m_filename( (char*)filename )
18         , m_indexFilename( (char*)indexFilename )
19         , m_file(0)
20         , m_index(NULL)
21         , m_headerText("")
22         , m_isOpen(false)
23         , m_isIndexLoaded(false)
24         , m_isRegionSpecified(false)
25         , m_isCalculateAlignedBases(true)
26         , m_currentRefID(0)
27         , m_currentLeft(0)
28         , m_alignmentsBeginOffset(0)
29 {
30         Open();
31 }
32
33 BamReader::~BamReader(void) {
34
35         // close file
36         if ( m_isOpen ) { Close(); }
37 }
38
39 // open BAM file
40 bool BamReader::Open(void) {
41
42         if (!m_isOpen && m_filename != NULL ) {
43
44                 // open file
45                 m_file = bam_open(m_filename, "r"); 
46                 
47                 // get header info && index info
48                 if ( (m_file != NULL) && LoadHeader() ) {
49
50                         // save file offset where alignments start
51                         m_alignmentsBeginOffset = bam_tell(m_file);
52                         
53                         // set open flag
54                         m_isOpen = true;
55                 }
56
57                 // try to open (and load) index data, if index file given
58                 if ( m_indexFilename != NULL ) {
59                         OpenIndex();
60                 }
61         }
62
63         return m_isOpen;
64 }
65
66 bool BamReader::OpenIndex(void) {
67
68         if ( m_indexFilename && !m_isIndexLoaded ) {
69                 m_isIndexLoaded = LoadIndex();
70         }
71         return m_isIndexLoaded;
72 }
73
74 // close BAM file
75 bool BamReader::Close(void) {
76         
77         if (m_isOpen) {
78
79                 // close file
80                 int ret = bam_close(m_file);
81                 
82                 // delete index info
83                 if ( m_index != NULL) { delete m_index; }
84
85                 // clear open flag
86                 m_isOpen = false;
87
88                 // clear index flag
89                 m_isIndexLoaded = false;
90
91                 // clear region flag
92                 m_isRegionSpecified = false;
93
94                 // return success/fail of bam_close
95                 return (ret == 0);
96         } 
97
98         return true;
99 }
100
101 // get BAM filename
102 const char* BamReader::Filename(void) const { 
103         return (const char*)m_filename; 
104 }
105
106 // set BAM filename
107 void BamReader::SetFilename(const char* filename) {
108         m_filename = (char*)filename;
109 }
110
111 // get BAM Index filename
112 const char* BamReader::IndexFilename(void) const { 
113         return (const char*)m_indexFilename; 
114 }
115
116 // set BAM Index filename
117 void BamReader::SetIndexFilename(const char* indexFilename) {
118         m_indexFilename = (char*)indexFilename;
119 }
120
121 // return full header text
122 const string BamReader::GetHeaderText(void) const { 
123         return m_headerText; 
124 }
125
126 // return number of reference sequences in BAM file
127 const int BamReader::GetReferenceCount(void) const { 
128         return m_references.size();
129 }
130
131 // get RefID from reference name
132 const int BamReader::GetRefID(string refName) const { 
133         
134         vector<string> refNames;
135         RefVector::const_iterator refIter = m_references.begin();
136     RefVector::const_iterator refEnd  = m_references.end();
137     for ( ; refIter != refEnd; ++refIter) {
138                 refNames.push_back( (*refIter).RefName );
139     }
140
141         // return 'index-of' refName (if not found, returns refNames.size())
142         return Index( refNames.begin(), refNames.end(), refName );
143 }
144
145 const RefVector BamReader::GetReferenceData(void) const {
146         return m_references;
147 }
148
149 bool BamReader::Jump(int refID, unsigned int left) {
150
151         // if index available, and region is valid
152         if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { 
153                 m_currentRefID = refID;
154                 m_currentLeft  = left;
155                 m_isRegionSpecified = true;
156                 
157                 int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
158                 if ( offset == -1 ) { return false; }
159                 else { return ( bam_seek(m_file, offset, SEEK_SET) == 0 ); }
160         }
161         return false;
162 }
163
164 bool BamReader::Rewind(void) {
165
166         int refID = 0;
167         int refCount = m_references.size();
168         for ( ; refID < refCount; ++refID ) {
169                 if ( m_references.at(refID).RefHasAlignments ) { break; } 
170         }
171
172         m_currentRefID = refID;
173         m_currentLeft = 0;
174         m_isRegionSpecified = false;
175
176         return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );
177 }       
178
179 // get next alignment from specified region
180 bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
181
182         // try to load 'next' read
183         if ( LoadNextAlignment(bAlignment) ) {
184
185                 // if specified region, check for region overlap
186                 if ( m_isRegionSpecified ) {
187
188                         // if overlap, return true
189                         if ( IsOverlap(bAlignment) ) { return true; }
190                         // if not, try the next alignment
191                         else { return GetNextAlignment(bAlignment); }
192                 } 
193
194                 // not using region, valid read detected, return success
195                 else { return true; }
196         }
197
198         // no valid alignment to load
199         return false;
200 }
201
202 void BamReader::SetCalculateAlignedBases(bool flag) {
203         m_isCalculateAlignedBases = flag;
204 }
205
206 int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
207
208         // get region boundaries
209         uint32_t begin = left;
210         uint32_t end   = m_references.at(refID).RefLength - 1;
211
212         // initialize list, bin '0' always a valid bin
213         int i = 0;
214         list[i++] = 0;
215
216         // get rest of bins that contain this region
217         unsigned int k;
218         for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { list[i++] = k; }
219         for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { list[i++] = k; }
220         for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { list[i++] = k; }
221         for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { list[i++] = k; }
222         for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
223         
224         // return number of bins stored
225         return i;
226 }
227
228 uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
229
230         // initialize alignment end to starting position
231         uint32_t alignEnd = position;
232
233         // iterate over cigar operations
234         vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
235         vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
236         for ( ; cigarIter != cigarEnd; ++cigarIter) {
237                 if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {
238                         alignEnd += (*cigarIter).Length;
239                 }
240         }
241         return alignEnd;
242 }
243
244 int64_t BamReader::GetOffset(int refID, unsigned int left) {
245
246         //  make space for bins
247         uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);                 
248         
249         // returns number of bins overlapping (left, right)
250         // stores indices of those bins in 'bins'
251         int numBins = BinsFromRegion(refID, left, bins);                                
252
253         // access index data for refID
254         RefIndex* refIndex = m_index->at(refID);
255
256         // get list of BAM bins for this reference sequence
257         BinVector* refBins = refIndex->first;
258
259         sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
260
261         // declare ChunkVector
262         ChunkVector regionChunks;
263
264         // declaure LinearOffsetVector
265         LinearOffsetVector* linearOffsets = refIndex->second;
266
267         // some sort of linear offset vs bin index voodoo... not completely sure what's going here
268         uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
269
270         BinVector::iterator binBegin = refBins->begin();
271         BinVector::iterator binEnd   = refBins->end();
272
273         // iterate over bins overlapping region, count chunks
274         for (int i = 0; i < numBins; ++i) {
275                 
276                 // look for bin with ID=bin[i]
277                 BinVector::iterator binIter = binBegin;
278
279                 for ( ; binIter != binEnd; ++binIter ) {
280                 
281                         // if found, increment n_off by number of chunks for each bin
282                         if ( (*binIter).first == (uint32_t)bins[i] ) { 
283                                 
284                                 // iterate over chunks in that bin
285                                 ChunkVector* binChunks = (*binIter).second;
286                                 
287                                 ChunkVector::iterator chunkIter = binChunks->begin();
288                                 ChunkVector::iterator chunkEnd  = binChunks->end();
289                                 for ( ; chunkIter != chunkEnd; ++chunkIter) {
290                                 
291                                         // if right bound of pair is greater than min_off (linear offset value), store pair
292                                         if ( (*chunkIter).second > minOffset) { 
293                                                 regionChunks.push_back( (*chunkIter) ); 
294                                         }
295                                 }
296                         }
297                 }
298         }
299
300         // clean up temp array
301         free(bins);
302
303         // there should be at least 1
304         if(regionChunks.size() == 0) { return -1; }
305
306         // sort chunks by start position
307         sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );
308
309         // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
310         int numOffsets = regionChunks.size();   
311         for (int i = 1; i < numOffsets; ++i) {
312                 if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {
313                         regionChunks.at(i-1).second = regionChunks.at(i).first;
314                 }
315         }
316         
317         // merge adjacent chunks
318         int l = 0;
319         for (int i = 1; i < numOffsets; ++i) {
320                 // if adjacent, expand boundaries of (merged) chunk
321                 if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {
322                         regionChunks.at(l).second = regionChunks.at(i).second;
323                 }
324                 // else, move on to next (merged) chunk index
325                 else { regionChunks.at(++l) = regionChunks.at(i); }
326         }
327
328         // return beginning file offset of first chunk for region
329         return (int64_t)regionChunks.at(0).first;
330 }
331
332 bool BamReader::IsOverlap(BamAlignment& bAlignment) {
333
334         // if on different reference sequence, quit
335         if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }
336
337         // read starts after left boundary
338         if ( bAlignment.Position >= m_currentLeft) { return true; }
339
340         // get alignment end
341         uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);
342
343         // return whether alignment end overlaps left boundary
344         return ( alignEnd >= m_currentLeft );
345 }
346
347 bool BamReader::LoadHeader(void) {
348
349         // check to see if proper BAM header
350         char buf[4];
351         if (bam_read(m_file, buf, 4) != 4) { return false; }
352         if (strncmp(buf, "BAM\001", 4)) {
353                 fprintf(stderr, "wrong header type!\n");
354                 return false;
355         }
356         
357         // get BAM header text length
358         int32_t headerTextLength;
359         bam_read(m_file, &headerTextLength, 4);
360
361         // get BAM header text
362         char* headerText = (char*)calloc(headerTextLength + 1, 1);
363         bam_read(m_file, headerText, headerTextLength);
364         m_headerText = (string)((const char*)headerText);
365         
366         // clean up calloc-ed temp variable
367         free(headerText);
368
369         // get number of reference sequences
370         int32_t numberRefSeqs;
371         bam_read(m_file, &numberRefSeqs, 4);
372         if (numberRefSeqs == 0) { return false; }
373
374         m_references.reserve((int)numberRefSeqs);
375         
376         // reference variables
377         int32_t  refNameLength;
378         char*    refName;
379         uint32_t refLength;
380
381         // iterate over all references in header
382         for (int i = 0; i != numberRefSeqs; ++i) {
383
384                 // get length of reference name
385                 bam_read(m_file, &refNameLength, 4);
386                 refName = (char*)calloc(refNameLength, 1);
387
388                 // get reference name and reference sequence length
389                 bam_read(m_file, refName, refNameLength);
390                 bam_read(m_file, &refLength, 4);
391
392                 // store data for reference
393                 RefData aReference;
394                 aReference.RefName   = (string)((const char*)refName);
395                 aReference.RefLength = refLength;
396                 m_references.push_back(aReference);
397
398                 // clean up calloc-ed temp variable
399                 free(refName);
400         }
401         
402         return true;
403 }
404
405 bool BamReader::LoadIndex(void) {
406
407         // check to see if index file exists
408         FILE* indexFile;
409         if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {
410                 fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");
411                 return false;
412         }
413
414         // see if index is valid BAM index
415         char magic[4];
416         fread(magic, 1, 4, indexFile);
417         if (strncmp(magic, "BAI\1", 4)) {
418                 fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");
419                 fclose(indexFile);
420                 return false;
421         }
422
423         // get number of reference sequences
424         uint32_t numRefSeqs;
425         fread(&numRefSeqs, 4, 1, indexFile);
426         
427         // intialize BamIndex data structure
428         m_index = new BamIndex;
429         m_index->reserve(numRefSeqs);
430
431         // iterate over reference sequences
432         for (unsigned int i = 0; i < numRefSeqs; ++i) {
433                 
434                 // get number of bins for this reference sequence
435                 int32_t numBins;
436                 fread(&numBins, 4, 1, indexFile);
437                 
438                 if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
439
440                 // intialize BinVector
441                 BinVector* bins = new BinVector;
442                 bins->reserve(numBins);
443                 
444                 // iterate over bins for that reference sequence
445                 for (int j = 0; j < numBins; ++j) {
446                         
447                         // get binID 
448                         uint32_t binID;
449                         fread(&binID, 4, 1, indexFile);
450                         
451                         // get number of regionChunks in this bin
452                         uint32_t numChunks;
453                         fread(&numChunks, 4, 1, indexFile);
454                         
455                         // intialize ChunkVector
456                         ChunkVector* regionChunks = new ChunkVector;
457                         regionChunks->reserve(numChunks);
458                         
459                         // iterate over regionChunks in this bin
460                         for (unsigned int k = 0; k < numChunks; ++k) {
461                                 
462                                 // get chunk boundaries (left, right) 
463                                 uint64_t left;
464                                 uint64_t right;
465                                 fread(&left, 8, 1, indexFile);
466                                 fread(&right, 8, 1, indexFile);
467                                 
468                                 // save ChunkPair
469                                 regionChunks->push_back( ChunkPair(left, right) );
470                         }
471                         
472                         // save binID, chunkVector for this bin
473                         bins->push_back( BamBin(binID, regionChunks) );
474                 }
475                 
476                 // load linear index for this reference sequence
477                 
478                 // get number of linear offsets
479                 int32_t numLinearOffsets;
480                 fread(&numLinearOffsets, 4, 1, indexFile);
481                 
482                 // intialize LinearOffsetVector
483                 LinearOffsetVector* linearOffsets = new LinearOffsetVector;
484                 linearOffsets->reserve(numLinearOffsets);
485                 
486                 // iterate over linear offsets for this reference sequeence
487                 for (int j = 0; j < numLinearOffsets; ++j) {
488                         // get a linear offset
489                         uint64_t linearOffset;
490                         fread(&linearOffset, 8, 1, indexFile);
491                         // store linear offset
492                         linearOffsets->push_back(linearOffset);
493                 }
494                 
495                 // store index data for that reference sequence
496                 m_index->push_back( new RefIndex(bins, linearOffsets) );
497         }
498         
499         // close index file (.bai) and return
500         fclose(indexFile);
501         return true;
502 }
503
504 bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
505
506         // check valid alignment block header data
507         int32_t block_len;
508         int32_t ret;
509         uint32_t x[8];
510
511         int32_t bytesRead = 0;
512
513         // read in the 'block length' value, make sure it's not zero
514         if ( (ret = bam_read(m_file, &block_len, 4)) == 0 )        { return false; }
515         bytesRead += 4;
516
517         // read in core alignment data, make the right size of data was read 
518         if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
519         bytesRead += BAM_CORE_SIZE;
520
521         // set BamAlignment 'core' data
522         bAlignment.RefID         = x[0]; 
523         bAlignment.Position      = x[1];
524         bAlignment.Bin           = x[2]>>16; 
525         bAlignment.MapQuality    = x[2]>>8&0xff; 
526         bAlignment.AlignmentFlag = x[3]>>16; 
527         bAlignment.MateRefID     = x[5]; 
528         bAlignment.MatePosition  = x[6]; 
529         bAlignment.InsertSize    = x[7];
530
531         // fetch & store often-used lengths for character data parsing
532         unsigned int queryNameLength     = x[2]&0xff;
533         unsigned int numCigarOperations  = x[3]&0xffff;
534         unsigned int querySequenceLength = x[4];
535         
536         // get length of character data
537         int dataLength = block_len - BAM_CORE_SIZE;
538
539         // set up destination buffers for character data
540         uint8_t*  allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);
541         uint32_t* cigarData   = (uint32_t*)(allCharData+queryNameLength);
542
543         const unsigned int tagDataOffset = (numCigarOperations * 4) + queryNameLength + (querySequenceLength + 1) / 2 + querySequenceLength;
544         const unsigned int tagDataLen = dataLength - tagDataOffset;
545         char* tagData = ((char*)allCharData) + tagDataOffset;
546         
547         // get character data - make sure proper data size was read
548         if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }
549         else {
550
551                 bytesRead += dataLength;
552
553                 // clear out bases, qualities, aligned bases, CIGAR, and tag data
554                 bAlignment.QueryBases.clear();
555                 bAlignment.Qualities.clear();
556                 bAlignment.AlignedBases.clear();
557                 bAlignment.CigarData.clear();
558                 bAlignment.TagData.clear();
559
560                 // save name
561                 bAlignment.Name = (string)((const char*)(allCharData));
562                 
563                 // save bases
564                 char singleBase[2];
565                 uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);
566                 for (unsigned int i = 0; i < querySequenceLength; ++i) { 
567                         // numeric to char conversion
568                         sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );
569                         // append character data to Bases
570                         bAlignment.QueryBases.append( (const char*)singleBase );
571                 }
572                 
573                 // save sequence length
574                 bAlignment.Length = bAlignment.QueryBases.length();
575                 
576                 // save qualities
577                 char singleQuality[4];
578                 uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);
579                 for (unsigned int i = 0; i < querySequenceLength; ++i) { 
580                         // numeric to char conversion
581                         sprintf( singleQuality, "%c", ( t[i]+33 ) ); 
582                         // append character data to Qualities
583                         bAlignment.Qualities.append( (const char*)singleQuality );
584                 }
585                 
586                 // save CIGAR-related data;
587                 int k = 0;
588                 for (unsigned int i = 0; i < numCigarOperations; ++i) {
589
590                         // build CigarOp struct
591                         CigarOp op;
592                         op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
593                         op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
594
595                         // save CigarOp
596                         bAlignment.CigarData.push_back(op);
597
598                         // can skip this step if user wants to ignore
599                         if (m_isCalculateAlignedBases) {
600
601                                 // build AlignedBases string
602                                 switch (op.Type) {
603                                         
604                                         case ('M') : 
605                                         case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) );      // for 'M', 'I' - write bases
606                                         case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases
607                                                                  break;
608                                         
609                                         case ('D') : 
610                                         case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'D', 'P' - write padding;
611                                                                  break;
612                                         
613                                         case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence
614                                                                  k += op.Length;
615                                                                  break;
616
617                                         case ('H') : break;                                                                                     // for 'H' - do nothing, move to next op
618                                         
619                                         default    : assert(false);     // shouldn't get here
620                                                                  break;
621                                 }
622                         }
623                 }
624
625                 // read in the tag data
626                 bAlignment.TagData.resize(tagDataLen);
627                 memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
628         }
629         free(allCharData);
630
631 /*
632         // (optional) read tag parsing data
633         string tag;
634         char data;
635         int i = 0;
636
637         // still data to read (auxiliary tags)
638         while ( bytesRead < block_len ) {
639
640                 if ( bam_read(m_file, &data, 1) == 1 ) { 
641                         
642                         ++bytesRead;
643
644                         if (bytesRead == block_len && data != '\0') {
645                                 fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");
646                                 exit(1);
647                         }
648
649                         tag.append(1, data);
650                         if ( data == '\0' ) {
651                                 bAlignment.Tags.push_back(tag);
652                                 tag = "";
653                                 i = 0;
654                         } else {
655                                 if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }
656                                 ++i;
657                         }
658                 } else {
659                         fprintf(stderr, "ERROR: Could not read tag info.\n");
660                         exit(1);
661                 }
662         }
663 */
664         return true;
665 }