]> git.donarmstrong.com Git - bamtools.git/blob - BamReader.cpp
Added tag data support.
[bamtools.git] / BamReader.cpp
1 // BamReader.cpp
2
3 // Derek Barnett
4 // Marth Lab, Boston College
5 // Last modified: 6 April 2009
6
7 #include "BamReader.h"
8 #include <iostream>
9 using std::cerr;
10 using std::endl;
11
12 // static character constants
13 const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";
14 const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
15
16 BamReader::BamReader(const char* filename, const char* indexFilename) 
17         : m_filename( (char*)filename )
18         , m_indexFilename( (char*)indexFilename )
19         , m_file(0)
20         , m_index(NULL)
21         , m_headerText("")
22         , m_isOpen(false)
23         , m_isIndexLoaded(false)
24         , m_isRegionSpecified(false)
25         , m_isCalculateAlignedBases(true)
26         , m_currentRefID(0)
27         , m_currentLeft(0)
28         , m_alignmentsBeginOffset(0)
29 {
30         Open();
31 }
32
33 BamReader::~BamReader(void) {
34
35         // close file
36         if ( m_isOpen ) { Close(); }
37 }
38
39 // open BAM file
40 bool BamReader::Open(void) {
41
42         if (!m_isOpen && m_filename != NULL ) {
43
44                 // open file
45                 m_file = bam_open(m_filename, "r"); 
46                 
47                 // get header info && index info
48                 if ( (m_file != NULL) && LoadHeader() ) {
49
50                         // save file offset where alignments start
51                         m_alignmentsBeginOffset = bam_tell(m_file);
52                         
53                         // set open flag
54                         m_isOpen = true;
55                 }
56
57                 // try to open (and load) index data, if index file given
58                 if ( m_indexFilename != NULL ) {
59                         OpenIndex();
60                 }
61         }
62
63         return m_isOpen;
64 }
65
66 bool BamReader::OpenIndex(void) {
67
68         if ( m_indexFilename && !m_isIndexLoaded ) {
69                 m_isIndexLoaded = LoadIndex();
70         }
71         return m_isIndexLoaded;
72 }
73
74 // close BAM file
75 bool BamReader::Close(void) {
76         
77         if (m_isOpen) {
78
79                 // close file
80                 int ret = bam_close(m_file);
81                 
82                 // delete index info
83                 if ( m_index != NULL) { delete m_index; }
84
85                 // clear open flag
86                 m_isOpen = false;
87
88                 // clear index flag
89                 m_isIndexLoaded = false;
90
91                 // clear region flag
92                 m_isRegionSpecified = false;
93
94                 // return success/fail of bam_close
95                 return (ret == 0);
96         } 
97
98         return true;
99 }
100
101 // get BAM filename
102 const char* BamReader::Filename(void) const { 
103         return (const char*)m_filename; 
104 }
105
106 // set BAM filename
107 void BamReader::SetFilename(const char* filename) {
108         m_filename = (char*)filename;
109 }
110
111 // get BAM Index filename
112 const char* BamReader::IndexFilename(void) const { 
113         return (const char*)m_indexFilename; 
114 }
115
116 // set BAM Index filename
117 void BamReader::SetIndexFilename(const char* indexFilename) {
118         m_indexFilename = (char*)indexFilename;
119 }
120
121 // return full header text
122 const string BamReader::GetHeaderText(void) const { 
123         return m_headerText; 
124 }
125
126 // return number of reference sequences in BAM file
127 const int BamReader::GetReferenceCount(void) const { 
128         return m_references.size();
129 }
130
131 // get RefID from reference name
132 const int BamReader::GetRefID(string refName) const { 
133         
134         vector<string> refNames;
135         RefVector::const_iterator refIter = m_references.begin();
136     RefVector::const_iterator refEnd  = m_references.end();
137     for ( ; refIter != refEnd; ++refIter) {
138                 refNames.push_back( (*refIter).RefName );
139     }
140
141         // return 'index-of' refName (if not found, returns refNames.size())
142         return Index( refNames.begin(), refNames.end(), refName );
143 }
144
145 const RefVector BamReader::GetReferenceData(void) const {
146         return m_references;
147 }
148
149 bool BamReader::Jump(int refID, unsigned int left) {
150
151         // if index available, and region is valid
152         if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { 
153                 m_currentRefID = refID;
154                 m_currentLeft  = left;
155                 m_isRegionSpecified = true;
156                 return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 );
157         }
158         return false;
159 }
160
161 bool BamReader::Rewind(void) {
162
163         int refID = 0;
164         int refCount = m_references.size();
165         for ( ; refID < refCount; ++refID ) {
166                 if ( m_references.at(refID).RefHasAlignments ) { break; } 
167         }
168
169         m_currentRefID = refID;
170         m_currentLeft = 0;
171         m_isRegionSpecified = false;
172
173         return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );
174 }       
175
176 // get next alignment from specified region
177 bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
178
179         // try to load 'next' read
180         if ( LoadNextAlignment(bAlignment) ) {
181
182                 // if specified region, check for region overlap
183                 if ( m_isRegionSpecified ) {
184
185                         // if overlap, return true
186                         if ( IsOverlap(bAlignment) ) { return true; }
187                         // if not, try the next alignment
188                         else { return GetNextAlignment(bAlignment); }
189                 } 
190
191                 // not using region, valid read detected, return success
192                 else { return true; }
193         }
194
195         // no valid alignment to load
196         return false;
197 }
198
199 void BamReader::SetCalculateAlignedBases(bool flag) {
200         m_isCalculateAlignedBases = flag;
201 }
202
203 int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
204
205         // get region boundaries
206         uint32_t begin = left;
207         uint32_t end   = m_references.at(refID).RefLength - 1;
208
209         // initialize list, bin '0' always a valid bin
210         int i = 0;
211         list[i++] = 0;
212
213         // get rest of bins that contain this region
214         unsigned int k;
215         for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { list[i++] = k; }
216         for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { list[i++] = k; }
217         for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { list[i++] = k; }
218         for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { list[i++] = k; }
219         for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
220         
221         // return number of bins stored
222         return i;
223 }
224
225 uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
226
227         // initialize alignment end to starting position
228         uint32_t alignEnd = position;
229
230         // iterate over cigar operations
231         vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
232         vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
233         for ( ; cigarIter != cigarEnd; ++cigarIter) {
234                 if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {
235                         alignEnd += (*cigarIter).Length;
236                 }
237         }
238         return alignEnd;
239 }
240
241 uint64_t BamReader::GetOffset(int refID, unsigned int left) {
242
243         //  make space for bins
244         uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);                 
245         
246         // returns number of bins overlapping (left, right)
247         // stores indices of those bins in 'bins'
248         int numBins = BinsFromRegion(refID, left, bins);                                
249
250         // access index data for refID
251         RefIndex* refIndex = m_index->at(refID);
252
253         // get list of BAM bins for this reference sequence
254         BinVector* refBins = refIndex->first;
255
256         sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
257
258         // declare ChunkVector
259         ChunkVector regionChunks;
260
261         // declaure LinearOffsetVector
262         LinearOffsetVector* linearOffsets = refIndex->second;
263
264         // some sort of linear offset vs bin index voodoo... not completely sure what's going here
265         uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
266
267         BinVector::iterator binBegin = refBins->begin();
268         BinVector::iterator binEnd   = refBins->end();
269
270         // iterate over bins overlapping region, count chunks
271         for (int i = 0; i < numBins; ++i) {
272                 
273                 // look for bin with ID=bin[i]
274                 BinVector::iterator binIter = binBegin;
275
276                 for ( ; binIter != binEnd; ++binIter ) {
277                 
278                         // if found, increment n_off by number of chunks for each bin
279                         if ( (*binIter).first == (uint32_t)bins[i] ) { 
280                                 
281                                 // iterate over chunks in that bin
282                                 ChunkVector* binChunks = (*binIter).second;
283                                 
284                                 ChunkVector::iterator chunkIter = binChunks->begin();
285                                 ChunkVector::iterator chunkEnd  = binChunks->end();
286                                 for ( ; chunkIter != chunkEnd; ++chunkIter) {
287                                 
288                                         // if right bound of pair is greater than min_off (linear offset value), store pair
289                                         if ( (*chunkIter).second > minOffset) { 
290                                                 regionChunks.push_back( (*chunkIter) ); 
291                                         }
292                                 }
293                         }
294                 }
295         }
296
297         // clean up temp array
298         free(bins);
299
300         // there should be at least 1
301         assert(regionChunks.size() > 0);
302
303         // sort chunks by start position
304         sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );
305
306         // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
307         int numOffsets = regionChunks.size();   
308         for (int i = 1; i < numOffsets; ++i) {
309                 if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {
310                         regionChunks.at(i-1).second = regionChunks.at(i).first;
311                 }
312         }
313         
314         // merge adjacent chunks
315         int l = 0;
316         for (int i = 1; i < numOffsets; ++i) {
317                 // if adjacent, expand boundaries of (merged) chunk
318                 if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {
319                         regionChunks.at(l).second = regionChunks.at(i).second;
320                 }
321                 // else, move on to next (merged) chunk index
322                 else { regionChunks.at(++l) = regionChunks.at(i); }
323         }
324
325         // return beginning file offset of first chunk for region
326         return regionChunks.at(0).first;
327 }
328
329 bool BamReader::IsOverlap(BamAlignment& bAlignment) {
330
331         // if on different reference sequence, quit
332         if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }
333
334         // read starts after left boundary
335         if ( bAlignment.Position >= m_currentLeft) { return true; }
336
337         // get alignment end
338         uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);
339
340         // return whether alignment end overlaps left boundary
341         return ( alignEnd >= m_currentLeft );
342 }
343
344 bool BamReader::LoadHeader(void) {
345
346         // check to see if proper BAM header
347         char buf[4];
348         if (bam_read(m_file, buf, 4) != 4) { return false; }
349         if (strncmp(buf, "BAM\001", 4)) {
350                 fprintf(stderr, "wrong header type!\n");
351                 return false;
352         }
353         
354         // get BAM header text length
355         int32_t headerTextLength;
356         bam_read(m_file, &headerTextLength, 4);
357
358         // get BAM header text
359         char* headerText = (char*)calloc(headerTextLength + 1, 1);
360         bam_read(m_file, headerText, headerTextLength);
361         m_headerText = (string)((const char*)headerText);
362         
363         // clean up calloc-ed temp variable
364         free(headerText);
365
366         // get number of reference sequences
367         int32_t numberRefSeqs;
368         bam_read(m_file, &numberRefSeqs, 4);
369         if (numberRefSeqs == 0) { return false; }
370
371         m_references.reserve((int)numberRefSeqs);
372         
373         // reference variables
374         int32_t  refNameLength;
375         char*    refName;
376         uint32_t refLength;
377
378         // iterate over all references in header
379         for (int i = 0; i != numberRefSeqs; ++i) {
380
381                 // get length of reference name
382                 bam_read(m_file, &refNameLength, 4);
383                 refName = (char*)calloc(refNameLength, 1);
384
385                 // get reference name and reference sequence length
386                 bam_read(m_file, refName, refNameLength);
387                 bam_read(m_file, &refLength, 4);
388
389                 // store data for reference
390                 RefData aReference;
391                 aReference.RefName   = (string)((const char*)refName);
392                 aReference.RefLength = refLength;
393                 m_references.push_back(aReference);
394
395                 // clean up calloc-ed temp variable
396                 free(refName);
397         }
398         
399         return true;
400 }
401
402 bool BamReader::LoadIndex(void) {
403
404         // check to see if index file exists
405         FILE* indexFile;
406         if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {
407                 fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");
408                 return false;
409         }
410
411         // see if index is valid BAM index
412         char magic[4];
413         fread(magic, 1, 4, indexFile);
414         if (strncmp(magic, "BAI\1", 4)) {
415                 fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");
416                 fclose(indexFile);
417                 return false;
418         }
419
420         // get number of reference sequences
421         uint32_t numRefSeqs;
422         fread(&numRefSeqs, 4, 1, indexFile);
423         
424         // intialize BamIndex data structure
425         m_index = new BamIndex;
426         m_index->reserve(numRefSeqs);
427
428         // iterate over reference sequences
429         for (unsigned int i = 0; i < numRefSeqs; ++i) {
430                 
431                 // get number of bins for this reference sequence
432                 int32_t numBins;
433                 fread(&numBins, 4, 1, indexFile);
434                 
435                 if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
436
437                 // intialize BinVector
438                 BinVector* bins = new BinVector;
439                 bins->reserve(numBins);
440                 
441                 // iterate over bins for that reference sequence
442                 for (int j = 0; j < numBins; ++j) {
443                         
444                         // get binID 
445                         uint32_t binID;
446                         fread(&binID, 4, 1, indexFile);
447                         
448                         // get number of regionChunks in this bin
449                         uint32_t numChunks;
450                         fread(&numChunks, 4, 1, indexFile);
451                         
452                         // intialize ChunkVector
453                         ChunkVector* regionChunks = new ChunkVector;
454                         regionChunks->reserve(numChunks);
455                         
456                         // iterate over regionChunks in this bin
457                         for (unsigned int k = 0; k < numChunks; ++k) {
458                                 
459                                 // get chunk boundaries (left, right) 
460                                 uint64_t left;
461                                 uint64_t right;
462                                 fread(&left, 8, 1, indexFile);
463                                 fread(&right, 8, 1, indexFile);
464                                 
465                                 // save ChunkPair
466                                 regionChunks->push_back( ChunkPair(left, right) );
467                         }
468                         
469                         // save binID, chunkVector for this bin
470                         bins->push_back( BamBin(binID, regionChunks) );
471                 }
472                 
473                 // load linear index for this reference sequence
474                 
475                 // get number of linear offsets
476                 int32_t numLinearOffsets;
477                 fread(&numLinearOffsets, 4, 1, indexFile);
478                 
479                 // intialize LinearOffsetVector
480                 LinearOffsetVector* linearOffsets = new LinearOffsetVector;
481                 linearOffsets->reserve(numLinearOffsets);
482                 
483                 // iterate over linear offsets for this reference sequeence
484                 for (int j = 0; j < numLinearOffsets; ++j) {
485                         // get a linear offset
486                         uint64_t linearOffset;
487                         fread(&linearOffset, 8, 1, indexFile);
488                         // store linear offset
489                         linearOffsets->push_back(linearOffset);
490                 }
491                 
492                 // store index data for that reference sequence
493                 m_index->push_back( new RefIndex(bins, linearOffsets) );
494         }
495         
496         // close index file (.bai) and return
497         fclose(indexFile);
498         return true;
499 }
500
501 bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
502
503         // check valid alignment block header data
504         int32_t block_len;
505         int32_t ret;
506         uint32_t x[8];
507
508         int32_t bytesRead = 0;
509
510         // read in the 'block length' value, make sure it's not zero
511         if ( (ret = bam_read(m_file, &block_len, 4)) == 0 )        { return false; }
512         bytesRead += 4;
513
514         // read in core alignment data, make the right size of data was read 
515         if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
516         bytesRead += BAM_CORE_SIZE;
517
518         // set BamAlignment 'core' data
519         bAlignment.RefID         = x[0]; 
520         bAlignment.Position      = x[1];
521         bAlignment.Bin           = x[2]>>16; 
522         bAlignment.MapQuality    = x[2]>>8&0xff; 
523         bAlignment.AlignmentFlag = x[3]>>16; 
524         bAlignment.MateRefID     = x[5]; 
525         bAlignment.MatePosition  = x[6]; 
526         bAlignment.InsertSize    = x[7];
527
528         // fetch & store often-used lengths for character data parsing
529         unsigned int queryNameLength     = x[2]&0xff;
530         unsigned int numCigarOperations  = x[3]&0xffff;
531         unsigned int querySequenceLength = x[4];
532         
533         // get length of character data
534         int dataLength = block_len - BAM_CORE_SIZE;
535
536         // set up destination buffers for character data
537         uint8_t*  allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);
538         uint32_t* cigarData   = (uint32_t*)(allCharData+queryNameLength);
539
540         const unsigned int tagDataOffset = (numCigarOperations * 4) + queryNameLength + (querySequenceLength + 1) / 2 + querySequenceLength;
541         const unsigned int tagDataLen = dataLength - tagDataOffset;
542         char* tagData = ((char*)allCharData) + tagDataOffset;
543         
544         // get character data - make sure proper data size was read
545         if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }
546         else {
547
548                 bytesRead += dataLength;
549
550                 // clear out bases, qualities, aligned bases, CIGAR, and tag data
551                 bAlignment.QueryBases.clear();
552                 bAlignment.Qualities.clear();
553                 bAlignment.AlignedBases.clear();
554                 bAlignment.CigarData.clear();
555                 bAlignment.TagData.clear();
556
557                 // save name
558                 bAlignment.Name = (string)((const char*)(allCharData));
559                 
560                 // save bases
561                 char singleBase[2];
562                 uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);
563                 for (unsigned int i = 0; i < querySequenceLength; ++i) { 
564                         // numeric to char conversion
565                         sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );
566                         // append character data to Bases
567                         bAlignment.QueryBases.append( (const char*)singleBase );
568                 }
569                 
570                 // save sequence length
571                 bAlignment.Length = bAlignment.QueryBases.length();
572                 
573                 // save qualities
574                 char singleQuality[4];
575                 uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);
576                 for (unsigned int i = 0; i < querySequenceLength; ++i) { 
577                         // numeric to char conversion
578                         sprintf( singleQuality, "%c", ( t[i]+33 ) ); 
579                         // append character data to Qualities
580                         bAlignment.Qualities.append( (const char*)singleQuality );
581                 }
582                 
583                 // save CIGAR-related data;
584                 int k = 0;
585                 for (unsigned int i = 0; i < numCigarOperations; ++i) {
586
587                         // build CigarOp struct
588                         CigarOp op;
589                         op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
590                         op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
591
592                         // save CigarOp
593                         bAlignment.CigarData.push_back(op);
594
595                         // can skip this step if user wants to ignore
596                         if (m_isCalculateAlignedBases) {
597
598                                 // build AlignedBases string
599                                 switch (op.Type) {
600                                         
601                                         case ('M') : 
602                                         case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) );      // for 'M', 'I' - write bases
603                                         case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases
604                                                                  break;
605                                         
606                                         case ('D') : 
607                                         case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'D', 'P' - write padding;
608                                                                  break;
609                                         
610                                         case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence
611                                                                  k += op.Length;
612                                                                  break;
613
614                                         case ('H') : break;                                                                                     // for 'H' - do nothing, move to next op
615                                         
616                                         default    : assert(false);     // shouldn't get here
617                                                                  break;
618                                 }
619                         }
620                 }
621
622                 // read in the tag data
623                 bAlignment.TagData.resize(tagDataLen);
624                 memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
625         }
626         free(allCharData);
627
628 /*
629         // (optional) read tag parsing data
630         string tag;
631         char data;
632         int i = 0;
633
634         // still data to read (auxiliary tags)
635         while ( bytesRead < block_len ) {
636
637                 if ( bam_read(m_file, &data, 1) == 1 ) { 
638                         
639                         ++bytesRead;
640
641                         if (bytesRead == block_len && data != '\0') {
642                                 fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");
643                                 exit(1);
644                         }
645
646                         tag.append(1, data);
647                         if ( data == '\0' ) {
648                                 bAlignment.Tags.push_back(tag);
649                                 tag = "";
650                                 i = 0;
651                         } else {
652                                 if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }
653                                 ++i;
654                         }
655                 } else {
656                         fprintf(stderr, "ERROR: Could not read tag info.\n");
657                         exit(1);
658                 }
659         }
660 */
661         return true;
662 }