]> git.donarmstrong.com Git - bamtools.git/blob - BamReader.cpp
Fix of signed/unsigned in BamReader methods
[bamtools.git] / BamReader.cpp
1 // ***************************************************************************
2 // BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 15 July 2009 (DB)
7 // ---------------------------------------------------------------------------
8 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
9 // Institute.
10 // ---------------------------------------------------------------------------
11 // Provides the basic functionality for reading BAM files
12 // ***************************************************************************
13
14 // BamTools includes
15 #include "BamReader.h"
16 using namespace BamTools;
17 using namespace std;
18
19 // static character constants
20 const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";
21 const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
22
23 // constructor
24 BamReader::BamReader(void)
25         : m_BGZF(NULL)
26         , m_index(NULL)
27         , m_isIndexLoaded(false)
28         , m_alignmentsBeginOffset(0)
29         , m_isRegionSpecified(false)
30         , m_currentRefID(0)
31         , m_currentLeft(0)
32 { }
33
34 // destructor
35 BamReader::~BamReader(void) {
36         Close();
37 }
38
39 // checks BGZF block header
40 bool BamReader::BgzfCheckBlockHeader(char* header) {
41
42         return (header[0] == GZIP_ID1 &&
43             header[1] == (char)GZIP_ID2 &&
44             header[2] == Z_DEFLATED &&
45             (header[3] & FLG_FEXTRA) != 0 &&
46             BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
47             header[12] == BGZF_ID1 &&
48             header[13] == BGZF_ID2 &&
49             BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN
50                    );
51 }
52
53 // closes the BAM file
54 void BamReader::BgzfClose(void) {
55         fflush(m_BGZF->Stream);
56         fclose(m_BGZF->Stream);
57         m_BGZF->IsOpen = false;
58 }
59
60 // de-compresses the current block
61 int BamReader::BgzfInflateBlock(int blockLength) {
62         
63         // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
64     z_stream zs;
65     zs.zalloc    = NULL;
66     zs.zfree     = NULL;
67     zs.next_in   = (Bytef*)m_BGZF->CompressedBlock + 18;
68     zs.avail_in  = blockLength - 16;
69     zs.next_out  = (Bytef*)m_BGZF->UncompressedBlock;
70     zs.avail_out = m_BGZF->UncompressedBlockSize;
71
72     int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
73     if (status != Z_OK) {
74         printf("inflateInit failed\n");
75         exit(1);
76     }
77
78     status = inflate(&zs, Z_FINISH);
79     if (status != Z_STREAM_END) {
80         inflateEnd(&zs);
81         printf("inflate failed\n");
82         exit(1);
83     }
84
85     status = inflateEnd(&zs);
86     if (status != Z_OK) {
87         printf("inflateEnd failed\n");
88         exit(1);
89     }
90
91     return zs.total_out;
92 }
93
94 // opens the BAM file for reading
95 void BamReader::BgzfOpen(const string& filename) {
96
97         m_BGZF->Stream = fopen(filename.c_str(), "rb");
98         if(!m_BGZF->Stream) {
99                 printf("ERROR: Unable to open the BAM file %s for reading.\n", filename.c_str() );
100                 exit(1);
101         }
102
103         m_BGZF->IsOpen = true;
104 }
105
106 // reads BGZF data into buffer
107 unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) {
108
109     if (dataLength == 0) { return 0; }
110
111         char* output = data;
112         unsigned int numBytesRead = 0;
113         while (numBytesRead < dataLength) {
114
115         int bytesAvailable = m_BGZF->BlockLength - m_BGZF->BlockOffset;
116         if (bytesAvailable <= 0) {
117             if ( BgzfReadBlock() != 0 ) { return -1; }
118             bytesAvailable = m_BGZF->BlockLength - m_BGZF->BlockOffset;
119             if ( bytesAvailable <= 0 ) { break; }
120         }
121
122         char* buffer   = m_BGZF->UncompressedBlock;
123         int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
124         memcpy(output, buffer + m_BGZF->BlockOffset, copyLength);
125
126         m_BGZF->BlockOffset += copyLength;
127         output             += copyLength;
128         numBytesRead       += copyLength;
129     }
130
131     if ( m_BGZF->BlockOffset == m_BGZF->BlockLength ) {
132         m_BGZF->BlockAddress = ftello(m_BGZF->Stream);                                          
133         m_BGZF->BlockOffset  = 0;
134         m_BGZF->BlockLength  = 0;
135     }
136
137         return numBytesRead;
138 }
139
140 int BamReader::BgzfReadBlock(void) {
141
142     char    header[BLOCK_HEADER_LENGTH];
143     int64_t blockAddress = ftello(m_BGZF->Stream);
144
145     int count = fread(header, 1, sizeof(header), m_BGZF->Stream);
146         if (count == 0) {
147         m_BGZF->BlockLength = 0;
148         return 0;
149     }
150
151     if (count != sizeof(header)) {
152         printf("read block failed - count != sizeof(header)\n");
153         return -1;
154     }
155
156     if (!BgzfCheckBlockHeader(header)) {
157         printf("read block failed - CheckBgzfBlockHeader() returned false\n");
158         return -1;
159     }
160
161     int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1;
162     char* compressedBlock = m_BGZF->CompressedBlock;
163     memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
164     int remaining = blockLength - BLOCK_HEADER_LENGTH;
165
166     count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF->Stream);
167     if (count != remaining) {
168         printf("read block failed - count != remaining\n");
169         return -1;
170     }
171
172     count = BgzfInflateBlock(blockLength);
173     if (count < 0) { return -1; }
174
175     if (m_BGZF->BlockLength != 0) {
176         m_BGZF->BlockOffset = 0;
177     }
178
179     m_BGZF->BlockAddress = blockAddress;
180     m_BGZF->BlockLength  = count;
181     return 0;
182 }
183
184 // move file pointer to specified offset
185 bool BamReader::BgzfSeek(int64_t position) {
186
187         int     blockOffset  = (position & 0xFFFF);
188         int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
189         if (fseeko(m_BGZF->Stream, blockAddress, SEEK_SET) != 0) {
190         printf("ERROR: Unable to seek in BAM file\n");
191                 exit(1);
192         }
193
194         m_BGZF->BlockLength  = 0;
195         m_BGZF->BlockAddress = blockAddress;
196         m_BGZF->BlockOffset  = blockOffset;
197         return true;
198 }
199
200 // get file position in BAM file
201 int64_t BamReader::BgzfTell(void) {
202         return ( (m_BGZF->BlockAddress << 16) | (m_BGZF->BlockOffset & 0xFFFF) );
203 }
204
205 int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
206
207         // get region boundaries
208         uint32_t begin = left;
209         uint32_t end   = m_references.at(refID).RefLength - 1;
210
211         // initialize list, bin '0' always a valid bin
212         int i = 0;
213         list[i++] = 0;
214
215         // get rest of bins that contain this region
216         unsigned int k;
217         for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { list[i++] = k; }
218         for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { list[i++] = k; }
219         for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { list[i++] = k; }
220         for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { list[i++] = k; }
221         for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
222         
223         // return number of bins stored
224         return i;
225 }
226
227 unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
228
229         // initialize alignment end to starting position
230         unsigned int alignEnd = position;
231
232         // iterate over cigar operations
233         vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
234         vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
235         for ( ; cigarIter != cigarEnd; ++cigarIter) {
236                 char cigarType = (*cigarIter).Type;
237                 if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
238                         alignEnd += (*cigarIter).Length;
239                 }
240         }
241         return alignEnd;
242 }
243
244 void BamReader::ClearIndex(void) {
245
246         if ( m_index ) {
247                 // iterate over references
248                 vector<RefIndex*>::iterator refIter = m_index->begin();
249                 vector<RefIndex*>::iterator refEnd  = m_index->end();
250                 for ( ; refIter != refEnd; ++refIter) {
251                         RefIndex* aRef = (*refIter);
252                         if ( aRef ) {
253                                 // clear out BAM bins
254                                 if ( aRef->first ) {
255                                         BinVector::iterator binIter = (aRef->first)->begin();
256                                         BinVector::iterator binEnd  = (aRef->first)->end();
257                                         for ( ; binIter != binEnd; ++binIter ) {
258                                                 ChunkVector* chunks = (*binIter).second;
259                                                 if ( chunks ) { delete chunks; chunks = NULL;}
260                                         }
261                                         delete aRef->first;
262                                         aRef->first = NULL;
263                                 }
264                                 // clear BAM linear offsets
265                                 if ( aRef->second ) { delete aRef->second; aRef->second = NULL; }
266                                 delete aRef;
267                                 aRef = NULL;
268                         }
269                 }
270                 delete m_index;
271                 m_index = NULL;
272         }
273 }
274
275 // closes the BAM file
276 void BamReader::Close(void) {
277         
278         if (m_BGZF!=NULL && m_BGZF->IsOpen) { 
279                 BgzfClose();
280                 delete m_BGZF;
281                 m_BGZF = NULL; 
282         }       
283         ClearIndex();
284         m_headerText.clear();
285         m_isRegionSpecified = false;
286 }
287
288 const string BamReader::GetHeaderText(void) const {
289         return m_headerText;
290 }
291
292 const int BamReader::GetReferenceCount(void) const {
293         return m_references.size();
294 }
295
296 const RefVector BamReader::GetReferenceData(void) const {
297         return m_references;
298 }
299
300 const int BamReader::GetReferenceID(const string& refName) const {
301
302         // retrieve names from reference data
303         vector<string> refNames;
304         RefVector::const_iterator refIter = m_references.begin();
305     RefVector::const_iterator refEnd  = m_references.end();
306     for ( ; refIter != refEnd; ++refIter) {
307                 refNames.push_back( (*refIter).RefName );
308     }
309
310         // return 'index-of' refName ( if not found, returns refNames.size() )
311         return Index( refNames.begin(), refNames.end(), refName );
312 }
313
314 // get next alignment (from specified region, if given)
315 bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
316
317         // if valid alignment available
318         if ( LoadNextAlignment(bAlignment) ) {
319                 
320                 // if region not specified, return success
321                 if ( !m_isRegionSpecified ) { return true; }
322                 
323                 // load next alignment until region overlap is found
324                 while ( !IsOverlap(bAlignment) ) {
325                         // if no valid alignment available (likely EOF) return failure
326                         if ( !LoadNextAlignment(bAlignment) ) { return false; }
327                 }
328                 
329                 // return success (alignment found that overlaps region)
330                 return true;
331         } 
332         
333         // no valid alignment
334         else { return false; }
335 }
336
337 int64_t BamReader::GetOffset(int refID, unsigned int left) {
338
339         // calculate which bins overlap this region
340         uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
341         int numBins = BinsFromRegion(refID, left, bins);
342
343         // get bins for this reference
344         RefIndex* refIndex = m_index->at(refID);
345         BinVector* refBins = refIndex->first;
346
347         // get minimum offset to consider
348         LinearOffsetVector* linearOffsets = refIndex->second;
349         uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
350
351         // store offsets to beginning of alignment 'chunks' 
352         std::vector<int64_t> chunkStarts;
353         
354         // reference bin iterators
355         BinVector::const_iterator binIter;
356         BinVector::const_iterator binBegin = refBins->begin();
357         BinVector::const_iterator binEnd   = refBins->end();
358         
359         // store all alignment 'chunk' starts for bins in this region
360         for (int i = 0; i < numBins; ++i ) {
361                 binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare<uint32_t, ChunkVector*>() );
362                 if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) {
363                         ChunkVector* binChunks = (*binIter).second;
364                         ChunkVector::const_iterator chunkIter = binChunks->begin();
365                         ChunkVector::const_iterator chunkEnd  = binChunks->end();
366                         for ( ; chunkIter != chunkEnd; ++chunkIter) {
367                                 if ( (*chunkIter).second > minOffset ) {
368                                         chunkStarts.push_back( (*chunkIter).first );
369                                 }       
370                         }
371                 }
372         }
373         
374         // clean up memory
375         free(bins);
376         
377         // if no alignments found
378         if ( chunkStarts.empty() ) { return -1; }
379         
380         // else return smallest offset for alignment starts 
381         else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
382 }
383
384 bool BamReader::IsOverlap(BamAlignment& bAlignment) {
385
386         // if on different reference sequence, quit
387         if ( bAlignment.RefID != m_currentRefID ) { return false; }
388
389         // read starts after left boundary
390         if ( bAlignment.Position >= (signed int) m_currentLeft) { return true; }
391
392         // return whether alignment end overlaps left boundary
393         return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
394 }
395
396 bool BamReader::Jump(int refID, unsigned int position) {
397
398         // if index available, and region is valid
399         if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (position <= m_references.at(refID).RefLength) ) { 
400                 m_currentRefID = refID;
401                 m_currentLeft  = position;
402                 m_isRegionSpecified = true;
403                 
404                 int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
405                 if ( offset == -1 ) { return false; }
406                 else { return BgzfSeek(offset); }
407         }
408         return false;
409 }
410
411 void BamReader::LoadHeaderData(void) {
412         
413         // check to see if proper BAM header
414         char buffer[4];
415         if (BgzfRead(buffer, 4) != 4) { 
416                 printf("Could not read header type\n");
417                 exit(1); 
418         }
419
420         if (strncmp(buffer, "BAM\001", 4)) {
421                 printf("wrong header type!\n");
422                 exit(1);
423         }
424         
425         // get BAM header text length
426         BgzfRead(buffer, 4);
427         const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer);
428
429         // get BAM header text
430         char* headerText = (char*)calloc(headerTextLength + 1, 1);
431         BgzfRead(headerText, headerTextLength);
432         m_headerText = (string)((const char*)headerText);
433         
434         // clean up calloc-ed temp variable
435         free(headerText);
436 }
437
438 void BamReader::LoadIndexData(FILE* indexStream) {
439
440         // see if index is valid BAM index
441         char magic[4];
442         fread(magic, 1, 4, indexStream);
443         if (strncmp(magic, "BAI\1", 4)) {
444                 printf("Problem with index file - invalid format.\n");
445                 fclose(indexStream);
446                 exit(1);
447         }
448
449         // get number of reference sequences
450         uint32_t numRefSeqs;
451         fread(&numRefSeqs, 4, 1, indexStream);
452         
453         // intialize BamIndex data structure
454         m_index = new BamIndex;
455         m_index->reserve(numRefSeqs);
456
457         // iterate over reference sequences
458         for (unsigned int i = 0; i < numRefSeqs; ++i) {
459                 
460                 // get number of bins for this reference sequence
461                 int32_t numBins;
462                 fread(&numBins, 4, 1, indexStream);
463                 
464                 if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
465
466                 // intialize BinVector
467                 BinVector* bins = new BinVector;
468                 bins->reserve(numBins);
469                 
470                 // iterate over bins for that reference sequence
471                 for (int j = 0; j < numBins; ++j) {
472                         
473                         // get binID 
474                         uint32_t binID;
475                         fread(&binID, 4, 1, indexStream);
476                         
477                         // get number of regionChunks in this bin
478                         uint32_t numChunks;
479                         fread(&numChunks, 4, 1, indexStream);
480                         
481                         // intialize ChunkVector
482                         ChunkVector* regionChunks = new ChunkVector;
483                         regionChunks->reserve(numChunks);
484                         
485                         // iterate over regionChunks in this bin
486                         for (unsigned int k = 0; k < numChunks; ++k) {
487                                 
488                                 // get chunk boundaries (left, right) 
489                                 uint64_t left;
490                                 uint64_t right;
491                                 fread(&left, 8, 1, indexStream);
492                                 fread(&right, 8, 1, indexStream);
493                                 
494                                 // save ChunkPair
495                                 regionChunks->push_back( ChunkPair(left, right) );
496                         }
497                         
498                         // sort chunks for this bin
499                         sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare<uint64_t, uint64_t>() );
500
501                         // save binID, chunkVector for this bin
502                         bins->push_back( BamBin(binID, regionChunks) );
503                 }
504                 
505                 // sort bins by binID
506                 sort(bins->begin(), bins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
507
508                 // load linear index for this reference sequence
509                 
510                 // get number of linear offsets
511                 int32_t numLinearOffsets;
512                 fread(&numLinearOffsets, 4, 1, indexStream);
513                 
514                 // intialize LinearOffsetVector
515                 LinearOffsetVector* linearOffsets = new LinearOffsetVector;
516                 linearOffsets->reserve(numLinearOffsets);
517                 
518                 // iterate over linear offsets for this reference sequeence
519                 for (int j = 0; j < numLinearOffsets; ++j) {
520                         // get a linear offset
521                         uint64_t linearOffset;
522                         fread(&linearOffset, 8, 1, indexStream);
523                         // store linear offset
524                         linearOffsets->push_back(linearOffset);
525                 }
526                 
527                 // sort linear offsets
528                 sort( linearOffsets->begin(), linearOffsets->end() );
529
530                 // store index data for that reference sequence
531                 m_index->push_back( new RefIndex(bins, linearOffsets) );
532         }
533         
534         // close index file (.bai) and return
535         fclose(indexStream);
536 }
537
538 bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
539
540         // read in the 'block length' value, make sure it's not zero
541         char buffer[4];
542         BgzfRead(buffer, 4);
543         const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer);
544         if ( blockLength == 0 ) { return false; }
545
546         // keep track of bytes read as method progresses
547         int bytesRead = 4;
548
549         // read in core alignment data, make sure the right size of data was read 
550         char x[BAM_CORE_SIZE];
551         if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
552         bytesRead += BAM_CORE_SIZE;
553
554         // set BamAlignment 'core' data and character data lengths
555         unsigned int tempValue;
556         unsigned int queryNameLength;
557         unsigned int numCigarOperations;
558         unsigned int querySequenceLength;
559
560         //bAlignment.RefID    = BgzfUnpackUnsignedInt(&x[0]);
561         //bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]);
562         bAlignment.RefID    = BgzfUnpackSignedInt(&x[0]);
563         bAlignment.Position = BgzfUnpackSignedInt(&x[4]);
564
565         tempValue             = BgzfUnpackUnsignedInt(&x[8]);           
566         bAlignment.Bin        = tempValue >> 16;
567         bAlignment.MapQuality = tempValue >> 8 & 0xff;
568         queryNameLength       = tempValue & 0xff;
569
570         tempValue                = BgzfUnpackUnsignedInt(&x[12]);       
571         bAlignment.AlignmentFlag = tempValue >> 16;
572         numCigarOperations       = tempValue & 0xffff;
573
574         querySequenceLength     = BgzfUnpackUnsignedInt(&x[16]);
575         //bAlignment.MateRefID    = BgzfUnpackUnsignedInt(&x[20]);
576         //bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]);
577         //bAlignment.InsertSize   = BgzfUnpackUnsignedInt(&x[28]);
578         bAlignment.MateRefID    = BgzfUnpackSignedInt(&x[20]);
579         bAlignment.MatePosition = BgzfUnpackSignedInt(&x[24]);
580         bAlignment.InsertSize   = BgzfUnpackSignedInt(&x[28]);
581
582         // calculate lengths/offsets
583         const unsigned int dataLength      = blockLength - BAM_CORE_SIZE;
584         const unsigned int cigarDataOffset = queryNameLength;
585         const unsigned int seqDataOffset   = cigarDataOffset + (numCigarOperations * 4);
586         const unsigned int qualDataOffset  = seqDataOffset + (querySequenceLength+1)/2;
587         const unsigned int tagDataOffset   = qualDataOffset + querySequenceLength;
588         const unsigned int tagDataLen      = dataLength - tagDataOffset;
589         
590         // set up destination buffers for character data
591         char* allCharData   = (char*)calloc(sizeof(char), dataLength);
592         uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
593         char* seqData       = ((char*)allCharData) + seqDataOffset;
594         char* qualData      = ((char*)allCharData) + qualDataOffset;
595         char* tagData       = ((char*)allCharData) + tagDataOffset;
596         
597         // get character data - make sure proper data size was read
598         if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; }
599         else {
600                 
601                 bytesRead += dataLength;
602                 
603                 // clear out any previous string data
604                 bAlignment.Name.clear();
605                 bAlignment.QueryBases.clear();
606                 bAlignment.Qualities.clear();
607                 bAlignment.AlignedBases.clear();
608                 bAlignment.CigarData.clear();
609                 bAlignment.TagData.clear();
610                 
611                 // save name
612                 bAlignment.Name = (string)((const char*)(allCharData));
613                 
614                 // save query sequence
615                 for (unsigned int i = 0; i < querySequenceLength; ++i) {        
616                         char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
617                         bAlignment.QueryBases.append( 1, singleBase );
618                 }
619                 
620                 // save sequence length
621                 bAlignment.Length = bAlignment.QueryBases.length();
622                 
623                 // save qualities
624                 for (unsigned int i = 0; i < querySequenceLength; ++i) { 
625                         char singleQuality = (char)(qualData[i]+33);                    // conversion from QV to FASTQ character
626                         bAlignment.Qualities.append( 1, singleQuality );
627                 }
628                 
629                 // save CIGAR-related data;
630                 int k = 0;
631                 for (unsigned int i = 0; i < numCigarOperations; ++i) {
632                         
633                         // build CigarOp struct
634                         CigarOp op;
635                         op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
636                         op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
637                         
638                         // save CigarOp
639                         bAlignment.CigarData.push_back(op);
640                         
641                         // build AlignedBases string
642                         switch (op.Type) {
643                                 
644                                 case ('M') : 
645                                 case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) );      // for 'M', 'I' - write bases
646                                 case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases
647                                                          break;
648                                                          
649                                 case ('D') : bAlignment.AlignedBases.append( op.Length, '-' );  // for 'D' - write gap character
650                                                          break;
651                                                         
652                                 case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'P' - write padding character;
653                                                          break;
654                                                          
655                                 case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence
656                                                          k += op.Length;
657                                                          break;
658                                                          
659                                 case ('H') : break;                                                                                     // for 'H' - do nothing, move to next op
660                                                          
661                                 default    : printf("ERROR: Invalid Cigar op type\n");                  // shouldn't get here
662                                                          exit(1);
663                         }
664                 }
665                 
666                 // read in the tag data
667                 bAlignment.TagData.resize(tagDataLen);
668                 memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
669         }
670
671         free(allCharData);
672         return true;
673 }
674
675 void BamReader::LoadReferenceData(void) {
676
677         // get number of reference sequences
678         char buffer[4];
679         BgzfRead(buffer, 4);
680         const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer);
681         if (numberRefSeqs == 0) { return; }
682         m_references.reserve((int)numberRefSeqs);
683         
684         // iterate over all references in header
685         for (unsigned int i = 0; i != numberRefSeqs; ++i) {
686
687                 // get length of reference name
688                 BgzfRead(buffer, 4);
689                 const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer);
690                 char* refName = (char*)calloc(refNameLength, 1);
691                 
692                 // get reference name and reference sequence length
693                 BgzfRead(refName, refNameLength);
694                 BgzfRead(buffer, 4);
695                 const unsigned int refLength = BgzfUnpackUnsignedInt(buffer);
696                 
697                 // store data for reference
698                 RefData aReference;
699                 aReference.RefName   = (string)((const char*)refName);
700                 aReference.RefLength = refLength;
701                 m_references.push_back(aReference);
702                 
703                 // clean up calloc-ed temp variable
704                 free(refName);
705         }
706 }
707
708 // opens BAM file (and index)
709 void BamReader::Open(const string& filename, const string& indexFilename) {
710
711         // open the BGZF file for reading, retrieve header text & reference data
712         m_BGZF = new BgzfData;
713         BgzfOpen(filename);
714         LoadHeaderData();       
715         LoadReferenceData();
716
717         // store file offset of first alignment
718         m_alignmentsBeginOffset = BgzfTell();
719
720         // open index file & load index data (if exists)
721         OpenIndex(indexFilename);
722 }
723
724 void BamReader::OpenIndex(const string& indexFilename) {
725
726         // if index file exists
727         if (!indexFilename.empty()) {
728
729                 // open index
730                 FILE* indexStream = fopen(indexFilename.c_str(), "rb");
731                 
732                 // abort on error
733                 if(!indexStream) {
734                         printf("ERROR: Unable to open the BAM index file %s for reading.\n", indexFilename.c_str() );
735                         exit(1);
736                 }
737         
738                 // build up index data structure
739                 LoadIndexData(indexStream);
740         }
741 }
742
743 bool BamReader::Rewind(void) {
744
745         // find first reference that has alignments in the BAM file
746         int refID = 0;
747         int refCount = m_references.size();
748         for ( ; refID < refCount; ++refID ) {
749                 if ( m_references.at(refID).RefHasAlignments ) { break; } 
750         }
751
752         // store default bounds for first alignment
753         m_currentRefID = refID;
754         m_currentLeft = 0;
755         m_isRegionSpecified = false;
756
757         // return success/failure of seek
758         return BgzfSeek(m_alignmentsBeginOffset);
759 }