]> git.donarmstrong.com Git - bamtools.git/blob - BamReader.cpp
Major overhaul of BamReader. No longer relying on bgzf.* API. Sped up random-access...
[bamtools.git] / BamReader.cpp
1 // ***************************************************************************
2 // BamReader (c) 2009 Derek Barnett
3 // Marth Lab, Deptartment of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Provides the basic functionality for reading BAM files
7 // ***************************************************************************
8
9 #include "BamReader.h"
10 //#include "STLUtilities.h"
11
12 #include <cstring>
13
14 #include <iostream>
15 using std::cerr;
16 using std::endl;
17
18 // static character constants
19 const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";
20 const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
21
22 // constructor
23 BamReader::BamReader(void)
24         : m_index(NULL)
25         , m_isIndexLoaded(false)
26         , m_alignmentsBeginOffset(0)
27         , m_isRegionSpecified(false)
28         , m_currentRefID(0)
29         , m_currentLeft(0)
30 { }
31
32 // destructor
33 BamReader::~BamReader(void) {
34         m_headerText.clear();
35         ClearIndex();
36         Close();
37 }
38
39 // checks BGZF block header
40 bool BamReader::BgzfCheckBlockHeader(char* header) {
41
42         return (header[0] == GZIP_ID1 &&
43             header[1] == (char)GZIP_ID2 &&
44             header[2] == Z_DEFLATED &&
45             (header[3] & FLG_FEXTRA) != 0 &&
46             BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
47             header[12] == BGZF_ID1 &&
48             header[13] == BGZF_ID2 &&
49             BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN);
50 }
51
52 // closes the BAM file
53 void BamReader::BgzfClose(void) {
54         m_BGZF.IsOpen = false;
55         fclose(m_BGZF.Stream);
56 }
57
58 // de-compresses the current block
59 int BamReader::BgzfInflateBlock(int blockLength) {
60         
61         // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
62     z_stream zs;
63     zs.zalloc    = NULL;
64     zs.zfree     = NULL;
65     zs.next_in   = (Bytef*)m_BGZF.CompressedBlock + 18;
66     zs.avail_in  = blockLength - 16;
67     zs.next_out  = (Bytef*)m_BGZF.UncompressedBlock;
68     zs.avail_out = m_BGZF.UncompressedBlockSize;
69
70     int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
71     if (status != Z_OK) {
72         printf("inflateInit failed\n");
73         exit(1);
74     }
75
76     status = inflate(&zs, Z_FINISH);
77     if (status != Z_STREAM_END) {
78         inflateEnd(&zs);
79         printf("inflate failed\n");
80         exit(1);
81     }
82
83     status = inflateEnd(&zs);
84     if (status != Z_OK) {
85         printf("inflateEnd failed\n");
86         exit(1);
87     }
88
89     return zs.total_out;
90 }
91
92 // opens the BAM file for reading
93 void BamReader::BgzfOpen(const string& filename) {
94
95         m_BGZF.Stream = fopen(filename.c_str(), "rb");
96         if(!m_BGZF.Stream) {
97                 cerr << "ERROR: Unable to open the BAM file " << filename << "for reading." << endl;
98                 exit(1);
99         }
100
101         m_BGZF.IsOpen = true;
102 }
103
104 // reads BGZF data into buffer
105 unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) {
106
107     if (dataLength == 0) { return 0; }
108
109         char* output = data;
110     unsigned int numBytesRead = 0;
111     while (numBytesRead < dataLength) {
112
113         int bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
114         if (bytesAvailable <= 0) {
115             if ( BgzfReadBlock() != 0 ) { return -1; }
116             bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
117             if ( bytesAvailable <= 0 ) { break; }
118         }
119
120                 char* buffer   = m_BGZF.UncompressedBlock;
121         int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
122         memcpy(output, buffer + m_BGZF.BlockOffset, copyLength);
123
124         m_BGZF.BlockOffset += copyLength;
125         output             += copyLength;
126         numBytesRead       += copyLength;
127     }
128
129     if ( m_BGZF.BlockOffset == m_BGZF.BlockLength ) {
130                 m_BGZF.BlockAddress = ftello64(m_BGZF.Stream);                                          
131         m_BGZF.BlockOffset  = 0;
132         m_BGZF.BlockLength  = 0;
133     }
134
135         return numBytesRead;
136 }
137
138 int BamReader::BgzfReadBlock(void) {
139
140     char    header[BLOCK_HEADER_LENGTH];
141     int64_t blockAddress = ftello(m_BGZF.Stream);
142
143     int count = fread(header, 1, sizeof(header), m_BGZF.Stream);
144         if (count == 0) {
145         m_BGZF.BlockLength = 0;
146         return 0;
147     }
148
149     if (count != sizeof(header)) {
150         printf("read block failed - count != sizeof(header)\n");
151         return -1;
152     }
153
154     if (!BgzfCheckBlockHeader(header)) {
155         printf("read block failed - CheckBgzfBlockHeader() returned false\n");
156         return -1;
157     }
158
159     int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1;
160     char* compressedBlock = m_BGZF.CompressedBlock;
161     memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
162     int remaining = blockLength - BLOCK_HEADER_LENGTH;
163
164     count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF.Stream);
165     if (count != remaining) {
166         printf("read block failed - count != remaining\n");
167         return -1;
168     }
169
170     count = BgzfInflateBlock(blockLength);
171     if (count < 0) { return -1; }
172
173     if (m_BGZF.BlockLength != 0) {
174         m_BGZF.BlockOffset = 0;
175     }
176
177     m_BGZF.BlockAddress = blockAddress;
178     m_BGZF.BlockLength  = count;
179     return 0;
180 }
181
182 // move file pointer to specified offset
183 bool BamReader::BgzfSeek(int64_t position) {
184
185         int     blockOffset  = (position & 0xFFFF);
186     int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
187     if (fseeko(m_BGZF.Stream, blockAddress, SEEK_SET) != 0) {
188         printf("ERROR: Unable to seek in BAM file\n");
189                 exit(1);
190     }
191
192     m_BGZF.BlockLength  = 0;
193     m_BGZF.BlockAddress = blockAddress;
194     m_BGZF.BlockOffset  = blockOffset;
195         return true;
196 }
197
198 // get file position in BAM file
199 int64_t BamReader::BgzfTell(void) {
200         return ( (m_BGZF.BlockAddress << 16) | (m_BGZF.BlockOffset & 0xFFFF) );
201 }
202
203 int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
204
205         // get region boundaries
206         uint32_t begin = left;
207         uint32_t end   = m_references.at(refID).RefLength - 1;
208
209         // initialize list, bin '0' always a valid bin
210         int i = 0;
211         list[i++] = 0;
212
213         // get rest of bins that contain this region
214         unsigned int k;
215         for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { list[i++] = k; }
216         for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { list[i++] = k; }
217         for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { list[i++] = k; }
218         for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { list[i++] = k; }
219         for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
220         
221         // return number of bins stored
222         return i;
223 }
224
225 unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
226
227         // initialize alignment end to starting position
228         unsigned int alignEnd = position;
229
230         // iterate over cigar operations
231         vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
232         vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
233         for ( ; cigarIter != cigarEnd; ++cigarIter) {
234                 char cigarType = (*cigarIter).Type;
235                 if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
236                         alignEnd += (*cigarIter).Length;
237                 }
238         }
239         return alignEnd;
240 }
241
242 void BamReader::ClearIndex(void) {
243
244         if ( m_index ) {
245                 // iterate over references
246                 vector<RefIndex*>::iterator refIter = m_index->begin();
247                 vector<RefIndex*>::iterator refEnd  = m_index->end();
248                 for ( ; refIter != refEnd; ++refIter) {
249                         RefIndex* aRef = (*refIter);
250                         if ( aRef ) {
251                                 // clear out BAM bins
252                                 if ( aRef->first ) {
253                                         BinVector::iterator binIter = (aRef->first)->begin();
254                                         BinVector::iterator binEnd  = (aRef->first)->end();
255                                         for ( ; binIter != binEnd; ++binIter ) {
256                                                 ChunkVector* chunks = (*binIter).second;
257                                                 if ( chunks ) { delete chunks; }
258                                         }
259                                         delete aRef->first;
260                                 }
261                                 // clear BAM linear offsets
262                                 if ( aRef->second ) { delete aRef->second; }
263                                 delete aRef;
264                         }
265                 }
266                 delete m_index;
267         }
268 }
269
270 // closes the BAM file
271 void BamReader::Close(void) {
272         if(m_BGZF.IsOpen) { BgzfClose(); }
273         if(m_index)       { delete m_index; m_index = NULL; }
274         m_isRegionSpecified = false;
275 }
276
277 const string BamReader::GetHeaderText(void) const {
278         return m_headerText;
279 }
280
281 const int BamReader::GetReferenceCount(void) const {
282         return m_references.size();
283 }
284
285 const RefVector BamReader::GetReferenceData(void) const {
286         return m_references;
287 }
288
289 const int BamReader::GetReferenceID(const string& refName) const {
290
291         // retrieve names from reference data
292         vector<string> refNames;
293         RefVector::const_iterator refIter = m_references.begin();
294     RefVector::const_iterator refEnd  = m_references.end();
295     for ( ; refIter != refEnd; ++refIter) {
296                 refNames.push_back( (*refIter).RefName );
297     }
298
299         // return 'index-of' refName ( if not found, returns refNames.size() )
300         return Index( refNames.begin(), refNames.end(), refName );
301 }
302
303 // get next alignment (from specified region, if given)
304 bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
305
306         // if valid alignment available
307         if ( LoadNextAlignment(bAlignment) ) {
308                 
309                 // if region not specified, return success
310                 if ( !m_isRegionSpecified ) { return true; }
311                 
312                 // load next alignment until region overlap is found
313                 while ( !IsOverlap(bAlignment) ) {
314                         // if no valid alignment available (likely EOF) return failure
315                         if ( !LoadNextAlignment(bAlignment) ) { return false; }
316                 }
317                 
318                 // return success (alignment found that overlaps region)
319                 return true;
320         } 
321         
322         // no valid alignment
323         else { return false; }
324 }
325
326 int64_t BamReader::GetOffset(int refID, unsigned int left) {
327
328         // calculate which bins overlap this region
329         uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
330         int numBins = BinsFromRegion(refID, left, bins);
331
332         // get bins for this reference
333         RefIndex* refIndex = m_index->at(refID);
334         BinVector* refBins = refIndex->first;
335
336         // get minimum offset to consider
337         LinearOffsetVector* linearOffsets = refIndex->second;
338         uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
339
340         // store offsets to beginning of alignment 'chunks' 
341         std::vector<int64_t> chunkStarts;
342         
343         // reference bin iterators
344         BinVector::const_iterator binIter;
345         BinVector::const_iterator binBegin = refBins->begin();
346         BinVector::const_iterator binEnd   = refBins->end();
347         
348         // store all alignment 'chunk' starts for bins in this region
349         for (int i = 0; i < numBins; ++i ) {
350                 binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare<uint32_t, ChunkVector*>() );
351                 if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) {
352                         ChunkVector* binChunks = (*binIter).second;
353                         ChunkVector::const_iterator chunkIter = binChunks->begin();
354                         ChunkVector::const_iterator chunkEnd  = binChunks->end();
355                         for ( ; chunkIter != chunkEnd; ++chunkIter) {
356                                 if ( (*chunkIter).second > minOffset ) {
357                                         chunkStarts.push_back( (*chunkIter).first );
358                                 }       
359                         }
360                 }
361         }
362         
363         // clean up memory
364         free(bins);
365         
366         // if no alignments found
367         if ( chunkStarts.empty() ) { return -1; }
368         
369         // else return smallest offset for alignment starts 
370         else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
371 }
372
373 bool BamReader::IsOverlap(BamAlignment& bAlignment) {
374
375         // if on different reference sequence, quit
376         if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }
377
378         // read starts after left boundary
379         if ( bAlignment.Position >= m_currentLeft) { return true; }
380
381         // return whether alignment end overlaps left boundary
382         return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
383 }
384
385 bool BamReader::Jump(int refID, unsigned int left) {
386
387         // if index available, and region is valid
388         if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { 
389                 m_currentRefID = refID;
390                 m_currentLeft  = left;
391                 m_isRegionSpecified = true;
392                 
393                 int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
394                 if ( offset == -1 ) { return false; }
395                 else { return BgzfSeek(offset); }
396         }
397         return false;
398 }
399
400 void BamReader::LoadHeaderData(void) {
401         
402         // check to see if proper BAM header
403         char buffer[4];
404         if (BgzfRead(buffer, 4) != 4) { 
405                 printf("Could not read header type\n");
406                 exit(1); 
407         }
408         if (strncmp(buffer, "BAM\001", 4)) {
409                 printf("wrong header type!\n");
410                 exit(1);
411         }
412         
413         // get BAM header text length
414         BgzfRead(buffer, 4);
415         const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer);
416
417         // get BAM header text
418         char* headerText = (char*)calloc(headerTextLength + 1, 1);
419         BgzfRead(headerText, headerTextLength);
420         m_headerText = (string)((const char*)headerText);
421         
422         // clean up calloc-ed temp variable
423         free(headerText);
424 }
425
426 void BamReader::LoadIndexData(FILE* indexStream) {
427
428         // see if index is valid BAM index
429         char magic[4];
430         fread(magic, 1, 4, indexStream);
431         if (strncmp(magic, "BAI\1", 4)) {
432                 printf("Problem with index file - invalid format.\n");
433                 fclose(indexStream);
434                 exit(1);
435         }
436
437         // get number of reference sequences
438         uint32_t numRefSeqs;
439         fread(&numRefSeqs, 4, 1, indexStream);
440         
441         // intialize BamIndex data structure
442         m_index = new BamIndex;
443         m_index->reserve(numRefSeqs);
444
445         // iterate over reference sequences
446         for (unsigned int i = 0; i < numRefSeqs; ++i) {
447                 
448                 // get number of bins for this reference sequence
449                 int32_t numBins;
450                 fread(&numBins, 4, 1, indexStream);
451                 
452                 if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
453
454                 // intialize BinVector
455                 BinVector* bins = new BinVector;
456                 bins->reserve(numBins);
457                 
458                 // iterate over bins for that reference sequence
459                 for (int j = 0; j < numBins; ++j) {
460                         
461                         // get binID 
462                         uint32_t binID;
463                         fread(&binID, 4, 1, indexStream);
464                         
465                         // get number of regionChunks in this bin
466                         uint32_t numChunks;
467                         fread(&numChunks, 4, 1, indexStream);
468                         
469                         // intialize ChunkVector
470                         ChunkVector* regionChunks = new ChunkVector;
471                         regionChunks->reserve(numChunks);
472                         
473                         // iterate over regionChunks in this bin
474                         for (unsigned int k = 0; k < numChunks; ++k) {
475                                 
476                                 // get chunk boundaries (left, right) 
477                                 uint64_t left;
478                                 uint64_t right;
479                                 fread(&left, 8, 1, indexStream);
480                                 fread(&right, 8, 1, indexStream);
481                                 
482                                 // save ChunkPair
483                                 regionChunks->push_back( ChunkPair(left, right) );
484                         }
485                         
486                         // sort chunks for this bin
487                         sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare<uint64_t, uint64_t>() );
488
489                         // save binID, chunkVector for this bin
490                         bins->push_back( BamBin(binID, regionChunks) );
491                 }
492                 
493                 // sort bins by binID
494                 sort(bins->begin(), bins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
495
496                 // load linear index for this reference sequence
497                 
498                 // get number of linear offsets
499                 int32_t numLinearOffsets;
500                 fread(&numLinearOffsets, 4, 1, indexStream);
501                 
502                 // intialize LinearOffsetVector
503                 LinearOffsetVector* linearOffsets = new LinearOffsetVector;
504                 linearOffsets->reserve(numLinearOffsets);
505                 
506                 // iterate over linear offsets for this reference sequeence
507                 for (int j = 0; j < numLinearOffsets; ++j) {
508                         // get a linear offset
509                         uint64_t linearOffset;
510                         fread(&linearOffset, 8, 1, indexStream);
511                         // store linear offset
512                         linearOffsets->push_back(linearOffset);
513                 }
514                 
515                 // sort linear offsets
516                 sort( linearOffsets->begin(), linearOffsets->end() );
517
518                 // store index data for that reference sequence
519                 m_index->push_back( new RefIndex(bins, linearOffsets) );
520         }
521         
522         // close index file (.bai) and return
523         fclose(indexStream);
524 }
525
526 bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
527
528         // read in the 'block length' value, make sure it's not zero
529         char buffer[4];
530         BgzfRead(buffer, 4);
531         const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer);
532         if ( blockLength == 0 ) { return false; }
533
534         // keep track of bytes read as method progresses
535         int bytesRead = 4;
536
537         // read in core alignment data, make sure the right size of data was read 
538         char x[BAM_CORE_SIZE];
539         if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
540         bytesRead += BAM_CORE_SIZE;
541
542         // set BamAlignment 'core' data and character data lengths
543         unsigned int tempValue;
544         unsigned int queryNameLength;
545         unsigned int numCigarOperations;
546         unsigned int querySequenceLength;
547
548         bAlignment.RefID    = BgzfUnpackUnsignedInt(&x[0]);
549         bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]);
550
551         tempValue             = BgzfUnpackUnsignedInt(&x[8]);           
552         bAlignment.Bin        = tempValue >> 16;
553         bAlignment.MapQuality = tempValue >> 8 & 0xff;
554         queryNameLength       = tempValue & 0xff;
555
556         tempValue                = BgzfUnpackUnsignedInt(&x[12]);       
557         bAlignment.AlignmentFlag = tempValue >> 16;
558         numCigarOperations       = tempValue & 0xffff;
559
560         querySequenceLength     = BgzfUnpackUnsignedInt(&x[16]);
561         bAlignment.MateRefID    = BgzfUnpackUnsignedInt(&x[20]);
562         bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]);
563         bAlignment.InsertSize   = BgzfUnpackUnsignedInt(&x[28]);
564
565         // calculate lengths/offsets
566         const unsigned int dataLength      = blockLength - BAM_CORE_SIZE;
567         const unsigned int cigarDataOffset = queryNameLength;
568         const unsigned int seqDataOffset   = cigarDataOffset + (numCigarOperations * 4);
569         const unsigned int qualDataOffset  = seqDataOffset + (querySequenceLength+1)/2;
570         const unsigned int tagDataOffset   = qualDataOffset + querySequenceLength;
571         const unsigned int tagDataLen      = dataLength - tagDataOffset;
572         
573         // set up destination buffers for character data
574         char* allCharData   = (char*)calloc(sizeof(char), dataLength);
575         uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
576         char* seqData       = ((char*)allCharData) + seqDataOffset;
577         char* qualData      = ((char*)allCharData) + qualDataOffset;
578         char* tagData       = ((char*)allCharData) + tagDataOffset;
579         
580         // get character data - make sure proper data size was read
581         if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; }
582         else {
583                 
584                 bytesRead += dataLength;
585                 
586                 // clear out any previous string data
587                 bAlignment.Name.clear();
588                 bAlignment.QueryBases.clear();
589                 bAlignment.Qualities.clear();
590                 bAlignment.AlignedBases.clear();
591                 bAlignment.CigarData.clear();
592                 bAlignment.TagData.clear();
593                 
594                 // save name
595                 bAlignment.Name = (string)((const char*)(allCharData));
596                 
597                 // save query sequence
598                 for (unsigned int i = 0; i < querySequenceLength; ++i) {        
599                         char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
600                         bAlignment.QueryBases.append( 1, singleBase );
601                 }
602                 
603                 // save sequence length
604                 bAlignment.Length = bAlignment.QueryBases.length();
605                 
606                 // save qualities
607                 for (unsigned int i = 0; i < querySequenceLength; ++i) { 
608                         char singleQuality = (char)(qualData[i]+33);                    // conversion from QV to FASTQ character
609                         bAlignment.Qualities.append( 1, singleQuality );
610                 }
611                 
612                 // save CIGAR-related data;
613                 int k = 0;
614                 for (unsigned int i = 0; i < numCigarOperations; ++i) {
615                         
616                         // build CigarOp struct
617                         CigarOp op;
618                         op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
619                         op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
620                         
621                         // save CigarOp
622                         bAlignment.CigarData.push_back(op);
623                         
624                         // build AlignedBases string
625                         switch (op.Type) {
626                                 
627                                 case ('M') : 
628                                 case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) );      // for 'M', 'I' - write bases
629                                 case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases
630                                                          break;
631                                                          
632                                 case ('D') : 
633                                 case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'D', 'P' - write padding;
634                                                          break;
635                                                          
636                                 case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence
637                                                          k += op.Length;
638                                                          break;
639                                                          
640                                 case ('H') : break;                                                                                     // for 'H' - do nothing, move to next op
641                                                          
642                                 default    : printf("ERROR: Invalid Cigar op type\n");                  // shouldn't get here
643                                                          exit(1);
644                         }
645                 }
646                 
647                 // read in the tag data
648                 bAlignment.TagData.resize(tagDataLen);
649                 memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
650         }
651
652         free(allCharData);
653         return true;
654 }
655
656 void BamReader::LoadReferenceData(void) {
657
658         // get number of reference sequences
659         char buffer[4];
660         BgzfRead(buffer, 4);
661         const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer);
662         if (numberRefSeqs == 0) { return; }
663         m_references.reserve((int)numberRefSeqs);
664         
665         // iterate over all references in header
666         for (unsigned int i = 0; i != numberRefSeqs; ++i) {
667
668                 // get length of reference name
669                 BgzfRead(buffer, 4);
670                 const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer);
671                 char* refName = (char*)calloc(refNameLength, 1);
672                 
673                 // get reference name and reference sequence length
674                 BgzfRead(refName, refNameLength);
675                 BgzfRead(buffer, 4);
676                 const unsigned int refLength = BgzfUnpackUnsignedInt(buffer);
677                 
678                 // store data for reference
679                 RefData aReference;
680                 aReference.RefName   = (string)((const char*)refName);
681                 aReference.RefLength = refLength;
682                 m_references.push_back(aReference);
683                 
684                 // clean up calloc-ed temp variable
685                 free(refName);
686         }
687 }
688
689 // opens BAM file (and index)
690 void BamReader::Open(const string& filename, const string& indexFilename) {
691
692         // open the BGZF file for reading, retrieve header text & reference data
693         BgzfOpen(filename);
694         LoadHeaderData();       
695         LoadReferenceData();
696
697         // store file offset of first alignment
698         m_alignmentsBeginOffset = BgzfTell();
699
700         // open index file & load index data (if exists)
701         OpenIndex(indexFilename);
702 }
703
704 void BamReader::OpenIndex(const string& indexFilename) {
705
706         // if index file exists
707         if (!indexFilename.empty()) {
708
709                 // open index
710                 FILE* indexStream = fopen(indexFilename.c_str(), "rb");
711                 
712                 // abort on error
713                 if(!indexStream) {
714                         cerr << "ERROR: Unable to open the BAM index file " << indexFilename << "for reading." << endl;
715                         exit(1);
716                 }
717         
718                 // build up index data structure
719                 LoadIndexData(indexStream);
720         }
721 }
722
723 bool BamReader::Rewind(void) {
724
725         // find first reference that has alignments in the BAM file
726         int refID = 0;
727         int refCount = m_references.size();
728         for ( ; refID < refCount; ++refID ) {
729                 if ( m_references.at(refID).RefHasAlignments ) { break; } 
730         }
731
732         // store default bounds for first alignment
733         m_currentRefID = refID;
734         m_currentLeft = 0;
735         m_isRegionSpecified = false;
736
737         // return success/failure of seek
738         return BgzfSeek(m_alignmentsBeginOffset);
739 }