]> git.donarmstrong.com Git - bamtools.git/blob - BamReader.cpp
Full update to SVN after combining BamReader and BamWriter into cohesive BamTools...
[bamtools.git] / BamReader.cpp
1 // ***************************************************************************
2 // BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 15 July 2009 (DB)
7 // ---------------------------------------------------------------------------
8 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
9 // Institute.
10 // ---------------------------------------------------------------------------
11 // Provides the basic functionality for reading BAM files
12 // ***************************************************************************
13
14 // BamTools includes
15 #include "BamReader.h"
16 using namespace BamTools;
17 using namespace std;
18
19 // static character constants
20 const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";
21 const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
22
23 // constructor
24 BamReader::BamReader(void)
25         : m_index(NULL)
26         , m_isIndexLoaded(false)
27         , m_alignmentsBeginOffset(0)
28         , m_isRegionSpecified(false)
29         , m_currentRefID(0)
30         , m_currentLeft(0)
31 { }
32
33 // destructor
34 BamReader::~BamReader(void) {
35         Close();
36 }
37
38 // checks BGZF block header
39 bool BamReader::BgzfCheckBlockHeader(char* header) {
40
41         return (header[0] == GZIP_ID1 &&
42             header[1] == (char)GZIP_ID2 &&
43             header[2] == Z_DEFLATED &&
44             (header[3] & FLG_FEXTRA) != 0 &&
45             BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
46             header[12] == BGZF_ID1 &&
47             header[13] == BGZF_ID2 &&
48             BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN
49                    );
50 }
51
52 // closes the BAM file
53 void BamReader::BgzfClose(void) {
54         m_BGZF.IsOpen = false;
55         fclose(m_BGZF.Stream);
56 }
57
58 // de-compresses the current block
59 int BamReader::BgzfInflateBlock(int blockLength) {
60         
61         // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
62     z_stream zs;
63     zs.zalloc    = NULL;
64     zs.zfree     = NULL;
65     zs.next_in   = (Bytef*)m_BGZF.CompressedBlock + 18;
66     zs.avail_in  = blockLength - 16;
67     zs.next_out  = (Bytef*)m_BGZF.UncompressedBlock;
68     zs.avail_out = m_BGZF.UncompressedBlockSize;
69
70     int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
71     if (status != Z_OK) {
72         printf("inflateInit failed\n");
73         exit(1);
74     }
75
76     status = inflate(&zs, Z_FINISH);
77     if (status != Z_STREAM_END) {
78         inflateEnd(&zs);
79         printf("inflate failed\n");
80         exit(1);
81     }
82
83     status = inflateEnd(&zs);
84     if (status != Z_OK) {
85         printf("inflateEnd failed\n");
86         exit(1);
87     }
88
89     return zs.total_out;
90 }
91
92 // opens the BAM file for reading
93 void BamReader::BgzfOpen(const string& filename) {
94
95         m_BGZF.Stream = fopen(filename.c_str(), "rb");
96         if(!m_BGZF.Stream) {
97                 printf("ERROR: Unable to open the BAM file %s for reading.\n", filename.c_str() );
98                 exit(1);
99         }
100
101         m_BGZF.IsOpen = true;
102 }
103
104 // reads BGZF data into buffer
105 unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) {
106
107     if (dataLength == 0) { return 0; }
108
109         char* output = data;
110     unsigned int numBytesRead = 0;
111     while (numBytesRead < dataLength) {
112
113         int bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
114         if (bytesAvailable <= 0) {
115             if ( BgzfReadBlock() != 0 ) { return -1; }
116             bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
117             if ( bytesAvailable <= 0 ) { break; }
118         }
119
120                 char* buffer   = m_BGZF.UncompressedBlock;
121         int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
122         memcpy(output, buffer + m_BGZF.BlockOffset, copyLength);
123
124         m_BGZF.BlockOffset += copyLength;
125         output             += copyLength;
126         numBytesRead       += copyLength;
127     }
128
129     if ( m_BGZF.BlockOffset == m_BGZF.BlockLength ) {
130                 m_BGZF.BlockAddress = ftello64(m_BGZF.Stream);                                          
131         m_BGZF.BlockOffset  = 0;
132         m_BGZF.BlockLength  = 0;
133     }
134
135         return numBytesRead;
136 }
137
138 int BamReader::BgzfReadBlock(void) {
139
140     char    header[BLOCK_HEADER_LENGTH];
141     int64_t blockAddress = ftello(m_BGZF.Stream);
142
143     int count = fread(header, 1, sizeof(header), m_BGZF.Stream);
144         if (count == 0) {
145         m_BGZF.BlockLength = 0;
146         return 0;
147     }
148
149     if (count != sizeof(header)) {
150         printf("read block failed - count != sizeof(header)\n");
151         return -1;
152     }
153
154     if (!BgzfCheckBlockHeader(header)) {
155         printf("read block failed - CheckBgzfBlockHeader() returned false\n");
156         return -1;
157     }
158
159     int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1;
160     char* compressedBlock = m_BGZF.CompressedBlock;
161     memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
162     int remaining = blockLength - BLOCK_HEADER_LENGTH;
163
164     count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF.Stream);
165     if (count != remaining) {
166         printf("read block failed - count != remaining\n");
167         return -1;
168     }
169
170     count = BgzfInflateBlock(blockLength);
171     if (count < 0) { return -1; }
172
173     if (m_BGZF.BlockLength != 0) {
174         m_BGZF.BlockOffset = 0;
175     }
176
177     m_BGZF.BlockAddress = blockAddress;
178     m_BGZF.BlockLength  = count;
179     return 0;
180 }
181
182 // move file pointer to specified offset
183 bool BamReader::BgzfSeek(int64_t position) {
184
185         int     blockOffset  = (position & 0xFFFF);
186     int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
187     if (fseeko(m_BGZF.Stream, blockAddress, SEEK_SET) != 0) {
188         printf("ERROR: Unable to seek in BAM file\n");
189                 exit(1);
190     }
191
192     m_BGZF.BlockLength  = 0;
193     m_BGZF.BlockAddress = blockAddress;
194     m_BGZF.BlockOffset  = blockOffset;
195         return true;
196 }
197
198 // get file position in BAM file
199 int64_t BamReader::BgzfTell(void) {
200         return ( (m_BGZF.BlockAddress << 16) | (m_BGZF.BlockOffset & 0xFFFF) );
201 }
202
203 int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
204
205         // get region boundaries
206         uint32_t begin = left;
207         uint32_t end   = m_references.at(refID).RefLength - 1;
208
209         // initialize list, bin '0' always a valid bin
210         int i = 0;
211         list[i++] = 0;
212
213         // get rest of bins that contain this region
214         unsigned int k;
215         for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { list[i++] = k; }
216         for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { list[i++] = k; }
217         for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { list[i++] = k; }
218         for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { list[i++] = k; }
219         for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
220         
221         // return number of bins stored
222         return i;
223 }
224
225 unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
226
227         // initialize alignment end to starting position
228         unsigned int alignEnd = position;
229
230         // iterate over cigar operations
231         vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
232         vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
233         for ( ; cigarIter != cigarEnd; ++cigarIter) {
234                 char cigarType = (*cigarIter).Type;
235                 if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
236                         alignEnd += (*cigarIter).Length;
237                 }
238         }
239         return alignEnd;
240 }
241
242 void BamReader::ClearIndex(void) {
243
244         if ( m_index ) {
245                 // iterate over references
246                 vector<RefIndex*>::iterator refIter = m_index->begin();
247                 vector<RefIndex*>::iterator refEnd  = m_index->end();
248                 for ( ; refIter != refEnd; ++refIter) {
249                         RefIndex* aRef = (*refIter);
250                         if ( aRef ) {
251                                 // clear out BAM bins
252                                 if ( aRef->first ) {
253                                         BinVector::iterator binIter = (aRef->first)->begin();
254                                         BinVector::iterator binEnd  = (aRef->first)->end();
255                                         for ( ; binIter != binEnd; ++binIter ) {
256                                                 ChunkVector* chunks = (*binIter).second;
257                                                 if ( chunks ) { delete chunks; chunks = NULL;}
258                                         }
259                                         delete aRef->first;
260                                         aRef->first = NULL;
261                                 }
262                                 // clear BAM linear offsets
263                                 if ( aRef->second ) { delete aRef->second; aRef->second = NULL; }
264                                 delete aRef;
265                                 aRef = NULL;
266                         }
267                 }
268                 delete m_index;
269                 m_index = NULL;
270         }
271 }
272
273 // closes the BAM file
274 void BamReader::Close(void) {
275         if(m_BGZF.IsOpen) { BgzfClose(); }      
276         ClearIndex();
277         m_headerText.clear();
278         m_isRegionSpecified = false;
279 }
280
281 const string BamReader::GetHeaderText(void) const {
282         return m_headerText;
283 }
284
285 const int BamReader::GetReferenceCount(void) const {
286         return m_references.size();
287 }
288
289 const RefVector BamReader::GetReferenceData(void) const {
290         return m_references;
291 }
292
293 const int BamReader::GetReferenceID(const string& refName) const {
294
295         // retrieve names from reference data
296         vector<string> refNames;
297         RefVector::const_iterator refIter = m_references.begin();
298     RefVector::const_iterator refEnd  = m_references.end();
299     for ( ; refIter != refEnd; ++refIter) {
300                 refNames.push_back( (*refIter).RefName );
301     }
302
303         // return 'index-of' refName ( if not found, returns refNames.size() )
304         return Index( refNames.begin(), refNames.end(), refName );
305 }
306
307 // get next alignment (from specified region, if given)
308 bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
309
310         // if valid alignment available
311         if ( LoadNextAlignment(bAlignment) ) {
312                 
313                 // if region not specified, return success
314                 if ( !m_isRegionSpecified ) { return true; }
315                 
316                 // load next alignment until region overlap is found
317                 while ( !IsOverlap(bAlignment) ) {
318                         // if no valid alignment available (likely EOF) return failure
319                         if ( !LoadNextAlignment(bAlignment) ) { return false; }
320                 }
321                 
322                 // return success (alignment found that overlaps region)
323                 return true;
324         } 
325         
326         // no valid alignment
327         else { return false; }
328 }
329
330 int64_t BamReader::GetOffset(int refID, unsigned int left) {
331
332         // calculate which bins overlap this region
333         uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
334         int numBins = BinsFromRegion(refID, left, bins);
335
336         // get bins for this reference
337         RefIndex* refIndex = m_index->at(refID);
338         BinVector* refBins = refIndex->first;
339
340         // get minimum offset to consider
341         LinearOffsetVector* linearOffsets = refIndex->second;
342         uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
343
344         // store offsets to beginning of alignment 'chunks' 
345         std::vector<int64_t> chunkStarts;
346         
347         // reference bin iterators
348         BinVector::const_iterator binIter;
349         BinVector::const_iterator binBegin = refBins->begin();
350         BinVector::const_iterator binEnd   = refBins->end();
351         
352         // store all alignment 'chunk' starts for bins in this region
353         for (int i = 0; i < numBins; ++i ) {
354                 binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare<uint32_t, ChunkVector*>() );
355                 if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) {
356                         ChunkVector* binChunks = (*binIter).second;
357                         ChunkVector::const_iterator chunkIter = binChunks->begin();
358                         ChunkVector::const_iterator chunkEnd  = binChunks->end();
359                         for ( ; chunkIter != chunkEnd; ++chunkIter) {
360                                 if ( (*chunkIter).second > minOffset ) {
361                                         chunkStarts.push_back( (*chunkIter).first );
362                                 }       
363                         }
364                 }
365         }
366         
367         // clean up memory
368         free(bins);
369         
370         // if no alignments found
371         if ( chunkStarts.empty() ) { return -1; }
372         
373         // else return smallest offset for alignment starts 
374         else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
375 }
376
377 bool BamReader::IsOverlap(BamAlignment& bAlignment) {
378
379         // if on different reference sequence, quit
380         if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }
381
382         // read starts after left boundary
383         if ( bAlignment.Position >= m_currentLeft) { return true; }
384
385         // return whether alignment end overlaps left boundary
386         return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
387 }
388
389 bool BamReader::Jump(int refID, unsigned int position) {
390
391         // if index available, and region is valid
392         if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (position <= m_references.at(refID).RefLength) ) { 
393                 m_currentRefID = refID;
394                 m_currentLeft  = position;
395                 m_isRegionSpecified = true;
396                 
397                 int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
398                 if ( offset == -1 ) { return false; }
399                 else { return BgzfSeek(offset); }
400         }
401         return false;
402 }
403
404 void BamReader::LoadHeaderData(void) {
405         
406         // check to see if proper BAM header
407         char buffer[4];
408         if (BgzfRead(buffer, 4) != 4) { 
409                 printf("Could not read header type\n");
410                 exit(1); 
411         }
412         if (strncmp(buffer, "BAM\001", 4)) {
413                 printf("wrong header type!\n");
414                 exit(1);
415         }
416         
417         // get BAM header text length
418         BgzfRead(buffer, 4);
419         const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer);
420
421         // get BAM header text
422         char* headerText = (char*)calloc(headerTextLength + 1, 1);
423         BgzfRead(headerText, headerTextLength);
424         m_headerText = (string)((const char*)headerText);
425         
426         // clean up calloc-ed temp variable
427         free(headerText);
428 }
429
430 void BamReader::LoadIndexData(FILE* indexStream) {
431
432         // see if index is valid BAM index
433         char magic[4];
434         fread(magic, 1, 4, indexStream);
435         if (strncmp(magic, "BAI\1", 4)) {
436                 printf("Problem with index file - invalid format.\n");
437                 fclose(indexStream);
438                 exit(1);
439         }
440
441         // get number of reference sequences
442         uint32_t numRefSeqs;
443         fread(&numRefSeqs, 4, 1, indexStream);
444         
445         // intialize BamIndex data structure
446         m_index = new BamIndex;
447         m_index->reserve(numRefSeqs);
448
449         // iterate over reference sequences
450         for (unsigned int i = 0; i < numRefSeqs; ++i) {
451                 
452                 // get number of bins for this reference sequence
453                 int32_t numBins;
454                 fread(&numBins, 4, 1, indexStream);
455                 
456                 if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
457
458                 // intialize BinVector
459                 BinVector* bins = new BinVector;
460                 bins->reserve(numBins);
461                 
462                 // iterate over bins for that reference sequence
463                 for (int j = 0; j < numBins; ++j) {
464                         
465                         // get binID 
466                         uint32_t binID;
467                         fread(&binID, 4, 1, indexStream);
468                         
469                         // get number of regionChunks in this bin
470                         uint32_t numChunks;
471                         fread(&numChunks, 4, 1, indexStream);
472                         
473                         // intialize ChunkVector
474                         ChunkVector* regionChunks = new ChunkVector;
475                         regionChunks->reserve(numChunks);
476                         
477                         // iterate over regionChunks in this bin
478                         for (unsigned int k = 0; k < numChunks; ++k) {
479                                 
480                                 // get chunk boundaries (left, right) 
481                                 uint64_t left;
482                                 uint64_t right;
483                                 fread(&left, 8, 1, indexStream);
484                                 fread(&right, 8, 1, indexStream);
485                                 
486                                 // save ChunkPair
487                                 regionChunks->push_back( ChunkPair(left, right) );
488                         }
489                         
490                         // sort chunks for this bin
491                         sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare<uint64_t, uint64_t>() );
492
493                         // save binID, chunkVector for this bin
494                         bins->push_back( BamBin(binID, regionChunks) );
495                 }
496                 
497                 // sort bins by binID
498                 sort(bins->begin(), bins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
499
500                 // load linear index for this reference sequence
501                 
502                 // get number of linear offsets
503                 int32_t numLinearOffsets;
504                 fread(&numLinearOffsets, 4, 1, indexStream);
505                 
506                 // intialize LinearOffsetVector
507                 LinearOffsetVector* linearOffsets = new LinearOffsetVector;
508                 linearOffsets->reserve(numLinearOffsets);
509                 
510                 // iterate over linear offsets for this reference sequeence
511                 for (int j = 0; j < numLinearOffsets; ++j) {
512                         // get a linear offset
513                         uint64_t linearOffset;
514                         fread(&linearOffset, 8, 1, indexStream);
515                         // store linear offset
516                         linearOffsets->push_back(linearOffset);
517                 }
518                 
519                 // sort linear offsets
520                 sort( linearOffsets->begin(), linearOffsets->end() );
521
522                 // store index data for that reference sequence
523                 m_index->push_back( new RefIndex(bins, linearOffsets) );
524         }
525         
526         // close index file (.bai) and return
527         fclose(indexStream);
528 }
529
530 bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
531
532         // read in the 'block length' value, make sure it's not zero
533         char buffer[4];
534         BgzfRead(buffer, 4);
535         const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer);
536         if ( blockLength == 0 ) { return false; }
537
538         // keep track of bytes read as method progresses
539         int bytesRead = 4;
540
541         // read in core alignment data, make sure the right size of data was read 
542         char x[BAM_CORE_SIZE];
543         if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
544         bytesRead += BAM_CORE_SIZE;
545
546         // set BamAlignment 'core' data and character data lengths
547         unsigned int tempValue;
548         unsigned int queryNameLength;
549         unsigned int numCigarOperations;
550         unsigned int querySequenceLength;
551
552         bAlignment.RefID    = BgzfUnpackUnsignedInt(&x[0]);
553         bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]);
554
555         tempValue             = BgzfUnpackUnsignedInt(&x[8]);           
556         bAlignment.Bin        = tempValue >> 16;
557         bAlignment.MapQuality = tempValue >> 8 & 0xff;
558         queryNameLength       = tempValue & 0xff;
559
560         tempValue                = BgzfUnpackUnsignedInt(&x[12]);       
561         bAlignment.AlignmentFlag = tempValue >> 16;
562         numCigarOperations       = tempValue & 0xffff;
563
564         querySequenceLength     = BgzfUnpackUnsignedInt(&x[16]);
565         bAlignment.MateRefID    = BgzfUnpackUnsignedInt(&x[20]);
566         bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]);
567         bAlignment.InsertSize   = BgzfUnpackUnsignedInt(&x[28]);
568
569         // calculate lengths/offsets
570         const unsigned int dataLength      = blockLength - BAM_CORE_SIZE;
571         const unsigned int cigarDataOffset = queryNameLength;
572         const unsigned int seqDataOffset   = cigarDataOffset + (numCigarOperations * 4);
573         const unsigned int qualDataOffset  = seqDataOffset + (querySequenceLength+1)/2;
574         const unsigned int tagDataOffset   = qualDataOffset + querySequenceLength;
575         const unsigned int tagDataLen      = dataLength - tagDataOffset;
576         
577         // set up destination buffers for character data
578         char* allCharData   = (char*)calloc(sizeof(char), dataLength);
579         uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
580         char* seqData       = ((char*)allCharData) + seqDataOffset;
581         char* qualData      = ((char*)allCharData) + qualDataOffset;
582         char* tagData       = ((char*)allCharData) + tagDataOffset;
583         
584         // get character data - make sure proper data size was read
585         if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; }
586         else {
587                 
588                 bytesRead += dataLength;
589                 
590                 // clear out any previous string data
591                 bAlignment.Name.clear();
592                 bAlignment.QueryBases.clear();
593                 bAlignment.Qualities.clear();
594                 bAlignment.AlignedBases.clear();
595                 bAlignment.CigarData.clear();
596                 bAlignment.TagData.clear();
597                 
598                 // save name
599                 bAlignment.Name = (string)((const char*)(allCharData));
600                 
601                 // save query sequence
602                 for (unsigned int i = 0; i < querySequenceLength; ++i) {        
603                         char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
604                         bAlignment.QueryBases.append( 1, singleBase );
605                 }
606                 
607                 // save sequence length
608                 bAlignment.Length = bAlignment.QueryBases.length();
609                 
610                 // save qualities
611                 for (unsigned int i = 0; i < querySequenceLength; ++i) { 
612                         char singleQuality = (char)(qualData[i]+33);                    // conversion from QV to FASTQ character
613                         bAlignment.Qualities.append( 1, singleQuality );
614                 }
615                 
616                 // save CIGAR-related data;
617                 int k = 0;
618                 for (unsigned int i = 0; i < numCigarOperations; ++i) {
619                         
620                         // build CigarOp struct
621                         CigarOp op;
622                         op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
623                         op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
624                         
625                         // save CigarOp
626                         bAlignment.CigarData.push_back(op);
627                         
628                         // build AlignedBases string
629                         switch (op.Type) {
630                                 
631                                 case ('M') : 
632                                 case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) );      // for 'M', 'I' - write bases
633                                 case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases
634                                                          break;
635                                                          
636                                 case ('D') : bAlignment.AlignedBases.append( op.Length, '-' );  // for 'D' - write gap character
637                                                          break;
638                                                         
639                                 case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'P' - write padding character;
640                                                          break;
641                                                          
642                                 case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence
643                                                          k += op.Length;
644                                                          break;
645                                                          
646                                 case ('H') : break;                                                                                     // for 'H' - do nothing, move to next op
647                                                          
648                                 default    : printf("ERROR: Invalid Cigar op type\n");                  // shouldn't get here
649                                                          exit(1);
650                         }
651                 }
652                 
653                 // read in the tag data
654                 bAlignment.TagData.resize(tagDataLen);
655                 memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
656         }
657
658         free(allCharData);
659         return true;
660 }
661
662 void BamReader::LoadReferenceData(void) {
663
664         // get number of reference sequences
665         char buffer[4];
666         BgzfRead(buffer, 4);
667         const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer);
668         if (numberRefSeqs == 0) { return; }
669         m_references.reserve((int)numberRefSeqs);
670         
671         // iterate over all references in header
672         for (unsigned int i = 0; i != numberRefSeqs; ++i) {
673
674                 // get length of reference name
675                 BgzfRead(buffer, 4);
676                 const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer);
677                 char* refName = (char*)calloc(refNameLength, 1);
678                 
679                 // get reference name and reference sequence length
680                 BgzfRead(refName, refNameLength);
681                 BgzfRead(buffer, 4);
682                 const unsigned int refLength = BgzfUnpackUnsignedInt(buffer);
683                 
684                 // store data for reference
685                 RefData aReference;
686                 aReference.RefName   = (string)((const char*)refName);
687                 aReference.RefLength = refLength;
688                 m_references.push_back(aReference);
689                 
690                 // clean up calloc-ed temp variable
691                 free(refName);
692         }
693 }
694
695 // opens BAM file (and index)
696 void BamReader::Open(const string& filename, const string& indexFilename) {
697
698         // open the BGZF file for reading, retrieve header text & reference data
699         BgzfOpen(filename);
700         LoadHeaderData();       
701         LoadReferenceData();
702
703         // store file offset of first alignment
704         m_alignmentsBeginOffset = BgzfTell();
705
706         // open index file & load index data (if exists)
707         OpenIndex(indexFilename);
708 }
709
710 void BamReader::OpenIndex(const string& indexFilename) {
711
712         // if index file exists
713         if (!indexFilename.empty()) {
714
715                 // open index
716                 FILE* indexStream = fopen(indexFilename.c_str(), "rb");
717                 
718                 // abort on error
719                 if(!indexStream) {
720                         printf("ERROR: Unable to open the BAM index file %s for reading.\n", indexFilename.c_str() );
721                         exit(1);
722                 }
723         
724                 // build up index data structure
725                 LoadIndexData(indexStream);
726         }
727 }
728
729 bool BamReader::Rewind(void) {
730
731         // find first reference that has alignments in the BAM file
732         int refID = 0;
733         int refCount = m_references.size();
734         for ( ; refID < refCount; ++refID ) {
735                 if ( m_references.at(refID).RefHasAlignments ) { break; } 
736         }
737
738         // store default bounds for first alignment
739         m_currentRefID = refID;
740         m_currentLeft = 0;
741         m_isRegionSpecified = false;
742
743         // return success/failure of seek
744         return BgzfSeek(m_alignmentsBeginOffset);
745 }