-// BamReader.cpp\r
-\r
-// Derek Barnett\r
-// Marth Lab, Boston College\r
-// Last modified: 6 April 2009\r
-\r
-#include "BamReader.h"\r
-#include <iostream>\r
-using std::cerr;\r
-using std::endl;\r
-\r
-// static character constants\r
-const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN";\r
-const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";\r
-\r
-BamReader::BamReader(const char* filename, const char* indexFilename) \r
- : m_filename( (char*)filename )\r
- , m_indexFilename( (char*)indexFilename )\r
- , m_file(0)\r
- , m_index(NULL)\r
- , m_headerText("")\r
- , m_isOpen(false)\r
- , m_isIndexLoaded(false)\r
- , m_isRegionSpecified(false)\r
- , m_isCalculateAlignedBases(true)\r
- , m_currentRefID(0)\r
- , m_currentLeft(0)\r
- , m_alignmentsBeginOffset(0)\r
-{\r
- Open();\r
-}\r
-\r
-BamReader::~BamReader(void) {\r
-\r
- // close file\r
- if ( m_isOpen ) { Close(); }\r
-}\r
-\r
-// open BAM file\r
-bool BamReader::Open(void) {\r
-\r
- if (!m_isOpen && m_filename != NULL ) {\r
-\r
- // open file\r
- m_file = bam_open(m_filename, "r"); \r
- \r
- // get header info && index info\r
- if ( (m_file != NULL) && LoadHeader() ) {\r
-\r
- // save file offset where alignments start\r
- m_alignmentsBeginOffset = bam_tell(m_file);\r
- \r
- // set open flag\r
- m_isOpen = true;\r
- }\r
-\r
- // try to open (and load) index data, if index file given\r
- if ( m_indexFilename != NULL ) {\r
- OpenIndex();\r
- }\r
- }\r
-\r
- return m_isOpen;\r
-}\r
-\r
-bool BamReader::OpenIndex(void) {\r
-\r
- if ( m_indexFilename && !m_isIndexLoaded ) {\r
- m_isIndexLoaded = LoadIndex();\r
- }\r
- return m_isIndexLoaded;\r
-}\r
-\r
-// close BAM file\r
-bool BamReader::Close(void) {\r
- \r
- if (m_isOpen) {\r
-\r
- // close file\r
- int ret = bam_close(m_file);\r
- \r
- // delete index info\r
- if ( m_index != NULL) { delete m_index; }\r
-\r
- // clear open flag\r
- m_isOpen = false;\r
-\r
- // clear index flag\r
- m_isIndexLoaded = false;\r
-\r
- // clear region flag\r
- m_isRegionSpecified = false;\r
-\r
- // return success/fail of bam_close\r
- return (ret == 0);\r
- } \r
-\r
- return true;\r
-}\r
-\r
-// get BAM filename\r
-const char* BamReader::Filename(void) const { \r
- return (const char*)m_filename; \r
-}\r
-\r
-// set BAM filename\r
-void BamReader::SetFilename(const char* filename) {\r
- m_filename = (char*)filename;\r
-}\r
-\r
-// get BAM Index filename\r
-const char* BamReader::IndexFilename(void) const { \r
- return (const char*)m_indexFilename; \r
-}\r
-\r
-// set BAM Index filename\r
-void BamReader::SetIndexFilename(const char* indexFilename) {\r
- m_indexFilename = (char*)indexFilename;\r
-}\r
-\r
-// return full header text\r
-const string BamReader::GetHeaderText(void) const { \r
- return m_headerText; \r
-}\r
-\r
-// return number of reference sequences in BAM file\r
-const int BamReader::GetReferenceCount(void) const { \r
- return m_references.size();\r
-}\r
-\r
-// get RefID from reference name\r
-const int BamReader::GetRefID(string refName) const { \r
- \r
- vector<string> refNames;\r
- RefVector::const_iterator refIter = m_references.begin();\r
- RefVector::const_iterator refEnd = m_references.end();\r
- for ( ; refIter != refEnd; ++refIter) {\r
- refNames.push_back( (*refIter).RefName );\r
- }\r
-\r
- // return 'index-of' refName (if not found, returns refNames.size())\r
- return Index( refNames.begin(), refNames.end(), refName );\r
-}\r
-\r
-const RefVector BamReader::GetReferenceData(void) const {\r
- return m_references;\r
-}\r
-\r
-bool BamReader::Jump(int refID, unsigned int left) {\r
-\r
- // if index available, and region is valid\r
- if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { \r
- m_currentRefID = refID;\r
- m_currentLeft = left;\r
- m_isRegionSpecified = true;\r
- return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 );\r
- }\r
- return false;\r
-}\r
-\r
-bool BamReader::Rewind(void) {\r
-\r
- int refID = 0;\r
- int refCount = m_references.size();\r
- for ( ; refID < refCount; ++refID ) {\r
- if ( m_references.at(refID).RefHasAlignments ) { break; } \r
- }\r
-\r
- m_currentRefID = refID;\r
- m_currentLeft = 0;\r
- m_isRegionSpecified = false;\r
-\r
- return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );\r
-} \r
-\r
-// get next alignment from specified region\r
-bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {\r
-\r
- // try to load 'next' read\r
- if ( LoadNextAlignment(bAlignment) ) {\r
-\r
- // if specified region, check for region overlap\r
- if ( m_isRegionSpecified ) {\r
-\r
- // if overlap, return true\r
- if ( IsOverlap(bAlignment) ) { return true; }\r
- // if not, try the next alignment\r
- else { return GetNextAlignment(bAlignment); }\r
- } \r
-\r
- // not using region, valid read detected, return success\r
- else { return true; }\r
- }\r
-\r
- // no valid alignment to load\r
- return false;\r
-}\r
-\r
-void BamReader::SetCalculateAlignedBases(bool flag) {\r
- m_isCalculateAlignedBases = flag;\r
-}\r
-\r
-int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {\r
-\r
- // get region boundaries\r
- uint32_t begin = left;\r
- uint32_t end = m_references.at(refID).RefLength - 1;\r
-\r
- // initialize list, bin '0' always a valid bin\r
- int i = 0;\r
- list[i++] = 0;\r
-\r
- // get rest of bins that contain this region\r
- unsigned int k;\r
- for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }\r
- for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }\r
- for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }\r
- for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }\r
- for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }\r
- \r
- // return number of bins stored\r
- return i;\r
-}\r
-\r
-uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {\r
-\r
- // initialize alignment end to starting position\r
- uint32_t alignEnd = position;\r
-\r
- // iterate over cigar operations\r
- vector<CigarOp>::const_iterator cigarIter = cigarData.begin();\r
- vector<CigarOp>::const_iterator cigarEnd = cigarData.end();\r
- for ( ; cigarIter != cigarEnd; ++cigarIter) {\r
- if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {\r
- alignEnd += (*cigarIter).Length;\r
- }\r
- }\r
- return alignEnd;\r
-}\r
-\r
-uint64_t BamReader::GetOffset(int refID, unsigned int left) {\r
-\r
- // make space for bins\r
- uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); \r
- \r
- // returns number of bins overlapping (left, right)\r
- // stores indices of those bins in 'bins'\r
- int numBins = BinsFromRegion(refID, left, bins); \r
-\r
- // access index data for refID\r
- RefIndex* refIndex = m_index->at(refID);\r
-\r
- // get list of BAM bins for this reference sequence\r
- BinVector* refBins = refIndex->first;\r
-\r
- sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );\r
-\r
- // declare ChunkVector\r
- ChunkVector regionChunks;\r
-\r
- // declaure LinearOffsetVector\r
- LinearOffsetVector* linearOffsets = refIndex->second;\r
-\r
- // some sort of linear offset vs bin index voodoo... not completely sure what's going here\r
- uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);\r
-\r
- BinVector::iterator binBegin = refBins->begin();\r
- BinVector::iterator binEnd = refBins->end();\r
-\r
- // iterate over bins overlapping region, count chunks\r
- for (int i = 0; i < numBins; ++i) {\r
- \r
- // look for bin with ID=bin[i]\r
- BinVector::iterator binIter = binBegin;\r
-\r
- for ( ; binIter != binEnd; ++binIter ) {\r
- \r
- // if found, increment n_off by number of chunks for each bin\r
- if ( (*binIter).first == (uint32_t)bins[i] ) { \r
- \r
- // iterate over chunks in that bin\r
- ChunkVector* binChunks = (*binIter).second;\r
- \r
- ChunkVector::iterator chunkIter = binChunks->begin();\r
- ChunkVector::iterator chunkEnd = binChunks->end();\r
- for ( ; chunkIter != chunkEnd; ++chunkIter) {\r
- \r
- // if right bound of pair is greater than min_off (linear offset value), store pair\r
- if ( (*chunkIter).second > minOffset) { \r
- regionChunks.push_back( (*chunkIter) ); \r
- }\r
- }\r
- }\r
- }\r
- }\r
-\r
- // clean up temp array\r
- free(bins);\r
-\r
- // there should be at least 1\r
- assert(regionChunks.size() > 0);\r
-\r
- // sort chunks by start position\r
- sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );\r
-\r
- // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing\r
- int numOffsets = regionChunks.size(); \r
- for (int i = 1; i < numOffsets; ++i) {\r
- if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {\r
- regionChunks.at(i-1).second = regionChunks.at(i).first;\r
- }\r
- }\r
- \r
- // merge adjacent chunks\r
- int l = 0;\r
- for (int i = 1; i < numOffsets; ++i) {\r
- // if adjacent, expand boundaries of (merged) chunk\r
- if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {\r
- regionChunks.at(l).second = regionChunks.at(i).second;\r
- }\r
- // else, move on to next (merged) chunk index\r
- else { regionChunks.at(++l) = regionChunks.at(i); }\r
- }\r
-\r
- // return beginning file offset of first chunk for region\r
- return regionChunks.at(0).first;\r
-}\r
-\r
-bool BamReader::IsOverlap(BamAlignment& bAlignment) {\r
-\r
- // if on different reference sequence, quit\r
- if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }\r
-\r
- // read starts after left boundary\r
- if ( bAlignment.Position >= m_currentLeft) { return true; }\r
-\r
- // get alignment end\r
- uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);\r
-\r
- // return whether alignment end overlaps left boundary\r
- return ( alignEnd >= m_currentLeft );\r
-}\r
-\r
-bool BamReader::LoadHeader(void) {\r
-\r
- // check to see if proper BAM header\r
- char buf[4];\r
- if (bam_read(m_file, buf, 4) != 4) { return false; }\r
- if (strncmp(buf, "BAM\001", 4)) {\r
- fprintf(stderr, "wrong header type!\n");\r
- return false;\r
- }\r
- \r
- // get BAM header text length\r
- int32_t headerTextLength;\r
- bam_read(m_file, &headerTextLength, 4);\r
-\r
- // get BAM header text\r
- char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
- bam_read(m_file, headerText, headerTextLength);\r
- m_headerText = (string)((const char*)headerText);\r
- \r
- // clean up calloc-ed temp variable\r
- free(headerText);\r
-\r
- // get number of reference sequences\r
- int32_t numberRefSeqs;\r
- bam_read(m_file, &numberRefSeqs, 4);\r
- if (numberRefSeqs == 0) { return false; }\r
-\r
- m_references.reserve((int)numberRefSeqs);\r
- \r
- // reference variables\r
- int32_t refNameLength;\r
- char* refName;\r
- uint32_t refLength;\r
-\r
- // iterate over all references in header\r
- for (int i = 0; i != numberRefSeqs; ++i) {\r
-\r
- // get length of reference name\r
- bam_read(m_file, &refNameLength, 4);\r
- refName = (char*)calloc(refNameLength, 1);\r
-\r
- // get reference name and reference sequence length\r
- bam_read(m_file, refName, refNameLength);\r
- bam_read(m_file, &refLength, 4);\r
-\r
- // store data for reference\r
- RefData aReference;\r
- aReference.RefName = (string)((const char*)refName);\r
- aReference.RefLength = refLength;\r
- m_references.push_back(aReference);\r
-\r
- // clean up calloc-ed temp variable\r
- free(refName);\r
- }\r
- \r
- return true;\r
-}\r
-\r
-bool BamReader::LoadIndex(void) {\r
-\r
- // check to see if index file exists\r
- FILE* indexFile;\r
- if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {\r
- fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");\r
- return false;\r
- }\r
-\r
- // see if index is valid BAM index\r
- char magic[4];\r
- fread(magic, 1, 4, indexFile);\r
- if (strncmp(magic, "BAI\1", 4)) {\r
- fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");\r
- fclose(indexFile);\r
- return false;\r
- }\r
-\r
- // get number of reference sequences\r
- uint32_t numRefSeqs;\r
- fread(&numRefSeqs, 4, 1, indexFile);\r
- \r
- // intialize BamIndex data structure\r
- m_index = new BamIndex;\r
- m_index->reserve(numRefSeqs);\r
-\r
- // iterate over reference sequences\r
- for (unsigned int i = 0; i < numRefSeqs; ++i) {\r
- \r
- // get number of bins for this reference sequence\r
- int32_t numBins;\r
- fread(&numBins, 4, 1, indexFile);\r
- \r
- if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }\r
-\r
- // intialize BinVector\r
- BinVector* bins = new BinVector;\r
- bins->reserve(numBins);\r
- \r
- // iterate over bins for that reference sequence\r
- for (int j = 0; j < numBins; ++j) {\r
- \r
- // get binID \r
- uint32_t binID;\r
- fread(&binID, 4, 1, indexFile);\r
- \r
- // get number of regionChunks in this bin\r
- uint32_t numChunks;\r
- fread(&numChunks, 4, 1, indexFile);\r
- \r
- // intialize ChunkVector\r
- ChunkVector* regionChunks = new ChunkVector;\r
- regionChunks->reserve(numChunks);\r
- \r
- // iterate over regionChunks in this bin\r
- for (unsigned int k = 0; k < numChunks; ++k) {\r
- \r
- // get chunk boundaries (left, right) \r
- uint64_t left;\r
- uint64_t right;\r
- fread(&left, 8, 1, indexFile);\r
- fread(&right, 8, 1, indexFile);\r
- \r
- // save ChunkPair\r
- regionChunks->push_back( ChunkPair(left, right) );\r
- }\r
- \r
- // save binID, chunkVector for this bin\r
- bins->push_back( BamBin(binID, regionChunks) );\r
- }\r
- \r
- // load linear index for this reference sequence\r
- \r
- // get number of linear offsets\r
- int32_t numLinearOffsets;\r
- fread(&numLinearOffsets, 4, 1, indexFile);\r
- \r
- // intialize LinearOffsetVector\r
- LinearOffsetVector* linearOffsets = new LinearOffsetVector;\r
- linearOffsets->reserve(numLinearOffsets);\r
- \r
- // iterate over linear offsets for this reference sequeence\r
- for (int j = 0; j < numLinearOffsets; ++j) {\r
- // get a linear offset\r
- uint64_t linearOffset;\r
- fread(&linearOffset, 8, 1, indexFile);\r
- // store linear offset\r
- linearOffsets->push_back(linearOffset);\r
- }\r
- \r
- // store index data for that reference sequence\r
- m_index->push_back( new RefIndex(bins, linearOffsets) );\r
- }\r
- \r
- // close index file (.bai) and return\r
- fclose(indexFile);\r
- return true;\r
-}\r
-\r
-bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {\r
-\r
- // check valid alignment block header data\r
- int32_t block_len;\r
- int32_t ret;\r
- uint32_t x[8];\r
-\r
- int32_t bytesRead = 0;\r
-\r
- // read in the 'block length' value, make sure it's not zero\r
- if ( (ret = bam_read(m_file, &block_len, 4)) == 0 ) { return false; }\r
- bytesRead += 4;\r
-\r
- // read in core alignment data, make the right size of data was read \r
- if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
- bytesRead += BAM_CORE_SIZE;\r
-\r
- // set BamAlignment 'core' data\r
- bAlignment.RefID = x[0]; \r
- bAlignment.Position = x[1];\r
- bAlignment.Bin = x[2]>>16; \r
- bAlignment.MapQuality = x[2]>>8&0xff; \r
- bAlignment.AlignmentFlag = x[3]>>16; \r
- bAlignment.MateRefID = x[5]; \r
- bAlignment.MatePosition = x[6]; \r
- bAlignment.InsertSize = x[7];\r
-\r
- // fetch & store often-used lengths for character data parsing\r
- unsigned int queryNameLength = x[2]&0xff;\r
- unsigned int numCigarOperations = x[3]&0xffff;\r
- unsigned int querySequenceLength = x[4];\r
- \r
- // get length of character data\r
- int dataLength = block_len - BAM_CORE_SIZE;\r
-\r
- // set up destination buffers for character data\r
- uint8_t* allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);\r
- uint32_t* cigarData = (uint32_t*)(allCharData+queryNameLength);\r
- \r
- // get character data - make sure proper data size was read\r
- if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }\r
- else {\r
-\r
- bytesRead += dataLength;\r
-\r
- // clear out bases, qualities, aligned bases, and CIGAR\r
- bAlignment.QueryBases.clear();\r
- bAlignment.Qualities.clear();\r
- bAlignment.AlignedBases.clear();\r
- bAlignment.CigarData.clear();\r
-\r
- // save name\r
- bAlignment.Name = (string)((const char*)(allCharData));\r
- \r
- // save bases\r
- char singleBase[2];\r
- uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);\r
- for (unsigned int i = 0; i < querySequenceLength; ++i) { \r
- // numeric to char conversion\r
- sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );\r
- // append character data to Bases\r
- bAlignment.QueryBases.append( (const char*)singleBase );\r
- }\r
- \r
- // save sequence length\r
- bAlignment.Length = bAlignment.QueryBases.length();\r
- \r
- // save qualities\r
- char singleQuality[4];\r
- uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);\r
- for (unsigned int i = 0; i < querySequenceLength; ++i) { \r
- // numeric to char conversion\r
- sprintf( singleQuality, "%c", ( t[i]+33 ) ); \r
- // append character data to Qualities\r
- bAlignment.Qualities.append( (const char*)singleQuality );\r
- }\r
- \r
- // save CIGAR-related data;\r
- int k = 0;\r
- for (unsigned int i = 0; i < numCigarOperations; ++i) {\r
-\r
- // build CigarOp struct\r
- CigarOp op;\r
- op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
- op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
-\r
- // save CigarOp\r
- bAlignment.CigarData.push_back(op);\r
-\r
- // can skip this step if user wants to ignore\r
- if (m_isCalculateAlignedBases) {\r
-\r
- // build AlignedBases string\r
- switch (op.Type) {\r
- \r
- case ('M') : \r
- case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases\r
- case ('S') : k += op.Length; // for 'S' - skip over query bases\r
- break;\r
- \r
- case ('D') : \r
- case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding;\r
- break;\r
- \r
- case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence\r
- k += op.Length;\r
- break;\r
-\r
- case ('H') : break; // for 'H' - do nothing, move to next op\r
- \r
- default : assert(false); // shouldn't get here\r
- break;\r
- }\r
- }\r
- } \r
- }\r
- free(allCharData);\r
-\r
-/*\r
- // (optional) read tag parsing data\r
- string tag;\r
- char data;\r
- int i = 0;\r
-\r
- // still data to read (auxiliary tags)\r
- while ( bytesRead < block_len ) {\r
-\r
- if ( bam_read(m_file, &data, 1) == 1 ) { \r
- \r
- ++bytesRead;\r
-\r
- if (bytesRead == block_len && data != '\0') {\r
- fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");\r
- exit(1);\r
- }\r
-\r
- tag.append(1, data);\r
- if ( data == '\0' ) {\r
- bAlignment.Tags.push_back(tag);\r
- tag = "";\r
- i = 0;\r
- } else {\r
- if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }\r
- ++i;\r
- }\r
- } else {\r
- fprintf(stderr, "ERROR: Could not read tag info.\n");\r
- exit(1);\r
- }\r
- }\r
-*/\r
- return true;\r
-}\r
+// BamReader.cpp
+
+// Derek Barnett
+// Marth Lab, Boston College
+// Last modified: 6 April 2009
+
+#include "BamReader.h"
+#include <iostream>
+using std::cerr;
+using std::endl;
+
+// static character constants
+const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN";
+const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
+
+BamReader::BamReader(const char* filename, const char* indexFilename)
+ : m_filename( (char*)filename )
+ , m_indexFilename( (char*)indexFilename )
+ , m_file(0)
+ , m_index(NULL)
+ , m_headerText("")
+ , m_isOpen(false)
+ , m_isIndexLoaded(false)
+ , m_isRegionSpecified(false)
+ , m_isCalculateAlignedBases(true)
+ , m_currentRefID(0)
+ , m_currentLeft(0)
+ , m_alignmentsBeginOffset(0)
+{
+ Open();
+}
+
+BamReader::~BamReader(void) {
+
+ // close file
+ if ( m_isOpen ) { Close(); }
+}
+
+// open BAM file
+bool BamReader::Open(void) {
+
+ if (!m_isOpen && m_filename != NULL ) {
+
+ // open file
+ m_file = bam_open(m_filename, "r");
+
+ // get header info && index info
+ if ( (m_file != NULL) && LoadHeader() ) {
+
+ // save file offset where alignments start
+ m_alignmentsBeginOffset = bam_tell(m_file);
+
+ // set open flag
+ m_isOpen = true;
+ }
+
+ // try to open (and load) index data, if index file given
+ if ( m_indexFilename != NULL ) {
+ OpenIndex();
+ }
+ }
+
+ return m_isOpen;
+}
+
+bool BamReader::OpenIndex(void) {
+
+ if ( m_indexFilename && !m_isIndexLoaded ) {
+ m_isIndexLoaded = LoadIndex();
+ }
+ return m_isIndexLoaded;
+}
+
+// close BAM file
+bool BamReader::Close(void) {
+
+ if (m_isOpen) {
+
+ // close file
+ int ret = bam_close(m_file);
+
+ // delete index info
+ if ( m_index != NULL) { delete m_index; }
+
+ // clear open flag
+ m_isOpen = false;
+
+ // clear index flag
+ m_isIndexLoaded = false;
+
+ // clear region flag
+ m_isRegionSpecified = false;
+
+ // return success/fail of bam_close
+ return (ret == 0);
+ }
+
+ return true;
+}
+
+// get BAM filename
+const char* BamReader::Filename(void) const {
+ return (const char*)m_filename;
+}
+
+// set BAM filename
+void BamReader::SetFilename(const char* filename) {
+ m_filename = (char*)filename;
+}
+
+// get BAM Index filename
+const char* BamReader::IndexFilename(void) const {
+ return (const char*)m_indexFilename;
+}
+
+// set BAM Index filename
+void BamReader::SetIndexFilename(const char* indexFilename) {
+ m_indexFilename = (char*)indexFilename;
+}
+
+// return full header text
+const string BamReader::GetHeaderText(void) const {
+ return m_headerText;
+}
+
+// return number of reference sequences in BAM file
+const int BamReader::GetReferenceCount(void) const {
+ return m_references.size();
+}
+
+// get RefID from reference name
+const int BamReader::GetRefID(string refName) const {
+
+ vector<string> refNames;
+ RefVector::const_iterator refIter = m_references.begin();
+ RefVector::const_iterator refEnd = m_references.end();
+ for ( ; refIter != refEnd; ++refIter) {
+ refNames.push_back( (*refIter).RefName );
+ }
+
+ // return 'index-of' refName (if not found, returns refNames.size())
+ return Index( refNames.begin(), refNames.end(), refName );
+}
+
+const RefVector BamReader::GetReferenceData(void) const {
+ return m_references;
+}
+
+bool BamReader::Jump(int refID, unsigned int left) {
+
+ // if index available, and region is valid
+ if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) {
+ m_currentRefID = refID;
+ m_currentLeft = left;
+ m_isRegionSpecified = true;
+ return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 );
+ }
+ return false;
+}
+
+bool BamReader::Rewind(void) {
+
+ int refID = 0;
+ int refCount = m_references.size();
+ for ( ; refID < refCount; ++refID ) {
+ if ( m_references.at(refID).RefHasAlignments ) { break; }
+ }
+
+ m_currentRefID = refID;
+ m_currentLeft = 0;
+ m_isRegionSpecified = false;
+
+ return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );
+}
+
+// get next alignment from specified region
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
+
+ // try to load 'next' read
+ if ( LoadNextAlignment(bAlignment) ) {
+
+ // if specified region, check for region overlap
+ if ( m_isRegionSpecified ) {
+
+ // if overlap, return true
+ if ( IsOverlap(bAlignment) ) { return true; }
+ // if not, try the next alignment
+ else { return GetNextAlignment(bAlignment); }
+ }
+
+ // not using region, valid read detected, return success
+ else { return true; }
+ }
+
+ // no valid alignment to load
+ return false;
+}
+
+void BamReader::SetCalculateAlignedBases(bool flag) {
+ m_isCalculateAlignedBases = flag;
+}
+
+int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
+
+ // get region boundaries
+ uint32_t begin = left;
+ uint32_t end = m_references.at(refID).RefLength - 1;
+
+ // initialize list, bin '0' always a valid bin
+ int i = 0;
+ list[i++] = 0;
+
+ // get rest of bins that contain this region
+ unsigned int k;
+ for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }
+ for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }
+ for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }
+ for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }
+ for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
+
+ // return number of bins stored
+ return i;
+}
+
+uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
+
+ // initialize alignment end to starting position
+ uint32_t alignEnd = position;
+
+ // iterate over cigar operations
+ vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
+ vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
+ for ( ; cigarIter != cigarEnd; ++cigarIter) {
+ if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {
+ alignEnd += (*cigarIter).Length;
+ }
+ }
+ return alignEnd;
+}
+
+uint64_t BamReader::GetOffset(int refID, unsigned int left) {
+
+ // make space for bins
+ uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
+
+ // returns number of bins overlapping (left, right)
+ // stores indices of those bins in 'bins'
+ int numBins = BinsFromRegion(refID, left, bins);
+
+ // access index data for refID
+ RefIndex* refIndex = m_index->at(refID);
+
+ // get list of BAM bins for this reference sequence
+ BinVector* refBins = refIndex->first;
+
+ sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
+
+ // declare ChunkVector
+ ChunkVector regionChunks;
+
+ // declaure LinearOffsetVector
+ LinearOffsetVector* linearOffsets = refIndex->second;
+
+ // some sort of linear offset vs bin index voodoo... not completely sure what's going here
+ uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
+
+ BinVector::iterator binBegin = refBins->begin();
+ BinVector::iterator binEnd = refBins->end();
+
+ // iterate over bins overlapping region, count chunks
+ for (int i = 0; i < numBins; ++i) {
+
+ // look for bin with ID=bin[i]
+ BinVector::iterator binIter = binBegin;
+
+ for ( ; binIter != binEnd; ++binIter ) {
+
+ // if found, increment n_off by number of chunks for each bin
+ if ( (*binIter).first == (uint32_t)bins[i] ) {
+
+ // iterate over chunks in that bin
+ ChunkVector* binChunks = (*binIter).second;
+
+ ChunkVector::iterator chunkIter = binChunks->begin();
+ ChunkVector::iterator chunkEnd = binChunks->end();
+ for ( ; chunkIter != chunkEnd; ++chunkIter) {
+
+ // if right bound of pair is greater than min_off (linear offset value), store pair
+ if ( (*chunkIter).second > minOffset) {
+ regionChunks.push_back( (*chunkIter) );
+ }
+ }
+ }
+ }
+ }
+
+ // clean up temp array
+ free(bins);
+
+ // there should be at least 1
+ assert(regionChunks.size() > 0);
+
+ // sort chunks by start position
+ sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );
+
+ // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+ int numOffsets = regionChunks.size();
+ for (int i = 1; i < numOffsets; ++i) {
+ if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {
+ regionChunks.at(i-1).second = regionChunks.at(i).first;
+ }
+ }
+
+ // merge adjacent chunks
+ int l = 0;
+ for (int i = 1; i < numOffsets; ++i) {
+ // if adjacent, expand boundaries of (merged) chunk
+ if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {
+ regionChunks.at(l).second = regionChunks.at(i).second;
+ }
+ // else, move on to next (merged) chunk index
+ else { regionChunks.at(++l) = regionChunks.at(i); }
+ }
+
+ // return beginning file offset of first chunk for region
+ return regionChunks.at(0).first;
+}
+
+bool BamReader::IsOverlap(BamAlignment& bAlignment) {
+
+ // if on different reference sequence, quit
+ if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }
+
+ // read starts after left boundary
+ if ( bAlignment.Position >= m_currentLeft) { return true; }
+
+ // get alignment end
+ uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);
+
+ // return whether alignment end overlaps left boundary
+ return ( alignEnd >= m_currentLeft );
+}
+
+bool BamReader::LoadHeader(void) {
+
+ // check to see if proper BAM header
+ char buf[4];
+ if (bam_read(m_file, buf, 4) != 4) { return false; }
+ if (strncmp(buf, "BAM\001", 4)) {
+ fprintf(stderr, "wrong header type!\n");
+ return false;
+ }
+
+ // get BAM header text length
+ int32_t headerTextLength;
+ bam_read(m_file, &headerTextLength, 4);
+
+ // get BAM header text
+ char* headerText = (char*)calloc(headerTextLength + 1, 1);
+ bam_read(m_file, headerText, headerTextLength);
+ m_headerText = (string)((const char*)headerText);
+
+ // clean up calloc-ed temp variable
+ free(headerText);
+
+ // get number of reference sequences
+ int32_t numberRefSeqs;
+ bam_read(m_file, &numberRefSeqs, 4);
+ if (numberRefSeqs == 0) { return false; }
+
+ m_references.reserve((int)numberRefSeqs);
+
+ // reference variables
+ int32_t refNameLength;
+ char* refName;
+ uint32_t refLength;
+
+ // iterate over all references in header
+ for (int i = 0; i != numberRefSeqs; ++i) {
+
+ // get length of reference name
+ bam_read(m_file, &refNameLength, 4);
+ refName = (char*)calloc(refNameLength, 1);
+
+ // get reference name and reference sequence length
+ bam_read(m_file, refName, refNameLength);
+ bam_read(m_file, &refLength, 4);
+
+ // store data for reference
+ RefData aReference;
+ aReference.RefName = (string)((const char*)refName);
+ aReference.RefLength = refLength;
+ m_references.push_back(aReference);
+
+ // clean up calloc-ed temp variable
+ free(refName);
+ }
+
+ return true;
+}
+
+bool BamReader::LoadIndex(void) {
+
+ // check to see if index file exists
+ FILE* indexFile;
+ if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {
+ fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");
+ return false;
+ }
+
+ // see if index is valid BAM index
+ char magic[4];
+ fread(magic, 1, 4, indexFile);
+ if (strncmp(magic, "BAI\1", 4)) {
+ fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");
+ fclose(indexFile);
+ return false;
+ }
+
+ // get number of reference sequences
+ uint32_t numRefSeqs;
+ fread(&numRefSeqs, 4, 1, indexFile);
+
+ // intialize BamIndex data structure
+ m_index = new BamIndex;
+ m_index->reserve(numRefSeqs);
+
+ // iterate over reference sequences
+ for (unsigned int i = 0; i < numRefSeqs; ++i) {
+
+ // get number of bins for this reference sequence
+ int32_t numBins;
+ fread(&numBins, 4, 1, indexFile);
+
+ if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
+
+ // intialize BinVector
+ BinVector* bins = new BinVector;
+ bins->reserve(numBins);
+
+ // iterate over bins for that reference sequence
+ for (int j = 0; j < numBins; ++j) {
+
+ // get binID
+ uint32_t binID;
+ fread(&binID, 4, 1, indexFile);
+
+ // get number of regionChunks in this bin
+ uint32_t numChunks;
+ fread(&numChunks, 4, 1, indexFile);
+
+ // intialize ChunkVector
+ ChunkVector* regionChunks = new ChunkVector;
+ regionChunks->reserve(numChunks);
+
+ // iterate over regionChunks in this bin
+ for (unsigned int k = 0; k < numChunks; ++k) {
+
+ // get chunk boundaries (left, right)
+ uint64_t left;
+ uint64_t right;
+ fread(&left, 8, 1, indexFile);
+ fread(&right, 8, 1, indexFile);
+
+ // save ChunkPair
+ regionChunks->push_back( ChunkPair(left, right) );
+ }
+
+ // save binID, chunkVector for this bin
+ bins->push_back( BamBin(binID, regionChunks) );
+ }
+
+ // load linear index for this reference sequence
+
+ // get number of linear offsets
+ int32_t numLinearOffsets;
+ fread(&numLinearOffsets, 4, 1, indexFile);
+
+ // intialize LinearOffsetVector
+ LinearOffsetVector* linearOffsets = new LinearOffsetVector;
+ linearOffsets->reserve(numLinearOffsets);
+
+ // iterate over linear offsets for this reference sequeence
+ for (int j = 0; j < numLinearOffsets; ++j) {
+ // get a linear offset
+ uint64_t linearOffset;
+ fread(&linearOffset, 8, 1, indexFile);
+ // store linear offset
+ linearOffsets->push_back(linearOffset);
+ }
+
+ // store index data for that reference sequence
+ m_index->push_back( new RefIndex(bins, linearOffsets) );
+ }
+
+ // close index file (.bai) and return
+ fclose(indexFile);
+ return true;
+}
+
+bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
+
+ // check valid alignment block header data
+ int32_t block_len;
+ int32_t ret;
+ uint32_t x[8];
+
+ int32_t bytesRead = 0;
+
+ // read in the 'block length' value, make sure it's not zero
+ if ( (ret = bam_read(m_file, &block_len, 4)) == 0 ) { return false; }
+ bytesRead += 4;
+
+ // read in core alignment data, make the right size of data was read
+ if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
+ bytesRead += BAM_CORE_SIZE;
+
+ // set BamAlignment 'core' data
+ bAlignment.RefID = x[0];
+ bAlignment.Position = x[1];
+ bAlignment.Bin = x[2]>>16;
+ bAlignment.MapQuality = x[2]>>8&0xff;
+ bAlignment.AlignmentFlag = x[3]>>16;
+ bAlignment.MateRefID = x[5];
+ bAlignment.MatePosition = x[6];
+ bAlignment.InsertSize = x[7];
+
+ // fetch & store often-used lengths for character data parsing
+ unsigned int queryNameLength = x[2]&0xff;
+ unsigned int numCigarOperations = x[3]&0xffff;
+ unsigned int querySequenceLength = x[4];
+
+ // get length of character data
+ int dataLength = block_len - BAM_CORE_SIZE;
+
+ // set up destination buffers for character data
+ uint8_t* allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);
+ uint32_t* cigarData = (uint32_t*)(allCharData+queryNameLength);
+
+ const unsigned int tagDataOffset = (numCigarOperations * 4) + queryNameLength + (querySequenceLength + 1) / 2 + querySequenceLength;
+ const unsigned int tagDataLen = dataLength - tagDataOffset;
+ char* tagData = ((char*)allCharData) + tagDataOffset;
+
+ // get character data - make sure proper data size was read
+ if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }
+ else {
+
+ bytesRead += dataLength;
+
+ // clear out bases, qualities, aligned bases, CIGAR, and tag data
+ bAlignment.QueryBases.clear();
+ bAlignment.Qualities.clear();
+ bAlignment.AlignedBases.clear();
+ bAlignment.CigarData.clear();
+ bAlignment.TagData.clear();
+
+ // save name
+ bAlignment.Name = (string)((const char*)(allCharData));
+
+ // save bases
+ char singleBase[2];
+ uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);
+ for (unsigned int i = 0; i < querySequenceLength; ++i) {
+ // numeric to char conversion
+ sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );
+ // append character data to Bases
+ bAlignment.QueryBases.append( (const char*)singleBase );
+ }
+
+ // save sequence length
+ bAlignment.Length = bAlignment.QueryBases.length();
+
+ // save qualities
+ char singleQuality[4];
+ uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);
+ for (unsigned int i = 0; i < querySequenceLength; ++i) {
+ // numeric to char conversion
+ sprintf( singleQuality, "%c", ( t[i]+33 ) );
+ // append character data to Qualities
+ bAlignment.Qualities.append( (const char*)singleQuality );
+ }
+
+ // save CIGAR-related data;
+ int k = 0;
+ for (unsigned int i = 0; i < numCigarOperations; ++i) {
+
+ // build CigarOp struct
+ CigarOp op;
+ op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
+ op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
+
+ // save CigarOp
+ bAlignment.CigarData.push_back(op);
+
+ // can skip this step if user wants to ignore
+ if (m_isCalculateAlignedBases) {
+
+ // build AlignedBases string
+ switch (op.Type) {
+
+ case ('M') :
+ case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases
+ case ('S') : k += op.Length; // for 'S' - skip over query bases
+ break;
+
+ case ('D') :
+ case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding;
+ break;
+
+ case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
+ k += op.Length;
+ break;
+
+ case ('H') : break; // for 'H' - do nothing, move to next op
+
+ default : assert(false); // shouldn't get here
+ break;
+ }
+ }
+ }
+
+ // read in the tag data
+ bAlignment.TagData.resize(tagDataLen);
+ memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
+ }
+ free(allCharData);
+
+/*
+ // (optional) read tag parsing data
+ string tag;
+ char data;
+ int i = 0;
+
+ // still data to read (auxiliary tags)
+ while ( bytesRead < block_len ) {
+
+ if ( bam_read(m_file, &data, 1) == 1 ) {
+
+ ++bytesRead;
+
+ if (bytesRead == block_len && data != '\0') {
+ fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");
+ exit(1);
+ }
+
+ tag.append(1, data);
+ if ( data == '\0' ) {
+ bAlignment.Tags.push_back(tag);
+ tag = "";
+ i = 0;
+ } else {
+ if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }
+ ++i;
+ }
+ } else {
+ fprintf(stderr, "ERROR: Could not read tag info.\n");
+ exit(1);
+ }
+ }
+*/
+ return true;
+}