-// BamReader.cpp
-
-// Derek Barnett
-// Marth Lab, Boston College
-// Last modified: 6 April 2009
-
-#include "BamReader.h"
-#include <iostream>
-using std::cerr;
-using std::endl;
-
-// static character constants
-const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN";
-const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
-
-BamReader::BamReader(const char* filename, const char* indexFilename)
- : m_filename( (char*)filename )
- , m_indexFilename( (char*)indexFilename )
- , m_file(0)
- , m_index(NULL)
- , m_headerText("")
- , m_isOpen(false)
- , m_isIndexLoaded(false)
- , m_isRegionSpecified(false)
- , m_isCalculateAlignedBases(true)
- , m_currentRefID(0)
- , m_currentLeft(0)
- , m_alignmentsBeginOffset(0)
-{
- Open();
-}
-
-BamReader::~BamReader(void) {
-
- // close file
- if ( m_isOpen ) { Close(); }
-}
-
-// open BAM file
-bool BamReader::Open(void) {
-
- if (!m_isOpen && m_filename != NULL ) {
-
- // open file
- m_file = bam_open(m_filename, "r");
-
- // get header info && index info
- if ( (m_file != NULL) && LoadHeader() ) {
-
- // save file offset where alignments start
- m_alignmentsBeginOffset = bam_tell(m_file);
-
- // set open flag
- m_isOpen = true;
- }
-
- // try to open (and load) index data, if index file given
- if ( m_indexFilename != NULL ) {
- OpenIndex();
- }
- }
-
- return m_isOpen;
-}
-
-bool BamReader::OpenIndex(void) {
-
- if ( m_indexFilename && !m_isIndexLoaded ) {
- m_isIndexLoaded = LoadIndex();
- }
- return m_isIndexLoaded;
-}
-
-// close BAM file
-bool BamReader::Close(void) {
-
- if (m_isOpen) {
-
- // close file
- int ret = bam_close(m_file);
-
- // delete index info
- if ( m_index != NULL) { delete m_index; }
-
- // clear open flag
- m_isOpen = false;
-
- // clear index flag
- m_isIndexLoaded = false;
-
- // clear region flag
- m_isRegionSpecified = false;
-
- // return success/fail of bam_close
- return (ret == 0);
- }
-
- return true;
-}
-
-// get BAM filename
-const char* BamReader::Filename(void) const {
- return (const char*)m_filename;
-}
-
-// set BAM filename
-void BamReader::SetFilename(const char* filename) {
- m_filename = (char*)filename;
-}
-
-// get BAM Index filename
-const char* BamReader::IndexFilename(void) const {
- return (const char*)m_indexFilename;
-}
-
-// set BAM Index filename
-void BamReader::SetIndexFilename(const char* indexFilename) {
- m_indexFilename = (char*)indexFilename;
-}
-
-// return full header text
-const string BamReader::GetHeaderText(void) const {
- return m_headerText;
-}
-
-// return number of reference sequences in BAM file
-const int BamReader::GetReferenceCount(void) const {
- return m_references.size();
-}
-
-// get RefID from reference name
-const int BamReader::GetRefID(string refName) const {
-
- vector<string> refNames;
- RefVector::const_iterator refIter = m_references.begin();
- RefVector::const_iterator refEnd = m_references.end();
- for ( ; refIter != refEnd; ++refIter) {
- refNames.push_back( (*refIter).RefName );
- }
-
- // return 'index-of' refName (if not found, returns refNames.size())
- return Index( refNames.begin(), refNames.end(), refName );
-}
-
-const RefVector BamReader::GetReferenceData(void) const {
- return m_references;
-}
-
-bool BamReader::Jump(int refID, unsigned int left) {
-
- // if index available, and region is valid
- if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) {
- m_currentRefID = refID;
- m_currentLeft = left;
- m_isRegionSpecified = true;
- return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 );
- }
- return false;
-}
-
-bool BamReader::Rewind(void) {
-
- int refID = 0;
- int refCount = m_references.size();
- for ( ; refID < refCount; ++refID ) {
- if ( m_references.at(refID).RefHasAlignments ) { break; }
- }
-
- m_currentRefID = refID;
- m_currentLeft = 0;
- m_isRegionSpecified = false;
-
- return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );
-}
-
-// get next alignment from specified region
-bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
-
- // try to load 'next' read
- if ( LoadNextAlignment(bAlignment) ) {
-
- // if specified region, check for region overlap
- if ( m_isRegionSpecified ) {
-
- // if overlap, return true
- if ( IsOverlap(bAlignment) ) { return true; }
- // if not, try the next alignment
- else { return GetNextAlignment(bAlignment); }
- }
-
- // not using region, valid read detected, return success
- else { return true; }
- }
-
- // no valid alignment to load
- return false;
-}
-
-void BamReader::SetCalculateAlignedBases(bool flag) {
- m_isCalculateAlignedBases = flag;
-}
-
-int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
-
- // get region boundaries
- uint32_t begin = left;
- uint32_t end = m_references.at(refID).RefLength - 1;
-
- // initialize list, bin '0' always a valid bin
- int i = 0;
- list[i++] = 0;
-
- // get rest of bins that contain this region
- unsigned int k;
- for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }
- for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }
- for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }
- for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }
- for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
-
- // return number of bins stored
- return i;
-}
-
-uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
-
- // initialize alignment end to starting position
- uint32_t alignEnd = position;
-
- // iterate over cigar operations
- vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
- vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
- for ( ; cigarIter != cigarEnd; ++cigarIter) {
- if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {
- alignEnd += (*cigarIter).Length;
- }
- }
- return alignEnd;
-}
-
-uint64_t BamReader::GetOffset(int refID, unsigned int left) {
-
- // make space for bins
- uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
-
- // returns number of bins overlapping (left, right)
- // stores indices of those bins in 'bins'
- int numBins = BinsFromRegion(refID, left, bins);
-
- // access index data for refID
- RefIndex* refIndex = m_index->at(refID);
-
- // get list of BAM bins for this reference sequence
- BinVector* refBins = refIndex->first;
-
- sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
-
- // declare ChunkVector
- ChunkVector regionChunks;
-
- // declaure LinearOffsetVector
- LinearOffsetVector* linearOffsets = refIndex->second;
-
- // some sort of linear offset vs bin index voodoo... not completely sure what's going here
- uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
-
- BinVector::iterator binBegin = refBins->begin();
- BinVector::iterator binEnd = refBins->end();
-
- // iterate over bins overlapping region, count chunks
- for (int i = 0; i < numBins; ++i) {
-
- // look for bin with ID=bin[i]
- BinVector::iterator binIter = binBegin;
-
- for ( ; binIter != binEnd; ++binIter ) {
-
- // if found, increment n_off by number of chunks for each bin
- if ( (*binIter).first == (uint32_t)bins[i] ) {
-
- // iterate over chunks in that bin
- ChunkVector* binChunks = (*binIter).second;
-
- ChunkVector::iterator chunkIter = binChunks->begin();
- ChunkVector::iterator chunkEnd = binChunks->end();
- for ( ; chunkIter != chunkEnd; ++chunkIter) {
-
- // if right bound of pair is greater than min_off (linear offset value), store pair
- if ( (*chunkIter).second > minOffset) {
- regionChunks.push_back( (*chunkIter) );
- }
- }
- }
- }
- }
-
- // clean up temp array
- free(bins);
-
- // there should be at least 1
- assert(regionChunks.size() > 0);
-
- // sort chunks by start position
- sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );
-
- // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
- int numOffsets = regionChunks.size();
- for (int i = 1; i < numOffsets; ++i) {
- if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {
- regionChunks.at(i-1).second = regionChunks.at(i).first;
- }
- }
-
- // merge adjacent chunks
- int l = 0;
- for (int i = 1; i < numOffsets; ++i) {
- // if adjacent, expand boundaries of (merged) chunk
- if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {
- regionChunks.at(l).second = regionChunks.at(i).second;
- }
- // else, move on to next (merged) chunk index
- else { regionChunks.at(++l) = regionChunks.at(i); }
- }
-
- // return beginning file offset of first chunk for region
- return regionChunks.at(0).first;
-}
-
-bool BamReader::IsOverlap(BamAlignment& bAlignment) {
-
- // if on different reference sequence, quit
- if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }
-
- // read starts after left boundary
- if ( bAlignment.Position >= m_currentLeft) { return true; }
-
- // get alignment end
- uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);
-
- // return whether alignment end overlaps left boundary
- return ( alignEnd >= m_currentLeft );
-}
-
-bool BamReader::LoadHeader(void) {
-
- // check to see if proper BAM header
- char buf[4];
- if (bam_read(m_file, buf, 4) != 4) { return false; }
- if (strncmp(buf, "BAM\001", 4)) {
- fprintf(stderr, "wrong header type!\n");
- return false;
- }
-
- // get BAM header text length
- int32_t headerTextLength;
- bam_read(m_file, &headerTextLength, 4);
-
- // get BAM header text
- char* headerText = (char*)calloc(headerTextLength + 1, 1);
- bam_read(m_file, headerText, headerTextLength);
- m_headerText = (string)((const char*)headerText);
-
- // clean up calloc-ed temp variable
- free(headerText);
-
- // get number of reference sequences
- int32_t numberRefSeqs;
- bam_read(m_file, &numberRefSeqs, 4);
- if (numberRefSeqs == 0) { return false; }
-
- m_references.reserve((int)numberRefSeqs);
-
- // reference variables
- int32_t refNameLength;
- char* refName;
- uint32_t refLength;
-
- // iterate over all references in header
- for (int i = 0; i != numberRefSeqs; ++i) {
-
- // get length of reference name
- bam_read(m_file, &refNameLength, 4);
- refName = (char*)calloc(refNameLength, 1);
-
- // get reference name and reference sequence length
- bam_read(m_file, refName, refNameLength);
- bam_read(m_file, &refLength, 4);
-
- // store data for reference
- RefData aReference;
- aReference.RefName = (string)((const char*)refName);
- aReference.RefLength = refLength;
- m_references.push_back(aReference);
-
- // clean up calloc-ed temp variable
- free(refName);
- }
-
- return true;
-}
-
-bool BamReader::LoadIndex(void) {
-
- // check to see if index file exists
- FILE* indexFile;
- if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {
- fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");
- return false;
- }
-
- // see if index is valid BAM index
- char magic[4];
- fread(magic, 1, 4, indexFile);
- if (strncmp(magic, "BAI\1", 4)) {
- fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");
- fclose(indexFile);
- return false;
- }
-
- // get number of reference sequences
- uint32_t numRefSeqs;
- fread(&numRefSeqs, 4, 1, indexFile);
-
- // intialize BamIndex data structure
- m_index = new BamIndex;
- m_index->reserve(numRefSeqs);
-
- // iterate over reference sequences
- for (unsigned int i = 0; i < numRefSeqs; ++i) {
-
- // get number of bins for this reference sequence
- int32_t numBins;
- fread(&numBins, 4, 1, indexFile);
-
- if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
-
- // intialize BinVector
- BinVector* bins = new BinVector;
- bins->reserve(numBins);
-
- // iterate over bins for that reference sequence
- for (int j = 0; j < numBins; ++j) {
-
- // get binID
- uint32_t binID;
- fread(&binID, 4, 1, indexFile);
-
- // get number of regionChunks in this bin
- uint32_t numChunks;
- fread(&numChunks, 4, 1, indexFile);
-
- // intialize ChunkVector
- ChunkVector* regionChunks = new ChunkVector;
- regionChunks->reserve(numChunks);
-
- // iterate over regionChunks in this bin
- for (unsigned int k = 0; k < numChunks; ++k) {
-
- // get chunk boundaries (left, right)
- uint64_t left;
- uint64_t right;
- fread(&left, 8, 1, indexFile);
- fread(&right, 8, 1, indexFile);
-
- // save ChunkPair
- regionChunks->push_back( ChunkPair(left, right) );
- }
-
- // save binID, chunkVector for this bin
- bins->push_back( BamBin(binID, regionChunks) );
- }
-
- // load linear index for this reference sequence
-
- // get number of linear offsets
- int32_t numLinearOffsets;
- fread(&numLinearOffsets, 4, 1, indexFile);
-
- // intialize LinearOffsetVector
- LinearOffsetVector* linearOffsets = new LinearOffsetVector;
- linearOffsets->reserve(numLinearOffsets);
-
- // iterate over linear offsets for this reference sequeence
- for (int j = 0; j < numLinearOffsets; ++j) {
- // get a linear offset
- uint64_t linearOffset;
- fread(&linearOffset, 8, 1, indexFile);
- // store linear offset
- linearOffsets->push_back(linearOffset);
- }
-
- // store index data for that reference sequence
- m_index->push_back( new RefIndex(bins, linearOffsets) );
- }
-
- // close index file (.bai) and return
- fclose(indexFile);
- return true;
-}
-
-bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
-
- // check valid alignment block header data
- int32_t block_len;
- int32_t ret;
- uint32_t x[8];
-
- int32_t bytesRead = 0;
-
- // read in the 'block length' value, make sure it's not zero
- if ( (ret = bam_read(m_file, &block_len, 4)) == 0 ) { return false; }
- bytesRead += 4;
-
- // read in core alignment data, make the right size of data was read
- if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
- bytesRead += BAM_CORE_SIZE;
-
- // set BamAlignment 'core' data
- bAlignment.RefID = x[0];
- bAlignment.Position = x[1];
- bAlignment.Bin = x[2]>>16;
- bAlignment.MapQuality = x[2]>>8&0xff;
- bAlignment.AlignmentFlag = x[3]>>16;
- bAlignment.MateRefID = x[5];
- bAlignment.MatePosition = x[6];
- bAlignment.InsertSize = x[7];
-
- // fetch & store often-used lengths for character data parsing
- unsigned int queryNameLength = x[2]&0xff;
- unsigned int numCigarOperations = x[3]&0xffff;
- unsigned int querySequenceLength = x[4];
-
- // get length of character data
- int dataLength = block_len - BAM_CORE_SIZE;
-
- // set up destination buffers for character data
- uint8_t* allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);
- uint32_t* cigarData = (uint32_t*)(allCharData+queryNameLength);
-
- const unsigned int tagDataOffset = (numCigarOperations * 4) + queryNameLength + (querySequenceLength + 1) / 2 + querySequenceLength;
- const unsigned int tagDataLen = dataLength - tagDataOffset;
- char* tagData = ((char*)allCharData) + tagDataOffset;
-
- // get character data - make sure proper data size was read
- if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }
- else {
-
- bytesRead += dataLength;
-
- // clear out bases, qualities, aligned bases, CIGAR, and tag data
- bAlignment.QueryBases.clear();
- bAlignment.Qualities.clear();
- bAlignment.AlignedBases.clear();
- bAlignment.CigarData.clear();
- bAlignment.TagData.clear();
-
- // save name
- bAlignment.Name = (string)((const char*)(allCharData));
-
- // save bases
- char singleBase[2];
- uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);
- for (unsigned int i = 0; i < querySequenceLength; ++i) {
- // numeric to char conversion
- sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );
- // append character data to Bases
- bAlignment.QueryBases.append( (const char*)singleBase );
- }
-
- // save sequence length
- bAlignment.Length = bAlignment.QueryBases.length();
-
- // save qualities
- char singleQuality[4];
- uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);
- for (unsigned int i = 0; i < querySequenceLength; ++i) {
- // numeric to char conversion
- sprintf( singleQuality, "%c", ( t[i]+33 ) );
- // append character data to Qualities
- bAlignment.Qualities.append( (const char*)singleQuality );
- }
-
- // save CIGAR-related data;
- int k = 0;
- for (unsigned int i = 0; i < numCigarOperations; ++i) {
-
- // build CigarOp struct
- CigarOp op;
- op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
- op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
-
- // save CigarOp
- bAlignment.CigarData.push_back(op);
-
- // can skip this step if user wants to ignore
- if (m_isCalculateAlignedBases) {
-
- // build AlignedBases string
- switch (op.Type) {
-
- case ('M') :
- case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases
- case ('S') : k += op.Length; // for 'S' - skip over query bases
- break;
-
- case ('D') :
- case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding;
- break;
-
- case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
- k += op.Length;
- break;
-
- case ('H') : break; // for 'H' - do nothing, move to next op
-
- default : assert(false); // shouldn't get here
- break;
- }
- }
- }
-
- // read in the tag data
- bAlignment.TagData.resize(tagDataLen);
- memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
- }
- free(allCharData);
-
-/*
- // (optional) read tag parsing data
- string tag;
- char data;
- int i = 0;
-
- // still data to read (auxiliary tags)
- while ( bytesRead < block_len ) {
-
- if ( bam_read(m_file, &data, 1) == 1 ) {
-
- ++bytesRead;
-
- if (bytesRead == block_len && data != '\0') {
- fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");
- exit(1);
- }
-
- tag.append(1, data);
- if ( data == '\0' ) {
- bAlignment.Tags.push_back(tag);
- tag = "";
- i = 0;
- } else {
- if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }
- ++i;
- }
- } else {
- fprintf(stderr, "ERROR: Could not read tag info.\n");
- exit(1);
- }
- }
-*/
- return true;
-}
+// ***************************************************************************\r
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 8 June 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading BAM files\r
+// ***************************************************************************\r
+\r
+// C++ includes\r
+#include <algorithm>\r
+#include <iterator>\r
+#include <string>\r
+#include <vector>\r
+\r
+// BamTools includes\r
+#include "BGZF.h"\r
+#include "BamReader.h"\r
+using namespace BamTools;\r
+using namespace std;\r
+\r
+struct BamReader::BamReaderPrivate {\r
+\r
+ // -------------------------------\r
+ // data members\r
+ // -------------------------------\r
+\r
+ // general file data\r
+ BgzfData mBGZF;\r
+ string HeaderText;\r
+ BamIndex Index;\r
+ RefVector References;\r
+ bool IsIndexLoaded;\r
+ int64_t AlignmentsBeginOffset;\r
+ string Filename;\r
+ string IndexFilename;\r
+ \r
+ // system data\r
+ bool IsBigEndian;\r
+\r
+ // user-specified region values\r
+ bool IsRegionSpecified;\r
+ int CurrentRefID;\r
+ int CurrentLeft;\r
+\r
+ // BAM character constants\r
+ const char* DNA_LOOKUP;\r
+ const char* CIGAR_LOOKUP;\r
+\r
+ // -------------------------------\r
+ // constructor & destructor\r
+ // -------------------------------\r
+ BamReaderPrivate(void);\r
+ ~BamReaderPrivate(void);\r
+\r
+ // -------------------------------\r
+ // "public" interface\r
+ // -------------------------------\r
+\r
+ // file operations\r
+ void Close(void);\r
+ bool Jump(int refID, int position = 0);\r
+ void Open(const string& filename, const string& indexFilename = "");\r
+ bool Rewind(void);\r
+\r
+ // access alignment data\r
+ bool GetNextAlignment(BamAlignment& bAlignment);\r
+ bool GetNextAlignmentCore(BamAlignment& bAlignment, BamAlignmentSupportData& supportData);\r
+\r
+ // access auxiliary data\r
+ int GetReferenceID(const string& refName) const;\r
+\r
+ // index operations\r
+ bool CreateIndex(void);\r
+\r
+ // -------------------------------\r
+ // internal methods\r
+ // -------------------------------\r
+\r
+ // *** reading alignments and auxiliary data *** //\r
+\r
+ // calculate bins that overlap region ( left to reference end for now )\r
+ int BinsFromRegion(int refID, int left, uint16_t[MAX_BIN]);\r
+ // fills out character data for BamAlignment data\r
+ bool BuildCharData(BamAlignment& bAlignment, const BamAlignmentSupportData& supportData);\r
+ // calculate file offset for first alignment chunk overlapping 'left'\r
+ int64_t GetOffset(int refID, int left);\r
+ // checks to see if alignment overlaps current region\r
+ bool IsOverlap(BamAlignment& bAlignment);\r
+ // retrieves header text from BAM file\r
+ void LoadHeaderData(void);\r
+ // retrieves BAM alignment under file pointer\r
+ bool LoadNextAlignment(BamAlignment& bAlignment, BamAlignmentSupportData& supportData);\r
+ // builds reference data structure from BAM file\r
+ void LoadReferenceData(void);\r
+\r
+ // *** index file handling *** //\r
+\r
+ // calculates index for BAM file\r
+ bool BuildIndex(void);\r
+ // clear out inernal index data structure\r
+ void ClearIndex(void);\r
+ // saves BAM bin entry for index\r
+ void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);\r
+ // saves linear offset entry for index\r
+ void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);\r
+ // loads index from BAM index file\r
+ bool LoadIndex(void);\r
+ // simplifies index by merging 'chunks'\r
+ void MergeChunks(void);\r
+ // saves index to BAM index file\r
+ bool WriteIndex(void);\r
+};\r
+\r
+// -----------------------------------------------------\r
+// BamReader implementation (wrapper around BRPrivate)\r
+// -----------------------------------------------------\r
+\r
+// constructor\r
+BamReader::BamReader(void) {\r
+ d = new BamReaderPrivate;\r
+}\r
+\r
+// destructor\r
+BamReader::~BamReader(void) {\r
+ delete d;\r
+ d = 0;\r
+}\r
+\r
+// file operations\r
+void BamReader::Close(void) { d->Close(); }\r
+bool BamReader::Jump(int refID, int position) { return d->Jump(refID, position); }\r
+void BamReader::Open(const string& filename, const string& indexFilename) { d->Open(filename, indexFilename); }\r
+bool BamReader::Rewind(void) { return d->Rewind(); }\r
+\r
+// access alignment data\r
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); }\r
+bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment, BamAlignmentSupportData& supportData) { return d->GetNextAlignmentCore(bAlignment, supportData); }\r
+\r
+// access auxiliary data\r
+const string BamReader::GetHeaderText(void) const { return d->HeaderText; }\r
+int BamReader::GetReferenceCount(void) const { return d->References.size(); }\r
+const RefVector BamReader::GetReferenceData(void) const { return d->References; }\r
+int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }\r
+const std::string BamReader::GetFilename(void) const { return d->Filename; }\r
+\r
+// index operations\r
+bool BamReader::CreateIndex(void) { return d->CreateIndex(); }\r
+\r
+// -----------------------------------------------------\r
+// BamReaderPrivate implementation\r
+// -----------------------------------------------------\r
+\r
+// constructor\r
+BamReader::BamReaderPrivate::BamReaderPrivate(void)\r
+ : IsIndexLoaded(false)\r
+ , AlignmentsBeginOffset(0)\r
+ , IsRegionSpecified(false)\r
+ , CurrentRefID(0)\r
+ , CurrentLeft(0)\r
+ , DNA_LOOKUP("=ACMGRSVTWYHKDBN")\r
+ , CIGAR_LOOKUP("MIDNSHP")\r
+{ \r
+ IsBigEndian = SystemIsBigEndian();\r
+}\r
+\r
+// destructor\r
+BamReader::BamReaderPrivate::~BamReaderPrivate(void) {\r
+ Close();\r
+}\r
+\r
+// calculate bins that overlap region ( left to reference end for now )\r
+int BamReader::BamReaderPrivate::BinsFromRegion(int refID, int left, uint16_t list[MAX_BIN]) {\r
+\r
+ // get region boundaries\r
+ uint32_t begin = (unsigned int)left;\r
+ uint32_t end = (unsigned int)References.at(refID).RefLength - 1;\r
+\r
+ // initialize list, bin '0' always a valid bin\r
+ int i = 0;\r
+ list[i++] = 0;\r
+\r
+ // get rest of bins that contain this region\r
+ unsigned int k;\r
+ for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }\r
+ for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }\r
+ for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }\r
+ for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }\r
+ for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }\r
+\r
+ // return number of bins stored\r
+ return i;\r
+}\r
+\r
+bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment, const BamAlignmentSupportData& supportData) {\r
+ \r
+ // calculate character lengths/offsets\r
+ const unsigned int dataLength = supportData.BlockLength - BAM_CORE_SIZE;\r
+ const unsigned int seqDataOffset = supportData.QueryNameLength + (supportData.NumCigarOperations * 4);\r
+ const unsigned int qualDataOffset = seqDataOffset + (supportData.QuerySequenceLength+1)/2;\r
+ const unsigned int tagDataOffset = qualDataOffset + supportData.QuerySequenceLength;\r
+ const unsigned int tagDataLength = dataLength - tagDataOffset;\r
+ \r
+ // set up char buffers\r
+ const char* allCharData = supportData.AllCharData.data();\r
+ const char* seqData = ((const char*)allCharData) + seqDataOffset;\r
+ const char* qualData = ((const char*)allCharData) + qualDataOffset;\r
+ char* tagData = ((char*)allCharData) + tagDataOffset;\r
+ \r
+ // save query sequence\r
+ bAlignment.QueryBases.clear();\r
+ bAlignment.QueryBases.reserve(supportData.QuerySequenceLength);\r
+ for (unsigned int i = 0; i < supportData.QuerySequenceLength; ++i) {\r
+ char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
+ bAlignment.QueryBases.append(1, singleBase);\r
+ }\r
+ \r
+ // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character\r
+ bAlignment.Qualities.clear();\r
+ bAlignment.Qualities.reserve(supportData.QuerySequenceLength);\r
+ for (unsigned int i = 0; i < supportData.QuerySequenceLength; ++i) {\r
+ char singleQuality = (char)(qualData[i]+33);\r
+ bAlignment.Qualities.append(1, singleQuality);\r
+ }\r
+ \r
+ // parse CIGAR to build 'AlignedBases'\r
+ bAlignment.AlignedBases.clear();\r
+ bAlignment.AlignedBases.reserve(supportData.QuerySequenceLength);\r
+ \r
+ int k = 0;\r
+ vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
+ vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end();\r
+ for ( ; cigarIter != cigarEnd; ++cigarIter ) {\r
+ \r
+ const CigarOp& op = (*cigarIter);\r
+ switch(op.Type) {\r
+ \r
+ case ('M') :\r
+ case ('I') :\r
+ bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases\r
+ // fall through\r
+ \r
+ case ('S') :\r
+ k += op.Length; // for 'S' - soft clip, skip over query bases\r
+ break;\r
+ \r
+ case ('D') :\r
+ bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character\r
+ break;\r
+ \r
+ case ('P') :\r
+ bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character\r
+ break;\r
+ \r
+ case ('N') :\r
+ bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence\r
+ // k+=op.Length; \r
+ break;\r
+ \r
+ case ('H') :\r
+ break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op\r
+ \r
+ default:\r
+ printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
+ exit(1);\r
+ }\r
+ }\r
+ \r
+ // -----------------------\r
+ // Added: 3-25-2010 DWB\r
+ // Fixed: endian-correctness for tag data\r
+ // -----------------------\r
+ if ( IsBigEndian ) {\r
+ int i = 0;\r
+ while ( (unsigned int)i < tagDataLength ) {\r
+ \r
+ i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
+ uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
+ ++i; // skip value type\r
+ \r
+ switch (type) {\r
+ \r
+ case('A') :\r
+ case('C') : \r
+ ++i;\r
+ break;\r
+\r
+ case('S') : \r
+ SwapEndian_16p(&tagData[i]); \r
+ i+=2; // sizeof(uint16_t)\r
+ break;\r
+ \r
+ case('F') :\r
+ case('I') : \r
+ SwapEndian_32p(&tagData[i]);\r
+ i+=4; // sizeof(uint32_t)\r
+ break;\r
+ \r
+ case('D') : \r
+ SwapEndian_64p(&tagData[i]);\r
+ i+=8; // sizeof(uint64_t) \r
+ break;\r
+ \r
+ case('H') :\r
+ case('Z') : \r
+ while (tagData[i]) { ++i; }\r
+ ++i; // increment one more for null terminator\r
+ break;\r
+ \r
+ default : \r
+ printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+ exit(1);\r
+ }\r
+ }\r
+ }\r
+ \r
+ // store TagData\r
+ bAlignment.TagData.clear();\r
+ bAlignment.TagData.resize(tagDataLength);\r
+ memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);\r
+ \r
+ // return success\r
+ return true;\r
+}\r
+\r
+// populates BAM index data structure from BAM file data\r
+bool BamReader::BamReaderPrivate::BuildIndex(void) {\r
+\r
+ // check to be sure file is open\r
+ if (!mBGZF.IsOpen) { return false; }\r
+\r
+ // move file pointer to beginning of alignments\r
+ Rewind();\r
+\r
+ // get reference count, reserve index space\r
+ int numReferences = References.size();\r
+ for ( int i = 0; i < numReferences; ++i ) {\r
+ Index.push_back(ReferenceIndex());\r
+ }\r
+\r
+ // sets default constant for bin, ID, offset, coordinate variables\r
+ const uint32_t defaultValue = 0xffffffffu;\r
+\r
+ // bin data\r
+ uint32_t saveBin(defaultValue);\r
+ uint32_t lastBin(defaultValue);\r
+\r
+ // reference ID data\r
+ int32_t saveRefID(defaultValue);\r
+ int32_t lastRefID(defaultValue);\r
+\r
+ // offset data\r
+ uint64_t saveOffset = mBGZF.Tell();\r
+ uint64_t lastOffset = saveOffset;\r
+\r
+ // coordinate data\r
+ int32_t lastCoordinate = defaultValue;\r
+\r
+ BamAlignment bAlignment;\r
+ while( GetNextAlignment(bAlignment) ) {\r
+\r
+ // change of chromosome, save ID, reset bin\r
+ if ( lastRefID != bAlignment.RefID ) {\r
+ lastRefID = bAlignment.RefID;\r
+ lastBin = defaultValue;\r
+ }\r
+\r
+ // if lastCoordinate greater than BAM position - file not sorted properly\r
+ else if ( lastCoordinate > bAlignment.Position ) {\r
+ printf("BAM file not properly sorted:\n");\r
+ printf("Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);\r
+ exit(1);\r
+ }\r
+\r
+ // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)\r
+ if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {\r
+\r
+ // save linear offset entry (matched to BAM entry refID)\r
+ ReferenceIndex& refIndex = Index.at(bAlignment.RefID);\r
+ LinearOffsetVector& offsets = refIndex.Offsets;\r
+ InsertLinearOffset(offsets, bAlignment, lastOffset);\r
+ }\r
+\r
+ // if current BamAlignment bin != lastBin, "then possibly write the binning index"\r
+ if ( bAlignment.Bin != lastBin ) {\r
+\r
+ // if not first time through\r
+ if ( saveBin != defaultValue ) {\r
+\r
+ // save Bam bin entry\r
+ ReferenceIndex& refIndex = Index.at(saveRefID);\r
+ BamBinMap& binMap = refIndex.Bins;\r
+ InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);\r
+ }\r
+\r
+ // update saveOffset\r
+ saveOffset = lastOffset;\r
+\r
+ // update bin values\r
+ saveBin = bAlignment.Bin;\r
+ lastBin = bAlignment.Bin;\r
+\r
+ // update saveRefID\r
+ saveRefID = bAlignment.RefID;\r
+\r
+ // if invalid RefID, break out (why?)\r
+ if ( saveRefID < 0 ) { break; }\r
+ }\r
+\r
+ // make sure that current file pointer is beyond lastOffset\r
+ if ( mBGZF.Tell() <= (int64_t)lastOffset ) {\r
+ printf("Error in BGZF offsets.\n");\r
+ exit(1);\r
+ }\r
+\r
+ // update lastOffset\r
+ lastOffset = mBGZF.Tell();\r
+\r
+ // update lastCoordinate\r
+ lastCoordinate = bAlignment.Position;\r
+ }\r
+\r
+ // save any leftover BAM data (as long as refID is valid)\r
+ if ( saveRefID >= 0 ) {\r
+ // save Bam bin entry\r
+ ReferenceIndex& refIndex = Index.at(saveRefID);\r
+ BamBinMap& binMap = refIndex.Bins;\r
+ InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);\r
+ }\r
+\r
+ // simplify index by merging chunks\r
+ MergeChunks();\r
+\r
+ // iterate over references\r
+ BamIndex::iterator indexIter = Index.begin();\r
+ BamIndex::iterator indexEnd = Index.end();\r
+ for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {\r
+\r
+ // get reference index data\r
+ ReferenceIndex& refIndex = (*indexIter);\r
+ BamBinMap& binMap = refIndex.Bins;\r
+ LinearOffsetVector& offsets = refIndex.Offsets;\r
+\r
+ // store whether reference has alignments or no\r
+ References[i].RefHasAlignments = ( binMap.size() > 0 );\r
+\r
+ // sort linear offsets\r
+ sort(offsets.begin(), offsets.end());\r
+ }\r
+\r
+\r
+ // rewind file pointer to beginning of alignments, return success/fail\r
+ return Rewind();\r
+}\r
+\r
+\r
+// clear index data structure\r
+void BamReader::BamReaderPrivate::ClearIndex(void) {\r
+ Index.clear(); // sufficient ??\r
+}\r
+\r
+// closes the BAM file\r
+void BamReader::BamReaderPrivate::Close(void) {\r
+ mBGZF.Close();\r
+ ClearIndex();\r
+ HeaderText.clear();\r
+ IsRegionSpecified = false;\r
+}\r
+\r
+// create BAM index from BAM file (keep structure in memory) and write to default index output file\r
+bool BamReader::BamReaderPrivate::CreateIndex(void) {\r
+\r
+ // clear out index\r
+ ClearIndex();\r
+\r
+ // build (& save) index from BAM file\r
+ bool ok = true;\r
+ ok &= BuildIndex();\r
+ ok &= WriteIndex();\r
+\r
+ // return success/fail\r
+ return ok;\r
+}\r
+\r
+// returns RefID for given RefName (returns References.size() if not found)\r
+int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {\r
+\r
+ // retrieve names from reference data\r
+ vector<string> refNames;\r
+ RefVector::const_iterator refIter = References.begin();\r
+ RefVector::const_iterator refEnd = References.end();\r
+ for ( ; refIter != refEnd; ++refIter) {\r
+ refNames.push_back( (*refIter).RefName );\r
+ }\r
+\r
+ // return 'index-of' refName ( if not found, returns refNames.size() )\r
+ return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));\r
+}\r
+\r
+// get next alignment (from specified region, if given)\r
+bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {\r
+\r
+ BamAlignmentSupportData supportData;\r
+ \r
+ // if valid alignment available\r
+ if ( LoadNextAlignment(bAlignment, supportData) ) {\r
+\r
+ // if region not specified, return success\r
+ if ( !IsRegionSpecified ) { \r
+ bool ok = BuildCharData(bAlignment, supportData);\r
+ return ok; \r
+ }\r
+\r
+ // load next alignment until region overlap is found\r
+ while ( !IsOverlap(bAlignment) ) {\r
+ // if no valid alignment available (likely EOF) return failure\r
+ if ( !LoadNextAlignment(bAlignment, supportData) ) return false;\r
+ }\r
+\r
+ // return success (alignment found that overlaps region)\r
+ bool ok = BuildCharData(bAlignment, supportData);\r
+ return ok;\r
+ }\r
+\r
+ // no valid alignment\r
+ else \r
+ return false;\r
+}\r
+\r
+// retrieves next available alignment core data (returns success/fail)\r
+// ** DOES NOT parse any character data (bases, qualities, tag data)\r
+// these can be accessed, if necessary, from the supportData \r
+// useful for operations requiring ONLY positional or other alignment-related information\r
+bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment, BamAlignmentSupportData& supportData) {\r
+\r
+ // if valid alignment available\r
+ if ( LoadNextAlignment(bAlignment, supportData) ) {\r
+\r
+ // if region not specified, return success\r
+ if ( !IsRegionSpecified ) return true;\r
+\r
+ // load next alignment until region overlap is found\r
+ while ( !IsOverlap(bAlignment) ) {\r
+ // if no valid alignment available (likely EOF) return failure\r
+ if ( !LoadNextAlignment(bAlignment, supportData) ) return false;\r
+ }\r
+\r
+ // return success (alignment found that overlaps region)\r
+ return true;\r
+ }\r
+\r
+ // no valid alignment\r
+ else\r
+ return false;\r
+}\r
+\r
+// calculate closest indexed file offset for region specified\r
+int64_t BamReader::BamReaderPrivate::GetOffset(int refID, int left) {\r
+\r
+ // calculate which bins overlap this region\r
+ uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);\r
+ int numBins = BinsFromRegion(refID, left, bins);\r
+\r
+ // get bins for this reference\r
+ const ReferenceIndex& refIndex = Index.at(refID);\r
+ const BamBinMap& binMap = refIndex.Bins;\r
+\r
+ // get minimum offset to consider\r
+ const LinearOffsetVector& offsets = refIndex.Offsets;\r
+ uint64_t minOffset = ( (unsigned int)(left>>BAM_LIDX_SHIFT) >= offsets.size() ) ? 0 : offsets.at(left>>BAM_LIDX_SHIFT);\r
+\r
+ // store offsets to beginning of alignment 'chunks'\r
+ std::vector<int64_t> chunkStarts;\r
+\r
+ // store all alignment 'chunk' starts for bins in this region\r
+ for (int i = 0; i < numBins; ++i ) {\r
+ uint16_t binKey = bins[i];\r
+\r
+ map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);\r
+ if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {\r
+\r
+ const ChunkVector& chunks = (*binIter).second;\r
+ std::vector<Chunk>::const_iterator chunksIter = chunks.begin();\r
+ std::vector<Chunk>::const_iterator chunksEnd = chunks.end();\r
+ for ( ; chunksIter != chunksEnd; ++chunksIter) {\r
+ const Chunk& chunk = (*chunksIter);\r
+ if ( chunk.Stop > minOffset ) {\r
+ chunkStarts.push_back( chunk.Start );\r
+ }\r
+ }\r
+ }\r
+ }\r
+\r
+ // clean up memory\r
+ free(bins);\r
+\r
+ // if no alignments found, else return smallest offset for alignment starts\r
+ if ( chunkStarts.size() == 0 ) { return -1; }\r
+ else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }\r
+}\r
+\r
+// saves BAM bin entry for index\r
+void BamReader::BamReaderPrivate::InsertBinEntry(BamBinMap& binMap,\r
+ const uint32_t& saveBin,\r
+ const uint64_t& saveOffset,\r
+ const uint64_t& lastOffset)\r
+{\r
+ // look up saveBin\r
+ BamBinMap::iterator binIter = binMap.find(saveBin);\r
+\r
+ // create new chunk\r
+ Chunk newChunk(saveOffset, lastOffset);\r
+\r
+ // if entry doesn't exist\r
+ if ( binIter == binMap.end() ) {\r
+ ChunkVector newChunks;\r
+ newChunks.push_back(newChunk);\r
+ binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));\r
+ }\r
+\r
+ // otherwise\r
+ else {\r
+ ChunkVector& binChunks = (*binIter).second;\r
+ binChunks.push_back( newChunk );\r
+ }\r
+}\r
+\r
+// saves linear offset entry for index\r
+void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets,\r
+ const BamAlignment& bAlignment,\r
+ const uint64_t& lastOffset)\r
+{\r
+ // get converted offsets\r
+ int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;\r
+ int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;\r
+\r
+ // resize vector if necessary\r
+ int oldSize = offsets.size();\r
+ int newSize = endOffset + 1;\r
+ if ( oldSize < newSize ) { offsets.resize(newSize, 0); }\r
+\r
+ // store offset\r
+ for(int i = beginOffset + 1; i <= endOffset ; ++i) {\r
+ if ( offsets[i] == 0) {\r
+ offsets[i] = lastOffset;\r
+ }\r
+ }\r
+}\r
+\r
+// returns whether alignment overlaps currently specified region (refID, leftBound)\r
+bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {\r
+\r
+ // if on different reference sequence, quit\r
+ if ( bAlignment.RefID != CurrentRefID ) { return false; }\r
+\r
+ // read starts after left boundary\r
+ if ( bAlignment.Position >= CurrentLeft) { return true; }\r
+\r
+ // return whether alignment end overlaps left boundary\r
+ return ( bAlignment.GetEndPosition() >= CurrentLeft );\r
+}\r
+\r
+// jumps to specified region(refID, leftBound) in BAM file, returns success/fail\r
+bool BamReader::BamReaderPrivate::Jump(int refID, int position) {\r
+\r
+ // if data exists for this reference and position is valid \r
+ if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {\r
+\r
+ // set current region\r
+ CurrentRefID = refID;\r
+ CurrentLeft = position;\r
+ IsRegionSpecified = true;\r
+\r
+ // calculate offset\r
+ int64_t offset = GetOffset(CurrentRefID, CurrentLeft);\r
+\r
+ // if in valid offset, return failure\r
+ if ( offset == -1 ) { return false; }\r
+\r
+ // otherwise return success of seek operation\r
+ else { return mBGZF.Seek(offset); }\r
+ }\r
+\r
+ // invalid jump request parameters, return failure\r
+ return false;\r
+}\r
+\r
+// load BAM header data\r
+void BamReader::BamReaderPrivate::LoadHeaderData(void) {\r
+\r
+ // check to see if proper BAM header\r
+ char buffer[4];\r
+ if (mBGZF.Read(buffer, 4) != 4) {\r
+ printf("Could not read header type\n");\r
+ exit(1);\r
+ }\r
+\r
+ if (strncmp(buffer, "BAM\001", 4)) {\r
+ printf("wrong header type!\n");\r
+ exit(1);\r
+ }\r
+\r
+ // get BAM header text length\r
+ mBGZF.Read(buffer, 4);\r
+ unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(headerTextLength); }\r
+ \r
+ // get BAM header text\r
+ char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
+ mBGZF.Read(headerText, headerTextLength);\r
+ HeaderText = (string)((const char*)headerText);\r
+\r
+ // clean up calloc-ed temp variable\r
+ free(headerText);\r
+}\r
+\r
+// load existing index data from BAM index file (".bai"), return success/fail\r
+bool BamReader::BamReaderPrivate::LoadIndex(void) {\r
+\r
+ // clear out index data\r
+ ClearIndex();\r
+\r
+ // skip if index file empty\r
+ if ( IndexFilename.empty() ) { return false; }\r
+\r
+ // open index file, abort on error\r
+ FILE* indexStream = fopen(IndexFilename.c_str(), "rb");\r
+ if(!indexStream) {\r
+ printf("ERROR: Unable to open the BAM index file %s for reading.\n", IndexFilename.c_str() );\r
+ return false;\r
+ }\r
+\r
+ size_t elementsRead = 0;\r
+ \r
+ // see if index is valid BAM index\r
+ char magic[4];\r
+ elementsRead = fread(magic, 1, 4, indexStream);\r
+ if (strncmp(magic, "BAI\1", 4)) {\r
+ printf("Problem with index file - invalid format.\n");\r
+ fclose(indexStream);\r
+ return false;\r
+ }\r
+\r
+ // get number of reference sequences\r
+ uint32_t numRefSeqs;\r
+ elementsRead = fread(&numRefSeqs, 4, 1, indexStream);\r
+ if ( IsBigEndian ) { SwapEndian_32(numRefSeqs); }\r
+ \r
+ // intialize space for BamIndex data structure\r
+ Index.reserve(numRefSeqs);\r
+\r
+ // iterate over reference sequences\r
+ for (unsigned int i = 0; i < numRefSeqs; ++i) {\r
+\r
+ // get number of bins for this reference sequence\r
+ int32_t numBins;\r
+ elementsRead = fread(&numBins, 4, 1, indexStream);\r
+ if ( IsBigEndian ) { SwapEndian_32(numBins); }\r
+\r
+ if (numBins > 0) {\r
+ RefData& refEntry = References[i];\r
+ refEntry.RefHasAlignments = true;\r
+ }\r
+\r
+ // intialize BinVector\r
+ BamBinMap binMap;\r
+\r
+ // iterate over bins for that reference sequence\r
+ for (int j = 0; j < numBins; ++j) {\r
+\r
+ // get binID\r
+ uint32_t binID;\r
+ elementsRead = fread(&binID, 4, 1, indexStream);\r
+\r
+ // get number of regionChunks in this bin\r
+ uint32_t numChunks;\r
+ elementsRead = fread(&numChunks, 4, 1, indexStream);\r
+\r
+ if ( IsBigEndian ) { \r
+ SwapEndian_32(binID);\r
+ SwapEndian_32(numChunks);\r
+ }\r
+ \r
+ // intialize ChunkVector\r
+ ChunkVector regionChunks;\r
+ regionChunks.reserve(numChunks);\r
+\r
+ // iterate over regionChunks in this bin\r
+ for (unsigned int k = 0; k < numChunks; ++k) {\r
+\r
+ // get chunk boundaries (left, right)\r
+ uint64_t left;\r
+ uint64_t right;\r
+ elementsRead = fread(&left, 8, 1, indexStream);\r
+ elementsRead = fread(&right, 8, 1, indexStream);\r
+\r
+ if ( IsBigEndian ) {\r
+ SwapEndian_64(left);\r
+ SwapEndian_64(right);\r
+ }\r
+ \r
+ // save ChunkPair\r
+ regionChunks.push_back( Chunk(left, right) );\r
+ }\r
+\r
+ // sort chunks for this bin\r
+ sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );\r
+\r
+ // save binID, chunkVector for this bin\r
+ binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );\r
+ }\r
+\r
+ // load linear index for this reference sequence\r
+\r
+ // get number of linear offsets\r
+ int32_t numLinearOffsets;\r
+ elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);\r
+ if ( IsBigEndian ) { SwapEndian_32(numLinearOffsets); }\r
+\r
+ // intialize LinearOffsetVector\r
+ LinearOffsetVector offsets;\r
+ offsets.reserve(numLinearOffsets);\r
+\r
+ // iterate over linear offsets for this reference sequeence\r
+ uint64_t linearOffset;\r
+ for (int j = 0; j < numLinearOffsets; ++j) {\r
+ // read a linear offset & store\r
+ elementsRead = fread(&linearOffset, 8, 1, indexStream);\r
+ if ( IsBigEndian ) { SwapEndian_64(linearOffset); }\r
+ offsets.push_back(linearOffset);\r
+ }\r
+\r
+ // sort linear offsets\r
+ sort( offsets.begin(), offsets.end() );\r
+\r
+ // store index data for that reference sequence\r
+ Index.push_back( ReferenceIndex(binMap, offsets) );\r
+ }\r
+\r
+ // close index file (.bai) and return\r
+ fclose(indexStream);\r
+ return true;\r
+}\r
+\r
+// populates BamAlignment with alignment data under file pointer, returns success/fail\r
+bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment, BamAlignmentSupportData& supportData) {\r
+\r
+ // read in the 'block length' value, make sure it's not zero\r
+ char buffer[4];\r
+ mBGZF.Read(buffer, 4);\r
+ supportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(supportData.BlockLength); }\r
+ if ( supportData.BlockLength == 0 ) { return false; }\r
+\r
+ // read in core alignment data, make sure the right size of data was read\r
+ char x[BAM_CORE_SIZE];\r
+ if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
+\r
+ if ( IsBigEndian ) {\r
+ for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { \r
+ SwapEndian_32p(&x[i]); \r
+ }\r
+ }\r
+ \r
+ // set BamAlignment 'core' and 'support' data\r
+ bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); \r
+ bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);\r
+ \r
+ unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);\r
+ bAlignment.Bin = tempValue >> 16;\r
+ bAlignment.MapQuality = tempValue >> 8 & 0xff;\r
+ supportData.QueryNameLength = tempValue & 0xff;\r
+\r
+ tempValue = BgzfData::UnpackUnsignedInt(&x[12]);\r
+ bAlignment.AlignmentFlag = tempValue >> 16;\r
+ supportData.NumCigarOperations = tempValue & 0xffff;\r
+\r
+ supportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
+ bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);\r
+ bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
+ bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);\r
+ \r
+ // store 'all char data' and cigar ops\r
+ const unsigned int dataLength = supportData.BlockLength - BAM_CORE_SIZE;\r
+ const unsigned int cigarDataOffset = supportData.QueryNameLength;\r
+ \r
+ char* allCharData = (char*)calloc(sizeof(char), dataLength);\r
+ uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
+ \r
+ // read in character data - make sure proper data size was read\r
+ if ( mBGZF.Read(allCharData, dataLength) != (signed int)dataLength) { return false; }\r
+ else {\r
+ \r
+ // store alignment name and length\r
+ bAlignment.Name.assign((const char*)(allCharData));\r
+ bAlignment.Length = supportData.QuerySequenceLength;\r
+ \r
+ // store remaining 'allCharData' in supportData structure\r
+ supportData.AllCharData.assign((const char*)allCharData, dataLength);\r
+ \r
+ // save CigarOps for BamAlignment\r
+ bAlignment.CigarData.clear();\r
+ for (unsigned int i = 0; i < supportData.NumCigarOperations; ++i) {\r
+\r
+ // swap if necessary\r
+ if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
+ \r
+ // build CigarOp structure\r
+ CigarOp op;\r
+ op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
+ op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
+\r
+ // save CigarOp\r
+ bAlignment.CigarData.push_back(op);\r
+ }\r
+ }\r
+\r
+ free(allCharData);\r
+ return true;\r
+}\r
+\r
+// loads reference data from BAM file\r
+void BamReader::BamReaderPrivate::LoadReferenceData(void) {\r
+\r
+ // get number of reference sequences\r
+ char buffer[4];\r
+ mBGZF.Read(buffer, 4);\r
+ unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); }\r
+ if (numberRefSeqs == 0) { return; }\r
+ References.reserve((int)numberRefSeqs);\r
+\r
+ // iterate over all references in header\r
+ for (unsigned int i = 0; i != numberRefSeqs; ++i) {\r
+\r
+ // get length of reference name\r
+ mBGZF.Read(buffer, 4);\r
+ unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(refNameLength); }\r
+ char* refName = (char*)calloc(refNameLength, 1);\r
+\r
+ // get reference name and reference sequence length\r
+ mBGZF.Read(refName, refNameLength);\r
+ mBGZF.Read(buffer, 4);\r
+ int refLength = BgzfData::UnpackSignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(refLength); }\r
+\r
+ // store data for reference\r
+ RefData aReference;\r
+ aReference.RefName = (string)((const char*)refName);\r
+ aReference.RefLength = refLength;\r
+ References.push_back(aReference);\r
+\r
+ // clean up calloc-ed temp variable\r
+ free(refName);\r
+ }\r
+}\r
+\r
+// merges 'alignment chunks' in BAM bin (used for index building)\r
+void BamReader::BamReaderPrivate::MergeChunks(void) {\r
+\r
+ // iterate over reference enties\r
+ BamIndex::iterator indexIter = Index.begin();\r
+ BamIndex::iterator indexEnd = Index.end();\r
+ for ( ; indexIter != indexEnd; ++indexIter ) {\r
+\r
+ // get BAM bin map for this reference\r
+ ReferenceIndex& refIndex = (*indexIter);\r
+ BamBinMap& bamBinMap = refIndex.Bins;\r
+\r
+ // iterate over BAM bins\r
+ BamBinMap::iterator binIter = bamBinMap.begin();\r
+ BamBinMap::iterator binEnd = bamBinMap.end();\r
+ for ( ; binIter != binEnd; ++binIter ) {\r
+\r
+ // get chunk vector for this bin\r
+ ChunkVector& binChunks = (*binIter).second;\r
+ if ( binChunks.size() == 0 ) { continue; }\r
+\r
+ ChunkVector mergedChunks;\r
+ mergedChunks.push_back( binChunks[0] );\r
+\r
+ // iterate over chunks\r
+ int i = 0;\r
+ ChunkVector::iterator chunkIter = binChunks.begin();\r
+ ChunkVector::iterator chunkEnd = binChunks.end();\r
+ for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {\r
+\r
+ // get 'currentChunk' based on numeric index\r
+ Chunk& currentChunk = mergedChunks[i];\r
+\r
+ // get iteratorChunk based on vector iterator\r
+ Chunk& iteratorChunk = (*chunkIter);\r
+\r
+ // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)\r
+ if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {\r
+\r
+ // set currentChunk.Stop to iteratorChunk.Stop\r
+ currentChunk.Stop = iteratorChunk.Stop;\r
+ }\r
+\r
+ // otherwise\r
+ else {\r
+ // set currentChunk + 1 to iteratorChunk\r
+ mergedChunks.push_back(iteratorChunk);\r
+ ++i;\r
+ }\r
+ }\r
+\r
+ // saved merged chunk vector\r
+ (*binIter).second = mergedChunks;\r
+ }\r
+ }\r
+}\r
+\r
+// opens BAM file (and index)\r
+void BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) {\r
+\r
+ Filename = filename;\r
+ IndexFilename = indexFilename;\r
+\r
+ // open the BGZF file for reading, retrieve header text & reference data\r
+ mBGZF.Open(filename, "rb");\r
+ LoadHeaderData();\r
+ LoadReferenceData();\r
+\r
+ // store file offset of first alignment\r
+ AlignmentsBeginOffset = mBGZF.Tell();\r
+\r
+ // open index file & load index data (if exists)\r
+ if ( !IndexFilename.empty() ) {\r
+ LoadIndex();\r
+ }\r
+}\r
+\r
+// returns BAM file pointer to beginning of alignment data\r
+bool BamReader::BamReaderPrivate::Rewind(void) {\r
+\r
+ // find first reference that has alignments in the BAM file\r
+ int refID = 0;\r
+ int refCount = References.size();\r
+ for ( ; refID < refCount; ++refID ) {\r
+ if ( References.at(refID).RefHasAlignments ) { break; }\r
+ }\r
+\r
+ // store default bounds for first alignment\r
+ CurrentRefID = refID;\r
+ CurrentLeft = 0;\r
+ IsRegionSpecified = false;\r
+\r
+ // return success/failure of seek\r
+ return mBGZF.Seek(AlignmentsBeginOffset);\r
+}\r
+\r
+// saves index data to BAM index file (".bai"), returns success/fail\r
+bool BamReader::BamReaderPrivate::WriteIndex(void) {\r
+\r
+ IndexFilename = Filename + ".bai";\r
+ FILE* indexStream = fopen(IndexFilename.c_str(), "wb");\r
+ if ( indexStream == 0 ) {\r
+ printf("ERROR: Could not open file to save index\n");\r
+ return false;\r
+ }\r
+\r
+ // write BAM index header\r
+ fwrite("BAI\1", 1, 4, indexStream);\r
+\r
+ // write number of reference sequences\r
+ int32_t numReferenceSeqs = Index.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(numReferenceSeqs); }\r
+ fwrite(&numReferenceSeqs, 4, 1, indexStream);\r
+\r
+ // iterate over reference sequences\r
+ BamIndex::const_iterator indexIter = Index.begin();\r
+ BamIndex::const_iterator indexEnd = Index.end();\r
+ for ( ; indexIter != indexEnd; ++ indexIter ) {\r
+\r
+ // get reference index data\r
+ const ReferenceIndex& refIndex = (*indexIter);\r
+ const BamBinMap& binMap = refIndex.Bins;\r
+ const LinearOffsetVector& offsets = refIndex.Offsets;\r
+\r
+ // write number of bins\r
+ int32_t binCount = binMap.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(binCount); }\r
+ fwrite(&binCount, 4, 1, indexStream);\r
+\r
+ // iterate over bins\r
+ BamBinMap::const_iterator binIter = binMap.begin();\r
+ BamBinMap::const_iterator binEnd = binMap.end();\r
+ for ( ; binIter != binEnd; ++binIter ) {\r
+\r
+ // get bin data (key and chunk vector)\r
+ uint32_t binKey = (*binIter).first;\r
+ const ChunkVector& binChunks = (*binIter).second;\r
+\r
+ // save BAM bin key\r
+ if ( IsBigEndian ) { SwapEndian_32(binKey); }\r
+ fwrite(&binKey, 4, 1, indexStream);\r
+\r
+ // save chunk count\r
+ int32_t chunkCount = binChunks.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(chunkCount); }\r
+ fwrite(&chunkCount, 4, 1, indexStream);\r
+\r
+ // iterate over chunks\r
+ ChunkVector::const_iterator chunkIter = binChunks.begin();\r
+ ChunkVector::const_iterator chunkEnd = binChunks.end();\r
+ for ( ; chunkIter != chunkEnd; ++chunkIter ) {\r
+\r
+ // get current chunk data\r
+ const Chunk& chunk = (*chunkIter);\r
+ uint64_t start = chunk.Start;\r
+ uint64_t stop = chunk.Stop;\r
+\r
+ if ( IsBigEndian ) {\r
+ SwapEndian_64(start);\r
+ SwapEndian_64(stop);\r
+ }\r
+ \r
+ // save chunk offsets\r
+ fwrite(&start, 8, 1, indexStream);\r
+ fwrite(&stop, 8, 1, indexStream);\r
+ }\r
+ }\r
+\r
+ // write linear offsets size\r
+ int32_t offsetSize = offsets.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(offsetSize); }\r
+ fwrite(&offsetSize, 4, 1, indexStream);\r
+\r
+ // iterate over linear offsets\r
+ LinearOffsetVector::const_iterator offsetIter = offsets.begin();\r
+ LinearOffsetVector::const_iterator offsetEnd = offsets.end();\r
+ for ( ; offsetIter != offsetEnd; ++offsetIter ) {\r
+\r
+ // write linear offset value\r
+ uint64_t linearOffset = (*offsetIter);\r
+ if ( IsBigEndian ) { SwapEndian_64(linearOffset); }\r
+ fwrite(&linearOffset, 8, 1, indexStream);\r
+ }\r
+ }\r
+\r
+ // flush buffer, close file, and return success\r
+ fflush(indexStream);\r
+ fclose(indexStream);\r
+ return true;\r
+}\r