// open our BAM reader
BamReader reader;
- reader.SetFilename(inputFilename);
-
- if(!reader.Open()) {
- cout << "ERROR: Unable to open the BAM file (" << inputFilename << ")." << endl;
- exit(1);
- }
+ reader.Open(inputFilename);
// retrieve the SAM header text
string samHeader = reader.GetHeaderText();
writer.Close();
return 0;
-}
\ No newline at end of file
+}
-// BamReader.cpp
-
-// Derek Barnett
-// Marth Lab, Boston College
-// Last modified: 23 April 2009
+// ***************************************************************************
+// BamReader (c) 2009 Derek Barnett
+// Marth Lab, Deptartment of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
#include "BamReader.h"
+//#include "STLUtilities.h"
+
+#include <cstring>
+
#include <iostream>
using std::cerr;
using std::endl;
const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN";
const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
-BamReader::BamReader(const char* filename, const char* indexFilename)
- : m_filename( (char*)filename )
- , m_indexFilename( (char*)indexFilename )
- , m_file(0)
- , m_index(NULL)
- , m_headerText("")
- , m_isOpen(false)
+// constructor
+BamReader::BamReader(void)
+ : m_index(NULL)
, m_isIndexLoaded(false)
+ , m_alignmentsBeginOffset(0)
, m_isRegionSpecified(false)
- , m_isCalculateAlignedBases(true)
, m_currentRefID(0)
, m_currentLeft(0)
- , m_alignmentsBeginOffset(0)
-{
- Open();
-}
+{ }
+// destructor
BamReader::~BamReader(void) {
-
- // close file
- if ( m_isOpen ) { Close(); }
+ m_headerText.clear();
+ ClearIndex();
+ Close();
}
-// open BAM file
-bool BamReader::Open(void) {
-
- if (!m_isOpen && m_filename != NULL ) {
-
- // open file
- m_file = bam_open(m_filename, "r");
-
- // get header info && index info
- if ( (m_file != NULL) && LoadHeader() ) {
-
- // save file offset where alignments start
- m_alignmentsBeginOffset = bam_tell(m_file);
-
- // set open flag
- m_isOpen = true;
- }
-
- // try to open (and load) index data, if index file given
- if ( m_indexFilename != NULL ) {
- OpenIndex();
- }
- }
-
- return m_isOpen;
+// checks BGZF block header
+bool BamReader::BgzfCheckBlockHeader(char* header) {
+
+ return (header[0] == GZIP_ID1 &&
+ header[1] == (char)GZIP_ID2 &&
+ header[2] == Z_DEFLATED &&
+ (header[3] & FLG_FEXTRA) != 0 &&
+ BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
+ header[12] == BGZF_ID1 &&
+ header[13] == BGZF_ID2 &&
+ BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN);
}
-bool BamReader::OpenIndex(void) {
-
- if ( m_indexFilename && !m_isIndexLoaded ) {
- m_isIndexLoaded = LoadIndex();
- }
- return m_isIndexLoaded;
+// closes the BAM file
+void BamReader::BgzfClose(void) {
+ m_BGZF.IsOpen = false;
+ fclose(m_BGZF.Stream);
}
-// close BAM file
-bool BamReader::Close(void) {
+// de-compresses the current block
+int BamReader::BgzfInflateBlock(int blockLength) {
- if (m_isOpen) {
+ // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
+ z_stream zs;
+ zs.zalloc = NULL;
+ zs.zfree = NULL;
+ zs.next_in = (Bytef*)m_BGZF.CompressedBlock + 18;
+ zs.avail_in = blockLength - 16;
+ zs.next_out = (Bytef*)m_BGZF.UncompressedBlock;
+ zs.avail_out = m_BGZF.UncompressedBlockSize;
+
+ int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+ if (status != Z_OK) {
+ printf("inflateInit failed\n");
+ exit(1);
+ }
- // close file
- int ret = bam_close(m_file);
-
- // delete index info
- if ( m_index != NULL) { delete m_index; }
+ status = inflate(&zs, Z_FINISH);
+ if (status != Z_STREAM_END) {
+ inflateEnd(&zs);
+ printf("inflate failed\n");
+ exit(1);
+ }
- // clear open flag
- m_isOpen = false;
+ status = inflateEnd(&zs);
+ if (status != Z_OK) {
+ printf("inflateEnd failed\n");
+ exit(1);
+ }
- // clear index flag
- m_isIndexLoaded = false;
+ return zs.total_out;
+}
- // clear region flag
- m_isRegionSpecified = false;
+// opens the BAM file for reading
+void BamReader::BgzfOpen(const string& filename) {
- // return success/fail of bam_close
- return (ret == 0);
- }
+ m_BGZF.Stream = fopen(filename.c_str(), "rb");
+ if(!m_BGZF.Stream) {
+ cerr << "ERROR: Unable to open the BAM file " << filename << "for reading." << endl;
+ exit(1);
+ }
- return true;
+ m_BGZF.IsOpen = true;
}
-// get BAM filename
-const char* BamReader::Filename(void) const {
- return (const char*)m_filename;
-}
+// reads BGZF data into buffer
+unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) {
-// set BAM filename
-void BamReader::SetFilename(const char* filename) {
- m_filename = (char*)filename;
-}
+ if (dataLength == 0) { return 0; }
-// get BAM Index filename
-const char* BamReader::IndexFilename(void) const {
- return (const char*)m_indexFilename;
-}
+ char* output = data;
+ unsigned int numBytesRead = 0;
+ while (numBytesRead < dataLength) {
-// set BAM Index filename
-void BamReader::SetIndexFilename(const char* indexFilename) {
- m_indexFilename = (char*)indexFilename;
-}
+ int bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
+ if (bytesAvailable <= 0) {
+ if ( BgzfReadBlock() != 0 ) { return -1; }
+ bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
+ if ( bytesAvailable <= 0 ) { break; }
+ }
-// return full header text
-const string BamReader::GetHeaderText(void) const {
- return m_headerText;
-}
+ char* buffer = m_BGZF.UncompressedBlock;
+ int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
+ memcpy(output, buffer + m_BGZF.BlockOffset, copyLength);
-// return number of reference sequences in BAM file
-const int BamReader::GetReferenceCount(void) const {
- return m_references.size();
-}
+ m_BGZF.BlockOffset += copyLength;
+ output += copyLength;
+ numBytesRead += copyLength;
+ }
-// get RefID from reference name
-const int BamReader::GetRefID(string refName) const {
-
- vector<string> refNames;
- RefVector::const_iterator refIter = m_references.begin();
- RefVector::const_iterator refEnd = m_references.end();
- for ( ; refIter != refEnd; ++refIter) {
- refNames.push_back( (*refIter).RefName );
+ if ( m_BGZF.BlockOffset == m_BGZF.BlockLength ) {
+ m_BGZF.BlockAddress = ftello64(m_BGZF.Stream);
+ m_BGZF.BlockOffset = 0;
+ m_BGZF.BlockLength = 0;
}
- // return 'index-of' refName (if not found, returns refNames.size())
- return Index( refNames.begin(), refNames.end(), refName );
+ return numBytesRead;
}
-const RefVector BamReader::GetReferenceData(void) const {
- return m_references;
-}
+int BamReader::BgzfReadBlock(void) {
-bool BamReader::Jump(int refID, unsigned int left) {
+ char header[BLOCK_HEADER_LENGTH];
+ int64_t blockAddress = ftello(m_BGZF.Stream);
- // if index available, and region is valid
- if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) {
- m_currentRefID = refID;
- m_currentLeft = left;
- m_isRegionSpecified = true;
-
- int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
- if ( offset == -1 ) { return false; }
- else { return ( bam_seek(m_file, offset, SEEK_SET) == 0 ); }
- }
- return false;
-}
+ int count = fread(header, 1, sizeof(header), m_BGZF.Stream);
+ if (count == 0) {
+ m_BGZF.BlockLength = 0;
+ return 0;
+ }
-bool BamReader::Rewind(void) {
+ if (count != sizeof(header)) {
+ printf("read block failed - count != sizeof(header)\n");
+ return -1;
+ }
- int refID = 0;
- int refCount = m_references.size();
- for ( ; refID < refCount; ++refID ) {
- if ( m_references.at(refID).RefHasAlignments ) { break; }
- }
+ if (!BgzfCheckBlockHeader(header)) {
+ printf("read block failed - CheckBgzfBlockHeader() returned false\n");
+ return -1;
+ }
- m_currentRefID = refID;
- m_currentLeft = 0;
- m_isRegionSpecified = false;
+ int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1;
+ char* compressedBlock = m_BGZF.CompressedBlock;
+ memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
+ int remaining = blockLength - BLOCK_HEADER_LENGTH;
- return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );
-}
+ count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF.Stream);
+ if (count != remaining) {
+ printf("read block failed - count != remaining\n");
+ return -1;
+ }
-// get next alignment from specified region
-bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
+ count = BgzfInflateBlock(blockLength);
+ if (count < 0) { return -1; }
- // try to load 'next' read
- if ( LoadNextAlignment(bAlignment) ) {
+ if (m_BGZF.BlockLength != 0) {
+ m_BGZF.BlockOffset = 0;
+ }
- // if specified region, check for region overlap
- if ( m_isRegionSpecified ) {
+ m_BGZF.BlockAddress = blockAddress;
+ m_BGZF.BlockLength = count;
+ return 0;
+}
- // if overlap, return true
- if ( IsOverlap(bAlignment) ) { return true; }
- // if not, try the next alignment
- else { return GetNextAlignment(bAlignment); }
- }
+// move file pointer to specified offset
+bool BamReader::BgzfSeek(int64_t position) {
- // not using region, valid read detected, return success
- else { return true; }
- }
+ int blockOffset = (position & 0xFFFF);
+ int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
+ if (fseeko(m_BGZF.Stream, blockAddress, SEEK_SET) != 0) {
+ printf("ERROR: Unable to seek in BAM file\n");
+ exit(1);
+ }
- // no valid alignment to load
- return false;
+ m_BGZF.BlockLength = 0;
+ m_BGZF.BlockAddress = blockAddress;
+ m_BGZF.BlockOffset = blockOffset;
+ return true;
}
-void BamReader::SetCalculateAlignedBases(bool flag) {
- m_isCalculateAlignedBases = flag;
+// get file position in BAM file
+int64_t BamReader::BgzfTell(void) {
+ return ( (m_BGZF.BlockAddress << 16) | (m_BGZF.BlockOffset & 0xFFFF) );
}
int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
return i;
}
-uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
+unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
// initialize alignment end to starting position
- uint32_t alignEnd = position;
+ unsigned int alignEnd = position;
// iterate over cigar operations
vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
for ( ; cigarIter != cigarEnd; ++cigarIter) {
- if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {
+ char cigarType = (*cigarIter).Type;
+ if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
alignEnd += (*cigarIter).Length;
}
}
return alignEnd;
}
-int64_t BamReader::GetOffset(int refID, unsigned int left) {
+void BamReader::ClearIndex(void) {
+
+ if ( m_index ) {
+ // iterate over references
+ vector<RefIndex*>::iterator refIter = m_index->begin();
+ vector<RefIndex*>::iterator refEnd = m_index->end();
+ for ( ; refIter != refEnd; ++refIter) {
+ RefIndex* aRef = (*refIter);
+ if ( aRef ) {
+ // clear out BAM bins
+ if ( aRef->first ) {
+ BinVector::iterator binIter = (aRef->first)->begin();
+ BinVector::iterator binEnd = (aRef->first)->end();
+ for ( ; binIter != binEnd; ++binIter ) {
+ ChunkVector* chunks = (*binIter).second;
+ if ( chunks ) { delete chunks; }
+ }
+ delete aRef->first;
+ }
+ // clear BAM linear offsets
+ if ( aRef->second ) { delete aRef->second; }
+ delete aRef;
+ }
+ }
+ delete m_index;
+ }
+}
- // make space for bins
- uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
-
- // returns number of bins overlapping (left, right)
- // stores indices of those bins in 'bins'
- int numBins = BinsFromRegion(refID, left, bins);
+// closes the BAM file
+void BamReader::Close(void) {
+ if(m_BGZF.IsOpen) { BgzfClose(); }
+ if(m_index) { delete m_index; m_index = NULL; }
+ m_isRegionSpecified = false;
+}
- // access index data for refID
- RefIndex* refIndex = m_index->at(refID);
+const string BamReader::GetHeaderText(void) const {
+ return m_headerText;
+}
- // get list of BAM bins for this reference sequence
- BinVector* refBins = refIndex->first;
+const int BamReader::GetReferenceCount(void) const {
+ return m_references.size();
+}
- sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
+const RefVector BamReader::GetReferenceData(void) const {
+ return m_references;
+}
- // declare ChunkVector
- ChunkVector regionChunks;
+const int BamReader::GetReferenceID(const string& refName) const {
- // declaure LinearOffsetVector
- LinearOffsetVector* linearOffsets = refIndex->second;
+ // retrieve names from reference data
+ vector<string> refNames;
+ RefVector::const_iterator refIter = m_references.begin();
+ RefVector::const_iterator refEnd = m_references.end();
+ for ( ; refIter != refEnd; ++refIter) {
+ refNames.push_back( (*refIter).RefName );
+ }
- // some sort of linear offset vs bin index voodoo... not completely sure what's going here
- uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
+ // return 'index-of' refName ( if not found, returns refNames.size() )
+ return Index( refNames.begin(), refNames.end(), refName );
+}
- BinVector::iterator binBegin = refBins->begin();
- BinVector::iterator binEnd = refBins->end();
+// get next alignment (from specified region, if given)
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
- // iterate over bins overlapping region, count chunks
- for (int i = 0; i < numBins; ++i) {
+ // if valid alignment available
+ if ( LoadNextAlignment(bAlignment) ) {
- // look for bin with ID=bin[i]
- BinVector::iterator binIter = binBegin;
-
- for ( ; binIter != binEnd; ++binIter ) {
+ // if region not specified, return success
+ if ( !m_isRegionSpecified ) { return true; }
- // if found, increment n_off by number of chunks for each bin
- if ( (*binIter).first == (uint32_t)bins[i] ) {
-
- // iterate over chunks in that bin
- ChunkVector* binChunks = (*binIter).second;
-
- ChunkVector::iterator chunkIter = binChunks->begin();
- ChunkVector::iterator chunkEnd = binChunks->end();
- for ( ; chunkIter != chunkEnd; ++chunkIter) {
-
- // if right bound of pair is greater than min_off (linear offset value), store pair
- if ( (*chunkIter).second > minOffset) {
- regionChunks.push_back( (*chunkIter) );
- }
- }
- }
+ // load next alignment until region overlap is found
+ while ( !IsOverlap(bAlignment) ) {
+ // if no valid alignment available (likely EOF) return failure
+ if ( !LoadNextAlignment(bAlignment) ) { return false; }
}
- }
+
+ // return success (alignment found that overlaps region)
+ return true;
+ }
+
+ // no valid alignment
+ else { return false; }
+}
- // clean up temp array
- free(bins);
+int64_t BamReader::GetOffset(int refID, unsigned int left) {
- // there should be at least 1
- if(regionChunks.size() == 0) { return -1; }
+ // calculate which bins overlap this region
+ uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
+ int numBins = BinsFromRegion(refID, left, bins);
- // sort chunks by start position
- sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );
+ // get bins for this reference
+ RefIndex* refIndex = m_index->at(refID);
+ BinVector* refBins = refIndex->first;
- // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
- int numOffsets = regionChunks.size();
- for (int i = 1; i < numOffsets; ++i) {
- if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {
- regionChunks.at(i-1).second = regionChunks.at(i).first;
- }
- }
+ // get minimum offset to consider
+ LinearOffsetVector* linearOffsets = refIndex->second;
+ uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
+
+ // store offsets to beginning of alignment 'chunks'
+ std::vector<int64_t> chunkStarts;
- // merge adjacent chunks
- int l = 0;
- for (int i = 1; i < numOffsets; ++i) {
- // if adjacent, expand boundaries of (merged) chunk
- if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {
- regionChunks.at(l).second = regionChunks.at(i).second;
+ // reference bin iterators
+ BinVector::const_iterator binIter;
+ BinVector::const_iterator binBegin = refBins->begin();
+ BinVector::const_iterator binEnd = refBins->end();
+
+ // store all alignment 'chunk' starts for bins in this region
+ for (int i = 0; i < numBins; ++i ) {
+ binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare<uint32_t, ChunkVector*>() );
+ if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) {
+ ChunkVector* binChunks = (*binIter).second;
+ ChunkVector::const_iterator chunkIter = binChunks->begin();
+ ChunkVector::const_iterator chunkEnd = binChunks->end();
+ for ( ; chunkIter != chunkEnd; ++chunkIter) {
+ if ( (*chunkIter).second > minOffset ) {
+ chunkStarts.push_back( (*chunkIter).first );
+ }
+ }
}
- // else, move on to next (merged) chunk index
- else { regionChunks.at(++l) = regionChunks.at(i); }
}
-
- // return beginning file offset of first chunk for region
- return (int64_t)regionChunks.at(0).first;
+
+ // clean up memory
+ free(bins);
+
+ // if no alignments found
+ if ( chunkStarts.empty() ) { return -1; }
+
+ // else return smallest offset for alignment starts
+ else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
}
bool BamReader::IsOverlap(BamAlignment& bAlignment) {
// read starts after left boundary
if ( bAlignment.Position >= m_currentLeft) { return true; }
- // get alignment end
- uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);
-
// return whether alignment end overlaps left boundary
- return ( alignEnd >= m_currentLeft );
+ return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
}
-bool BamReader::LoadHeader(void) {
+bool BamReader::Jump(int refID, unsigned int left) {
+
+ // if index available, and region is valid
+ if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) {
+ m_currentRefID = refID;
+ m_currentLeft = left;
+ m_isRegionSpecified = true;
+
+ int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
+ if ( offset == -1 ) { return false; }
+ else { return BgzfSeek(offset); }
+ }
+ return false;
+}
+void BamReader::LoadHeaderData(void) {
+
// check to see if proper BAM header
- char buf[4];
- if (bam_read(m_file, buf, 4) != 4) { return false; }
- if (strncmp(buf, "BAM\001", 4)) {
- fprintf(stderr, "wrong header type!\n");
- return false;
+ char buffer[4];
+ if (BgzfRead(buffer, 4) != 4) {
+ printf("Could not read header type\n");
+ exit(1);
+ }
+ if (strncmp(buffer, "BAM\001", 4)) {
+ printf("wrong header type!\n");
+ exit(1);
}
// get BAM header text length
- int32_t headerTextLength;
- bam_read(m_file, &headerTextLength, 4);
+ BgzfRead(buffer, 4);
+ const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer);
// get BAM header text
char* headerText = (char*)calloc(headerTextLength + 1, 1);
- bam_read(m_file, headerText, headerTextLength);
+ BgzfRead(headerText, headerTextLength);
m_headerText = (string)((const char*)headerText);
// clean up calloc-ed temp variable
free(headerText);
-
- // get number of reference sequences
- int32_t numberRefSeqs;
- bam_read(m_file, &numberRefSeqs, 4);
- if (numberRefSeqs == 0) { return false; }
-
- m_references.reserve((int)numberRefSeqs);
-
- // reference variables
- int32_t refNameLength;
- char* refName;
- uint32_t refLength;
-
- // iterate over all references in header
- for (int i = 0; i != numberRefSeqs; ++i) {
-
- // get length of reference name
- bam_read(m_file, &refNameLength, 4);
- refName = (char*)calloc(refNameLength, 1);
-
- // get reference name and reference sequence length
- bam_read(m_file, refName, refNameLength);
- bam_read(m_file, &refLength, 4);
-
- // store data for reference
- RefData aReference;
- aReference.RefName = (string)((const char*)refName);
- aReference.RefLength = refLength;
- m_references.push_back(aReference);
-
- // clean up calloc-ed temp variable
- free(refName);
- }
-
- return true;
}
-bool BamReader::LoadIndex(void) {
-
- // check to see if index file exists
- FILE* indexFile;
- if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {
- fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");
- return false;
- }
+void BamReader::LoadIndexData(FILE* indexStream) {
// see if index is valid BAM index
char magic[4];
- fread(magic, 1, 4, indexFile);
+ fread(magic, 1, 4, indexStream);
if (strncmp(magic, "BAI\1", 4)) {
- fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");
- fclose(indexFile);
- return false;
+ printf("Problem with index file - invalid format.\n");
+ fclose(indexStream);
+ exit(1);
}
// get number of reference sequences
uint32_t numRefSeqs;
- fread(&numRefSeqs, 4, 1, indexFile);
+ fread(&numRefSeqs, 4, 1, indexStream);
// intialize BamIndex data structure
m_index = new BamIndex;
// get number of bins for this reference sequence
int32_t numBins;
- fread(&numBins, 4, 1, indexFile);
+ fread(&numBins, 4, 1, indexStream);
if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
// get binID
uint32_t binID;
- fread(&binID, 4, 1, indexFile);
+ fread(&binID, 4, 1, indexStream);
// get number of regionChunks in this bin
uint32_t numChunks;
- fread(&numChunks, 4, 1, indexFile);
+ fread(&numChunks, 4, 1, indexStream);
// intialize ChunkVector
ChunkVector* regionChunks = new ChunkVector;
// get chunk boundaries (left, right)
uint64_t left;
uint64_t right;
- fread(&left, 8, 1, indexFile);
- fread(&right, 8, 1, indexFile);
+ fread(&left, 8, 1, indexStream);
+ fread(&right, 8, 1, indexStream);
// save ChunkPair
regionChunks->push_back( ChunkPair(left, right) );
}
+ // sort chunks for this bin
+ sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare<uint64_t, uint64_t>() );
+
// save binID, chunkVector for this bin
bins->push_back( BamBin(binID, regionChunks) );
}
+ // sort bins by binID
+ sort(bins->begin(), bins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
+
// load linear index for this reference sequence
// get number of linear offsets
int32_t numLinearOffsets;
- fread(&numLinearOffsets, 4, 1, indexFile);
+ fread(&numLinearOffsets, 4, 1, indexStream);
// intialize LinearOffsetVector
LinearOffsetVector* linearOffsets = new LinearOffsetVector;
for (int j = 0; j < numLinearOffsets; ++j) {
// get a linear offset
uint64_t linearOffset;
- fread(&linearOffset, 8, 1, indexFile);
+ fread(&linearOffset, 8, 1, indexStream);
// store linear offset
linearOffsets->push_back(linearOffset);
}
+ // sort linear offsets
+ sort( linearOffsets->begin(), linearOffsets->end() );
+
// store index data for that reference sequence
m_index->push_back( new RefIndex(bins, linearOffsets) );
}
// close index file (.bai) and return
- fclose(indexFile);
- return true;
+ fclose(indexStream);
}
bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
- // check valid alignment block header data
- int32_t block_len;
- int32_t ret;
- uint32_t x[8];
-
- int32_t bytesRead = 0;
-
// read in the 'block length' value, make sure it's not zero
- if ( (ret = bam_read(m_file, &block_len, 4)) == 0 ) { return false; }
- bytesRead += 4;
+ char buffer[4];
+ BgzfRead(buffer, 4);
+ const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer);
+ if ( blockLength == 0 ) { return false; }
+
+ // keep track of bytes read as method progresses
+ int bytesRead = 4;
- // read in core alignment data, make the right size of data was read
- if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
+ // read in core alignment data, make sure the right size of data was read
+ char x[BAM_CORE_SIZE];
+ if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
bytesRead += BAM_CORE_SIZE;
- // set BamAlignment 'core' data
- bAlignment.RefID = x[0];
- bAlignment.Position = x[1];
- bAlignment.Bin = x[2]>>16;
- bAlignment.MapQuality = x[2]>>8&0xff;
- bAlignment.AlignmentFlag = x[3]>>16;
- bAlignment.MateRefID = x[5];
- bAlignment.MatePosition = x[6];
- bAlignment.InsertSize = x[7];
-
- // fetch & store often-used lengths for character data parsing
- unsigned int queryNameLength = x[2]&0xff;
- unsigned int numCigarOperations = x[3]&0xffff;
- unsigned int querySequenceLength = x[4];
+ // set BamAlignment 'core' data and character data lengths
+ unsigned int tempValue;
+ unsigned int queryNameLength;
+ unsigned int numCigarOperations;
+ unsigned int querySequenceLength;
+
+ bAlignment.RefID = BgzfUnpackUnsignedInt(&x[0]);
+ bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]);
+
+ tempValue = BgzfUnpackUnsignedInt(&x[8]);
+ bAlignment.Bin = tempValue >> 16;
+ bAlignment.MapQuality = tempValue >> 8 & 0xff;
+ queryNameLength = tempValue & 0xff;
+
+ tempValue = BgzfUnpackUnsignedInt(&x[12]);
+ bAlignment.AlignmentFlag = tempValue >> 16;
+ numCigarOperations = tempValue & 0xffff;
+
+ querySequenceLength = BgzfUnpackUnsignedInt(&x[16]);
+ bAlignment.MateRefID = BgzfUnpackUnsignedInt(&x[20]);
+ bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]);
+ bAlignment.InsertSize = BgzfUnpackUnsignedInt(&x[28]);
+
+ // calculate lengths/offsets
+ const unsigned int dataLength = blockLength - BAM_CORE_SIZE;
+ const unsigned int cigarDataOffset = queryNameLength;
+ const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4);
+ const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2;
+ const unsigned int tagDataOffset = qualDataOffset + querySequenceLength;
+ const unsigned int tagDataLen = dataLength - tagDataOffset;
- // get length of character data
- int dataLength = block_len - BAM_CORE_SIZE;
-
// set up destination buffers for character data
- uint8_t* allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);
- uint32_t* cigarData = (uint32_t*)(allCharData+queryNameLength);
-
- const unsigned int tagDataOffset = (numCigarOperations * 4) + queryNameLength + (querySequenceLength + 1) / 2 + querySequenceLength;
- const unsigned int tagDataLen = dataLength - tagDataOffset;
- char* tagData = ((char*)allCharData) + tagDataOffset;
+ char* allCharData = (char*)calloc(sizeof(char), dataLength);
+ uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
+ char* seqData = ((char*)allCharData) + seqDataOffset;
+ char* qualData = ((char*)allCharData) + qualDataOffset;
+ char* tagData = ((char*)allCharData) + tagDataOffset;
// get character data - make sure proper data size was read
- if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }
+ if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; }
else {
-
+
bytesRead += dataLength;
-
- // clear out bases, qualities, aligned bases, CIGAR, and tag data
+
+ // clear out any previous string data
+ bAlignment.Name.clear();
bAlignment.QueryBases.clear();
bAlignment.Qualities.clear();
bAlignment.AlignedBases.clear();
bAlignment.CigarData.clear();
bAlignment.TagData.clear();
-
+
// save name
bAlignment.Name = (string)((const char*)(allCharData));
- // save bases
- char singleBase[2];
- uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);
- for (unsigned int i = 0; i < querySequenceLength; ++i) {
- // numeric to char conversion
- sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );
- // append character data to Bases
- bAlignment.QueryBases.append( (const char*)singleBase );
+ // save query sequence
+ for (unsigned int i = 0; i < querySequenceLength; ++i) {
+ char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
+ bAlignment.QueryBases.append( 1, singleBase );
}
// save sequence length
bAlignment.Length = bAlignment.QueryBases.length();
// save qualities
- char singleQuality[4];
- uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);
for (unsigned int i = 0; i < querySequenceLength; ++i) {
- // numeric to char conversion
- sprintf( singleQuality, "%c", ( t[i]+33 ) );
- // append character data to Qualities
- bAlignment.Qualities.append( (const char*)singleQuality );
+ char singleQuality = (char)(qualData[i]+33); // conversion from QV to FASTQ character
+ bAlignment.Qualities.append( 1, singleQuality );
}
// save CIGAR-related data;
int k = 0;
for (unsigned int i = 0; i < numCigarOperations; ++i) {
-
+
// build CigarOp struct
CigarOp op;
op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
-
+
// save CigarOp
bAlignment.CigarData.push_back(op);
-
- // can skip this step if user wants to ignore
- if (m_isCalculateAlignedBases) {
-
- // build AlignedBases string
- switch (op.Type) {
-
- case ('M') :
- case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases
- case ('S') : k += op.Length; // for 'S' - skip over query bases
- break;
-
- case ('D') :
- case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding;
- break;
-
- case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
- k += op.Length;
- break;
-
- case ('H') : break; // for 'H' - do nothing, move to next op
-
- default : assert(false); // shouldn't get here
- break;
- }
+
+ // build AlignedBases string
+ switch (op.Type) {
+
+ case ('M') :
+ case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases
+ case ('S') : k += op.Length; // for 'S' - skip over query bases
+ break;
+
+ case ('D') :
+ case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding;
+ break;
+
+ case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
+ k += op.Length;
+ break;
+
+ case ('H') : break; // for 'H' - do nothing, move to next op
+
+ default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here
+ exit(1);
}
}
-
+
// read in the tag data
bAlignment.TagData.resize(tagDataLen);
memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
}
+
free(allCharData);
+ return true;
+}
-/*
- // (optional) read tag parsing data
- string tag;
- char data;
- int i = 0;
+void BamReader::LoadReferenceData(void) {
- // still data to read (auxiliary tags)
- while ( bytesRead < block_len ) {
+ // get number of reference sequences
+ char buffer[4];
+ BgzfRead(buffer, 4);
+ const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer);
+ if (numberRefSeqs == 0) { return; }
+ m_references.reserve((int)numberRefSeqs);
+
+ // iterate over all references in header
+ for (unsigned int i = 0; i != numberRefSeqs; ++i) {
- if ( bam_read(m_file, &data, 1) == 1 ) {
-
- ++bytesRead;
+ // get length of reference name
+ BgzfRead(buffer, 4);
+ const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer);
+ char* refName = (char*)calloc(refNameLength, 1);
+
+ // get reference name and reference sequence length
+ BgzfRead(refName, refNameLength);
+ BgzfRead(buffer, 4);
+ const unsigned int refLength = BgzfUnpackUnsignedInt(buffer);
+
+ // store data for reference
+ RefData aReference;
+ aReference.RefName = (string)((const char*)refName);
+ aReference.RefLength = refLength;
+ m_references.push_back(aReference);
+
+ // clean up calloc-ed temp variable
+ free(refName);
+ }
+}
- if (bytesRead == block_len && data != '\0') {
- fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");
- exit(1);
- }
+// opens BAM file (and index)
+void BamReader::Open(const string& filename, const string& indexFilename) {
- tag.append(1, data);
- if ( data == '\0' ) {
- bAlignment.Tags.push_back(tag);
- tag = "";
- i = 0;
- } else {
- if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }
- ++i;
- }
- } else {
- fprintf(stderr, "ERROR: Could not read tag info.\n");
+ // open the BGZF file for reading, retrieve header text & reference data
+ BgzfOpen(filename);
+ LoadHeaderData();
+ LoadReferenceData();
+
+ // store file offset of first alignment
+ m_alignmentsBeginOffset = BgzfTell();
+
+ // open index file & load index data (if exists)
+ OpenIndex(indexFilename);
+}
+
+void BamReader::OpenIndex(const string& indexFilename) {
+
+ // if index file exists
+ if (!indexFilename.empty()) {
+
+ // open index
+ FILE* indexStream = fopen(indexFilename.c_str(), "rb");
+
+ // abort on error
+ if(!indexStream) {
+ cerr << "ERROR: Unable to open the BAM index file " << indexFilename << "for reading." << endl;
exit(1);
}
+
+ // build up index data structure
+ LoadIndexData(indexStream);
}
-*/
- return true;
}
+
+bool BamReader::Rewind(void) {
+
+ // find first reference that has alignments in the BAM file
+ int refID = 0;
+ int refCount = m_references.size();
+ for ( ; refID < refCount; ++refID ) {
+ if ( m_references.at(refID).RefHasAlignments ) { break; }
+ }
+
+ // store default bounds for first alignment
+ m_currentRefID = refID;
+ m_currentLeft = 0;
+ m_isRegionSpecified = false;
+
+ // return success/failure of seek
+ return BgzfSeek(m_alignmentsBeginOffset);
+}
-// BamReader.h\r
-\r
-/* The MIT License\r
-\r
- Copyright (c) 2008 Genome Research Ltd (GRL).\r
-\r
- Permission is hereby granted, free of charge, to any person obtaining\r
- a copy of this software and associated documentation files (the\r
- "Software"), to deal in the Software without restriction, including\r
- without limitation the rights to use, copy, modify, merge, publish,\r
- distribute, sublicense, and/or sell copies of the Software, and to\r
- permit persons to whom the Software is furnished to do so, subject to\r
- the following conditions:\r
-\r
- The above copyright notice and this permission notice shall be\r
- included in all copies or substantial portions of the Software.\r
-\r
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\r
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\r
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\r
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\r
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\r
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\r
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
- SOFTWARE.\r
-*/\r
-\r
-/*\r
- Implementation of BAM-parsing was translated to C++ directly from Heng Li's SAMtools package \r
- (thus the carryover of above MIT license)\r
- Contact: Derek Barnett <barnetde@bc.edu>\r
-*/\r
-\r
-// Derek Barnett\r
-// Marth Lab, Boston College\r
-// Last modified: 23 April 2009\r
-\r
-#ifndef BAMREADER_H\r
-#define BAMREADER_H\r
-\r
-// custom includes\r
-#include "BamAlignment.h"\r
-#include "STLUtilities.h"\r
-\r
-// C library includes\r
-#include <assert.h>\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#include <string.h>\r
-\r
-#ifdef WIN32\r
-typedef char int8_t;\r
-typedef unsigned char uint8_t;\r
-typedef short int16_t;\r
-typedef unsigned short uint16_t;\r
-typedef int int32_t;\r
-typedef unsigned int uint32_t;\r
-typedef long long int64_t;\r
-typedef unsigned long long uint64_t;\r
-#else\r
-#include <stdint.h>\r
-#endif\r
-\r
-// BGZF library includes/defines\r
-#include "bgzf.h"\r
-typedef BGZF* BamFile;\r
-#define bam_open(f_name, mode) bgzf_open(f_name, mode)\r
-#define bam_close(f_ptr) bgzf_close(f_ptr)\r
-#define bam_read(f_ptr, buf, size) bgzf_read(f_ptr, buf, size)\r
-#define bam_write(f_ptr, buf, size) bgzf_write(f_ptr, buf, size)\r
-#define bam_tell(f_ptr) bgzf_tell(f_ptr)\r
-#define bam_seek(f_ptr, pos, dir) bgzf_seek(f_ptr, pos, dir)\r
-\r
-// size of alignment data block in BAM file (bytes)\r
-#define BAM_CORE_SIZE 32\r
-\r
-// BAM indexing constants\r
-#define MAX_BIN 37450 // =(8^6-1)/7+1\r
-#define BAM_MIN_CHUNK_GAP 32768 \r
-#define BAM_LIDX_SHIFT 14\r
-\r
-// CIGAR-retrieval mask/shift constants\r
-#define BAM_CIGAR_SHIFT 4\r
-#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)\r
-\r
-// CIGAR-operation types\r
-#define BAM_CMATCH 0\r
-#define BAM_CINS 1\r
-#define BAM_CDEL 2\r
-#define BAM_CREF_SKIP 3\r
-#define BAM_CSOFT_CLIP 4\r
-#define BAM_CHARD_CLIP 5\r
-#define BAM_CPAD 6\r
-\r
-// --------------------------- //\r
-// Bam header info\r
-// --------------------------- //\r
-\r
-// --------------------------- //\r
-// BamIndex-related typedefs\r
-// --------------------------- //\r
-\r
-// offset for linear indexing\r
-typedef vector<uint64_t> LinearOffsetVector;\r
-\r
-// chunk boundaries\r
-typedef pair<uint64_t, uint64_t> ChunkPair;\r
-// list of chunks in a BAM bin\r
-typedef vector<ChunkPair> ChunkVector;\r
-\r
-// BAM bins for a reference sequence\r
-// replaces khash - uint32_t is key, ChunkVector is value\r
-typedef pair<uint32_t, ChunkVector*> BamBin;\r
-typedef vector<BamBin> BinVector;\r
-\r
-// each reference sequence has a BinVector and LinearOffsetVector\r
-typedef pair<BinVector*, LinearOffsetVector*> RefIndex;\r
-\r
-// full BamIndex defined as: \r
-typedef vector<RefIndex*> BamIndex;\r
-\r
-// ---------------------------------------------------------------------------//\r
-\r
-class BamReader {\r
- \r
- public:\r
- // constructors\r
- BamReader(const char* fileName = NULL, const char* indexFilename = NULL);\r
-\r
- public:\r
- // destructor\r
- ~BamReader(void);\r
- \r
- // BAM interface methods\r
- public:\r
-\r
- // ----------------------- //\r
- // File manipulation\r
- // ----------------------- //\r
- \r
- // open BAM file (automatically opens index if provided)\r
- bool Open(void);\r
- \r
- // open BAM index (allows index to be opened separately - i.e. sometime after the BAM file is opened)\r
- bool OpenIndex(void);\r
- \r
- // close BAM file\r
- bool Close(void);\r
- \r
- // get BAM filename\r
- const char* Filename(void) const;\r
- \r
- // set BAM filename\r
- void SetFilename(const char*);\r
-\r
- // get BAM Index filename\r
- const char* IndexFilename(void) const;\r
- \r
- // set BAM Index filename\r
- void SetIndexFilename(const char*);\r
-\r
- // ----------------------- //\r
- // Access BAM header\r
- // ----------------------- //\r
- \r
- // return full header text\r
- const string GetHeaderText(void) const;\r
- \r
- // --------------------------------- //\r
- // Access reference sequence info\r
- // --------------------------------- //\r
- \r
- // return number of reference sequences in BAM file\r
- const int GetReferenceCount(void) const;\r
-\r
- // return vector of RefData entries\r
- const RefVector GetReferenceData(void) const;\r
-\r
- // get refID from reference name\r
- const int GetRefID(string refName) const; \r
- \r
- // ----------------------------------------- //\r
- // File position moving\r
- // ----------------------------------------- //\r
-\r
- // jumps to 'left' position on refID\r
- // actually jumps before position, so reads that overlap 'left' are included as well\r
- // 'left' defaults to reference begin if not specified\r
- bool Jump(int refID, unsigned int left = 0);\r
-\r
- // Jump to beginning of BAM file, clears any region previously set by Jump()\r
- bool Rewind(void);\r
- \r
- // ------------------------------ //\r
- // Access alignments\r
- // ------------------------------ //\r
- \r
- // get next alignment\r
- bool GetNextAlignment(BamAlignment& read);\r
-\r
- // allow user to specifiy whether 'AlignedBases' string is calculated when alignment is loaded\r
- void SetCalculateAlignedBases(bool);\r
-\r
- // internal utility methods\r
- private:\r
- int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);\r
- uint32_t CalculateAlignmentEnd(const unsigned int&, const vector<CigarOp>&);\r
- int64_t GetOffset(int, unsigned int);\r
- bool IsOverlap(BamAlignment&);\r
- bool LoadHeader(void);\r
- bool LoadIndex(void);\r
- bool LoadNextAlignment(BamAlignment&);\r
-\r
- private: \r
- // main BAM reader components\r
- char* m_filename;\r
- char* m_indexFilename;\r
- BamFile m_file;\r
- BamIndex* m_index;\r
- RefVector m_references;\r
- string m_headerText;\r
-\r
- // state flags\r
- bool m_isOpen; // BAM file is open for processing\r
- bool m_isIndexLoaded; // BAM Index data is loaded and available for processing\r
- bool m_isRegionSpecified; // a region has been specified - specifically, a user has called Jump()\r
- bool m_isCalculateAlignedBases; // build 'AlignedBases' string when getting an alignment, otherwise skip (default = true)\r
-\r
- // region values\r
- int m_currentRefID;\r
- unsigned int m_currentLeft;\r
-\r
- // file offset of 1st read in BAM file\r
- int64_t m_alignmentsBeginOffset;\r
-\r
- private:\r
- // BAM character constants\r
- static const char* DNA_LOOKUP;\r
- static const char* CIGAR_LOOKUP;\r
-};\r
-\r
-#endif /* BAMREADER_H */\r
+// ***************************************************************************
+// BamReader (c) 2009 Derek Barnett
+// Marth Lab, Deptartment of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+using namespace std;
+
+#include "BamAlignment.h"
+
+// our zlib constants
+#define GZIP_ID1 31
+#define GZIP_ID2 139
+#define CM_DEFLATE 8
+#define FLG_FEXTRA 4
+#define OS_UNKNOWN 255
+#define BGZF_XLEN 6
+#define BGZF_ID1 66
+#define BGZF_ID2 67
+#define BGZF_LEN 2
+#define GZIP_WINDOW_BITS -15
+#define Z_DEFAULT_MEM_LEVEL 8
+
+// our BZGF constants
+#define BLOCK_HEADER_LENGTH 18
+#define BLOCK_FOOTER_LENGTH 8
+#define MAX_BLOCK_SIZE 65536
+#define DEFAULT_BLOCK_SIZE 65536
+
+// our BAM constants
+#define BAM_CORE_SIZE 32
+#define BAM_CMATCH 0
+#define BAM_CINS 1
+#define BAM_CDEL 2
+#define BAM_CREF_SKIP 3
+#define BAM_CSOFT_CLIP 4
+#define BAM_CHARD_CLIP 5
+#define BAM_CPAD 6
+#define BAM_CIGAR_SHIFT 4
+#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
+
+// BAM indexing constants
+#define MAX_BIN 37450 // =(8^6-1)/7+1
+#define BAM_MIN_CHUNK_GAP 32768
+#define BAM_LIDX_SHIFT 14
+
+// our variable sizes
+#define SIZEOF_INT 4
+
+// define our BZGF structure
+#ifndef BGZF_DATA
+#define BGZF_DATA
+struct BgzfData {
+ unsigned int UncompressedBlockSize;
+ unsigned int CompressedBlockSize;
+ unsigned int BlockLength;
+ unsigned int BlockOffset;
+ uint64_t BlockAddress;
+ bool IsOpen;
+ FILE* Stream;
+ char* UncompressedBlock;
+ char* CompressedBlock;
+
+ // constructor
+ BgzfData(void)
+ : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
+ , CompressedBlockSize(MAX_BLOCK_SIZE)
+ , BlockLength(0)
+ , BlockOffset(0)
+ , BlockAddress(0)
+ , IsOpen(false)
+ , Stream(NULL)
+ , UncompressedBlock(NULL)
+ , CompressedBlock(NULL)
+ {
+ try {
+ CompressedBlock = new char[CompressedBlockSize];
+ UncompressedBlock = new char[UncompressedBlockSize];
+ } catch(bad_alloc&) {
+ printf("ERROR: Unable to allocate memory for our BGZF object.\n");
+ exit(1);
+ }
+ }
+
+ // destructor
+ ~BgzfData(void) {
+ if(CompressedBlock) delete [] CompressedBlock;
+ if(UncompressedBlock) delete [] UncompressedBlock;
+ }
+};
+#endif // BGZF_DATA
+
+
+// --------------------------- //
+// BamIndex-related typedefs
+// --------------------------- //
+
+// offset for linear indexing
+typedef vector<uint64_t> LinearOffsetVector;
+
+// alignment 'chunk' boundaries
+typedef pair<uint64_t, uint64_t> ChunkPair;
+typedef vector<ChunkPair> ChunkVector;
+
+// BAM bins.. each contains (binID, alignment 'chunks')
+typedef pair<uint32_t, ChunkVector*> BamBin;
+typedef vector<BamBin> BinVector;
+
+// each reference sequence has a BinVector and LinearOffsetVector
+typedef pair<BinVector*, LinearOffsetVector*> RefIndex;
+
+// full BamIndex defined as:
+typedef vector<RefIndex*> BamIndex;
+
+class BamReader {
+
+ // constructor/destructor
+ public:
+ BamReader(void);
+ ~BamReader(void);
+
+ // public interface
+ public:
+ // closes the BAM file
+ void Close(void);
+ // retrieves header text
+ const string GetHeaderText(void) const;
+ // saves the alignment to the alignment archive
+ bool GetNextAlignment(BamAlignment& bAlignment);
+ // return number of reference sequences in BAM file
+ const int GetReferenceCount(void) const;
+ // return vector of RefData entries
+ const RefVector GetReferenceData(void) const;
+ // get refID from reference name
+ const int GetReferenceID(const string& refName) const;
+ // jumps to 'left' position on refID
+ bool Jump(int refID, unsigned int left = 0);
+ // opens the BAM file
+ void Open(const string& filename, const string& indexFilename = "");
+ // move file pointer back to first alignment
+ bool Rewind(void);
+
+ // internal methods
+ private:
+ // checks BGZF block header
+ bool BgzfCheckBlockHeader(char* header);
+ // closes the BAM file
+ void BgzfClose(void);
+ // de-compresses the current block
+ int BgzfInflateBlock(int blockLength);
+ // opens the BAM file for reading
+ void BgzfOpen(const string& filename);
+ // reads BGZF data into a byte buffer
+ unsigned int BgzfRead(char* data, const unsigned int dataLen);
+ // reads BGZF block
+ int BgzfReadBlock(void);
+ // seek to position in BAM file
+ bool BgzfSeek(int64_t position);
+ // get file position in BAM file
+ int64_t BgzfTell(void);
+ // unpacks a buffer into an unsigned int
+ static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
+ // unpacks a buffer into an unsigned short
+ static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
+ // calculate bins that overlap region ( left to reference end for now )
+ int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
+ // calculates alignment end position based on starting position and provided CIGAR operations
+ unsigned int CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData);
+ // clear out (delete pointers in) index data structure
+ void ClearIndex(void);
+ // calculate file offset for first alignment chunk overlapping 'left'
+ int64_t GetOffset(int refID, unsigned int left);
+ // checks to see if alignment overlaps current region
+ bool IsOverlap(BamAlignment& bAlignment);
+ // retrieves header text from BAM file
+ void LoadHeaderData(void);
+ // builds BamIndex data structure from BAM index file
+ void LoadIndexData(FILE* indexStream);
+ // retrieves BAM alignment under file pointer
+ bool LoadNextAlignment(BamAlignment& bAlignment);
+ // builds reference data structure from BAM file
+ void LoadReferenceData(void);
+ // open BAM index file (if successful, loads index)
+ void OpenIndex(const string& indexFilename);
+
+ // aligment file & index file data
+ private:
+ BgzfData m_BGZF;
+ string m_headerText;
+ BamIndex* m_index;
+ RefVector m_references;
+ bool m_isIndexLoaded;
+ int64_t m_alignmentsBeginOffset;
+
+ // user-specified region values
+ private:
+ bool m_isRegionSpecified;
+ int m_currentRefID;
+ unsigned int m_currentLeft;
+
+ // BAM character constants
+ private:
+ static const char* DNA_LOOKUP;
+ static const char* CIGAR_LOOKUP;
+};
+
+// unpacks a buffer into an unsigned int
+inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
+ union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
+ un.valueBuffer[0] = buffer[0];
+ un.valueBuffer[1] = buffer[1];
+ un.valueBuffer[2] = buffer[2];
+ un.valueBuffer[3] = buffer[3];
+ return un.value;
+}
+
+// unpacks a buffer into an unsigned short
+inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
+ union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
+ un.valueBuffer[0] = buffer[0];
+ un.valueBuffer[1] = buffer[1];
+ return un.value;
+}
+
+// allows sorting/searching of a vector of pairs (instead of using maps)
+template <typename Key, typename Value>
+class LookupKeyCompare {
+
+ typedef pair< Key, Value > LookupData;
+ typedef typename LookupData::first_type Key_t;
+
+ public:
+ bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
+ bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }
+ bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }
+ private:
+ bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
+};
+
+// return index of item if found, else return container.size()
+template < typename InputIterator, typename EqualityComparable >
+typename iterator_traits<InputIterator>::difference_type
+Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
+ return distance(begin, find(begin, end, item));
+}
+
\r
// Derek Barnett\r
// Marth Lab, Boston College\r
-// Last modified: 6 April 2009\r
+// Last modified: 12 May 2009\r
\r
#include "BamReader.h"\r
\r
const char* bamFilename = argv[1];\r
const char* bamIndexFilename = argv[2];\r
\r
- string refName;\r
- int refID;\r
-\r
- int alignmentCount;\r
-\r
- vector<string> refNames;\r
-\r
+ int alignmentsRead = 0;\r
BamAlignment bAlignment;\r
- BamAlignmentVector alignments;\r
-\r
- RefVector references;\r
-\r
- // ------------------------------------------------------------------------------------------------------ //\r
- // Declare BamReader & open BAM file - automatically loads up header and index file (.bai) information\r
- // ------------------------------------------------------------------------------------------------------ //\r
\r
- BamReader bReader(bamFilename, bamIndexFilename);\r
+ BamReader bReader;\r
+ cerr << endl << "Opening BAM file (and index)" << endl << endl;\r
+ bReader.Open(bamFilename, bamIndexFilename);\r
\r
- cerr << endl;\r
- cerr << "Opening BamReader for BAM file: " << bamFilename << " ..... ";\r
- \r
- if ( bReader.Open() ) { \r
- cerr << "ok" << endl; \r
- }\r
- else {\r
- cerr << "error" << endl;\r
- exit(1);\r
+ string header = bReader.GetHeaderText();\r
+ cerr << "Printing header text..." << endl;\r
+ if ( header.empty() ) {\r
+ cerr << "No header provided." << endl << endl;\r
+ } else {\r
+ cerr << "----------------------" << endl;\r
+ cerr << "Header Text: " << endl;\r
+ cerr << "----------------------" << endl;\r
+ cerr << header << endl << endl;\r
}\r
- \r
- // ------------------------------------------------------------ //\r
- // Find out how many reference sequences are in BAM file\r
- // ------------------------------------------------------------ //\r
- \r
- references = bReader.GetReferenceData();\r
\r
- cerr << endl;\r
- cerr << "Total number of ref seqs: " << references.size() << endl;\r
- \r
- // ---------------------------------------------------------------------------- //\r
- // Get the names/lengths of all the reference sequences that have alignments\r
- // ---------------------------------------------------------------------------- //\r
- \r
- cerr << endl;\r
- cerr << "All ref sequences with alignments:" << endl;\r
+ RefVector references = bReader.GetReferenceData();\r
+ cerr << "Printing references..." << endl;\r
+ RefVector::const_iterator refIter = references.begin();\r
+ RefVector::const_iterator refEnd = references.end();\r
+ for ( ; refIter != refEnd; ++refIter) {\r
+ cerr << "Reference entry: " << endl;\r
+ cerr << (*refIter).RefName << endl;\r
+ cerr << (*refIter).RefLength << endl;\r
+ cerr << "Has alignments? : " << ( ((*refIter).RefHasAlignments) ? "yes" : "no" ) << endl;\r
+ }\r
cerr << endl;\r
\r
+ alignmentsRead = 0;\r
+ while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) {\r
+ cerr << "Alignment " << alignmentsRead << endl;\r
+ cerr << bAlignment.Name << endl;\r
+ cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl;\r
+ ++alignmentsRead;\r
+ }\r
\r
- if ( !references.empty() ) {\r
- \r
- RefVector::iterator refIter = references.begin();\r
- RefVector::iterator refEnd = references.end();\r
- \r
- refID = 0;\r
- // iterate over reference names, print to STDERR\r
- for( ; refIter != refEnd; ++refIter) {\r
-\r
- if ( (*refIter).RefHasAlignments ) {\r
- cerr << "ID: " << refID << endl;\r
- cerr << "Name: " << (*refIter).RefName << endl;\r
- cerr << "Length: " << (*refIter).RefLength << endl;\r
- cerr << endl;\r
+ cerr << "Jumping in BAM file" << endl;\r
+ if ( bReader.Jump(1, 5000) ) {\r
+ cerr << "Jumping seems ok - getting first 10 alignments that start at or after chr2:5000" << endl;\r
+\r
+ alignmentsRead = 0;\r
+ while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) {\r
+ if ( bAlignment.Position >= 5000 ) { \r
+ cerr << "Alignment " << alignmentsRead << endl;\r
+ cerr << bAlignment.Name << endl;\r
+ cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl;\r
+ ++alignmentsRead;\r
}\r
-\r
- ++refID;\r
}\r
}\r
\r
- // --------------------------------------------------------------------------------------------- //\r
- // Get the SAM-style header text, if available (not available in the example file shown here)\r
- // --------------------------------------------------------------------------------------------- // \r
- \r
- cerr << "------------------------------------" << endl;\r
- cerr << "SAM header text: " << endl;\r
- \r
- // get (SAM) header text\r
- string header = bReader.GetHeaderText();\r
- \r
- cerr << ( (header.empty()) ? "no header data" : header ) << endl;\r
- cerr << "------------------------------------" << endl;\r
-\r
- // --------------------------------------------------------------------------------------------- //\r
- // Here we start accessing alignments\r
- // The first method shows how to iterate over all reads in a BamFile\r
- // This method will work on any BAM file (sorted/non-sorted, with/without an index)\r
- // --------------------------------------------------------------------------------------------- //\r
-\r
- // Call Rewind() to make sure you're at the 1st alignment in the file\r
- // Please note, however, it's not necessary in this case, since file pointer initially set to 1st alignment\r
- // but this is probably a good habit to develop to ensure correctness and make your intent obvious in the code\r
-\r
- cerr << "Getting (up to) first 1000 alignments" << endl;\r
-\r
- // start at 1st alignment\r
+ cerr << "Rewinding BAM file" << endl;\r
if ( bReader.Rewind() ) {\r
+ cerr << "Rewind seems to be ok - getting first 10 alignments" << endl;\r
\r
- alignmentCount = 0;\r
- while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) {\r
- \r
- // disregard unmapped alignments\r
- if ( bAlignment.IsMapped() ) {\r
-\r
- ++alignmentCount;\r
- \r
- cout << "----------------------------" << endl;\r
- cout << "Alignment " << alignmentCount << endl;\r
- cout << bAlignment.Name << endl;\r
- cout << bAlignment.AlignedBases << endl;\r
- cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;\r
- \r
- cout << "Cigar Data: " << endl;\r
-\r
- vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
- vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end();\r
- for ( ; cigarIter != cigarEnd; ++cigarIter ) {\r
- cout << "Type: " << (*cigarIter).Type << "\tLength: " << (*cigarIter).Length << endl;\r
- }\r
-\r
- if(!bAlignment.TagData.empty()) {\r
- cout << "Tag data is present." << endl;\r
- string readGroup;\r
- if(bAlignment.GetReadGroup(readGroup)) {\r
- cout << "- read group: " << readGroup << endl;\r
- }\r
- }\r
- }\r
+ alignmentsRead = 0;\r
+ while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) {\r
+ cerr << "Alignment " << alignmentsRead << endl;\r
+ cerr << bAlignment.Name << endl;\r
+ cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl;\r
+ ++alignmentsRead;\r
}\r
-\r
- cerr << "Found " << alignmentCount << " alignments." << endl;\r
- } else { cerr << "Could not rewind" << endl; }\r
-\r
- // ---------------------------------------------------------------------------------------------------------- //\r
- // You can iterate over each individual alignment that overlaps a specified region\r
- // Set the refID & left boundary parameters using Jump(refID, left)\r
- // Jump() actually positions the file pointer actually before the left boundary\r
- // This ensures that reads beginning before, but overlapping 'left' are included as well \r
- // Client is responsible for setting/checking right boundary - see below\r
- // ---------------------------------------------------------------------------------------------------------- //\r
-/*\r
- cerr << "Jumping to region" << endl;\r
-\r
- // get refID using a known refName\r
- refName = "seq1";\r
- refID = bReader.GetRefID(refName);\r
- if (refID == (int)references.size()) { \r
- cerr << "Reference " << refName << " not found." << endl;\r
- exit(1);\r
}\r
\r
- // set left boundary\r
- unsigned int leftBound = 500;\r
-\r
- // set right boundary - either user-specified number to set a discrete region\r
- // OR you can query the BamReader for the end of the reference\r
- unsigned int rightBound = references.at(refID).RefLength;\r
-\r
- cerr << endl;\r
- cerr << "Iterating over alignments on reference: " << refName << " from " << leftBound << " to ref end (" << rightBound << ")" << endl;\r
-\r
- // set region - specific region on reference\r
- if ( bReader.Jump(refID, leftBound) ) { \r
-\r
- alignmentCount = 0;\r
- while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= rightBound) ) {\r
- \r
- if ( bAlignment.IsMapped() ) {\r
-\r
- ++alignmentCount;\r
- \r
- if ( (alignmentCount % 100000) == 0) { cerr << "Retrieved " << alignmentCount << " so far..." << endl; }\r
- \r
- cout << "----------------------------" << endl;\r
- cout << "Alignment " << alignmentCount << endl;\r
- cout << bAlignment.Name << endl;\r
- cout << bAlignment.AlignedBases << endl;\r
- cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;\r
- }\r
- }\r
-\r
- cerr << "Found " << alignmentCount << " alignments." << endl; \r
- } else { cerr << "Could not jump to region specified" << endl; }\r
-\r
- // ----------------------------------------------------------------------------------------------------- //\r
- // You can 'rewind' back to beginning of BAM file at any time \r
- // ----------------------------------------------------------------------------------------------------- //\r
-\r
- cerr << endl;\r
- cerr << "Rewinding BAM file... then getting first 1000 alignments" << endl;\r
-\r
- alignmentCount = 0;\r
- if (bReader.Rewind() ) {\r
- while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) {\r
- \r
- // disregard unmapped alignments\r
- if ( bAlignment.IsMapped() ) {\r
-\r
- ++alignmentCount;\r
- \r
- cout << "----------------------------" << endl;\r
- cout << "Alignment " << alignmentCount << endl;\r
- cout << bAlignment.Name << endl;\r
- cout << bAlignment.AlignedBases << endl;\r
- cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;\r
- }\r
- }\r
-\r
- cerr << "Found " << alignmentCount << " alignments." << endl;\r
- } else { cerr << "Could not rewind" << endl; }\r
-*/\r
- // ---------------------------------------------------------------------- //\r
- // Close BamReader object (releases internal header/index data) and exit\r
- // ---------------------------------------------------------------------- //\r
- \r
- cerr << endl;\r
- cerr << "Closing BAM file: " << bamFilename << endl;\r
- \r
+ cerr << "Closing BAM file..." << endl << endl;\r
bReader.Close();\r
\r
cerr << "Exiting..." << endl << endl;\r
#define SIZEOF_INT 4
// define our BZGF structure
+#ifndef BGZF_DATA
+#define BGZF_DATA
struct BgzfData {
unsigned int UncompressedBlockSize;
unsigned int CompressedBlockSize;
if(UncompressedBlock) delete [] UncompressedBlock;
}
};
+#endif // BGZF_DATA
class BamWriter {
public:
-CC= gcc\r
CXX= g++\r
-CFLAGS= -Wall -O3\r
-CXXFLAGS= $(CFLAGS)\r
-DFLAGS= -D_IOLIB=2 #-D_NDEBUG\r
-OBJS= BamReader.o bgzf.o\r
-PROG= BamReaderTest\r
-INCLUDES= \r
-ARFLAGS= -crs\r
+CXXFLAGS= -Wall -O3\r
+PROG= BamReaderTest BamConversion\r
LIBS= -lz\r
-SUBDIRS= .\r
\r
-.SUFFIXES:.c .cpp .o\r
+all: $(PROG)\r
\r
-.c.o:\r
- $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@\r
+BamReaderTest: BamReader.o BamReaderMain.o\r
+ $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamReaderMain.o $(LIBS)\r
\r
-.cpp.o:\r
- $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@\r
-\r
-all: $(PROG) BamConversion\r
-\r
-lib:libbambc.a\r
-\r
-libbambc.a:$(OBJS)\r
- $(AR) $(ARFLAGS) $@ $(OBJS)\r
-\r
-BamReaderTest:lib BamReaderMain.o\r
- $(CXX) $(CXXFLAGS) -o $@ BamReaderMain.o $(LIBS) -L. -lbambc\r
-\r
-BamConversion: lib BamWriter.o BamConversionMain.o\r
- $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamConversionMain.o $(LIBS) -L. -lbambc\r
-\r
-BamMerge: lib BamMerge.o\r
- $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamMerge.o $(LIBS) -L. -lbambc\r
+BamConversion: BamReader.o BamWriter.o BamConversionMain.o\r
+ $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamConversionMain.o $(LIBS)\r
\r
+BamMerge: BamReader.o BamWriter.o BamMerge.o\r
+ $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamMerge.o $(LIBS)\r
\r
clean:\r
- rm -fr gmon.out *.o *.a a.out $(PROG) BamConversion *~\r
+ rm -fr gmon.out *.o *.a a.out *~\r
README
----------------
-Copy the 4 header files (*.h) and the library file (libbambc.a) to your directory.
-When you compile your program, add this to the g++ command:
+Significant re-write of BamReader API as of 5/18/2009:
+-------------------------------------------------------
+No longer using bgzf.h/.c
+No longer using libbambc.a library
+No longer using STL Utilities
+Minor changes to API syntax in opening BAM files with BamReader - see BamReader.h for details
- -lz -L. -lbambc
-and you should be good to go. (Those are lower case l's, as in 'lion'.)
+Compile instructions
+----------------------
+Short version:
+See Makefile for compilation example.
+
+Longer version:
+Copy BamAlignment.h, BamReader.* and/or BamWriter.* to working directory.
+#include "BamReader.h" and/or "BamWriter.h"
+Use standard compile commands, including the "-lz" parameter to access zlib functionality.
+(Note that "-L. -lbambc" are no longer necessary and WILL CAUSE PROBLEMS for compiler/linker )
+
+
+Feel free to email me or pop by my desk with any questions, comments, suggestions, etc.
+ - Derek
+
+++ /dev/null
-// STLUtilities.h\r
-\r
-// Derek Barnett\r
-// Marth Lab, Boston College\r
-// Last modified: 19 March 2009\r
-\r
-#ifndef STL_UTILITIES_H\r
-#define STL_UTILITIES_H\r
-\r
-#include <algorithm>\r
-using std::distance;\r
-using std::find;\r
-using std::transform;\r
-\r
-#include <functional>\r
-using std::unary_function;\r
-\r
-#include <iterator>\r
-using std::iterator_traits;\r
-\r
-#include <string>\r
-using std::string;\r
-\r
-#include <utility>\r
-using std::pair;\r
-\r
-#include <vector>\r
-using std::vector;\r
-\r
-// -------------------------------------------------------------------------------- //\r
-// This file contains a sample of STL tricks I've gathered along the way.\r
-//\r
-// Many thanks throughout to 'Effective' series by Scott Meyers.\r
-// Some code is lifted (almost) verbatim from his texts where applicable.\r
-// ------------------------------------------------------------------------------- //\r
-\r
-// --------------------------------------------------------------------------------------------------------------------------------- //\r
-// STL containers will delete the values they hold when the container is deleted or goes out of scope\r
-// *** If the container contains pointers, however, they WILL NOT delete the actual pointed-to objects ***\r
-// This struct allows for easy algorithm processing (instead of pesky loops and iterator-boundary issues) to delete pointed-to objects (for any C++ type) that have been allocated with 'new')\r
-// Usage example: for_each( container.begin(), container.end(), DeleteObject() ) \r
-//\r
-// Unless of course, you like quirky, hard-to-spot memory leaks... then feel free to disregard this little STL lesson =)\r
-// --------------------------------------------------------------------------------------------------------------------------------- //\r
-struct DeleteObject {\r
- template<typename T>\r
- void operator() (const T* p) const { \r
- delete p; \r
- }\r
-};\r
-\r
-template<typename T>\r
-void ClearPointerVector(vector<T>& v) {\r
- if ( !v.empty() ) {\r
- for_each(v.begin(), v.end(), DeleteObject());\r
- v.clear();\r
- }\r
-}\r
-\r
-// --------------------------------------------------------------------------------------------------------------------------------- //\r
-// Query a vector (other containers? havent tried) for an element\r
-// Returns the index of that element\r
-// Returns vector::size() if not found\r
-// Works with reverse iterators as well... index is counted backward from last element though\r
-// --------------------------------------------------------------------------------------------------------------------------------- //\r
-template < typename InputIterator, typename EqualityComparable >\r
-typename iterator_traits<InputIterator>::difference_type\r
-Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {\r
- return distance(begin, find(begin, end, item));\r
-}\r
-\r
-// ----------------------------------------------------------------------------------------------------------------------//\r
-// This next section is a sort of work-around for the bulky associative maps\r
-// The idea is to use a *SORTED* vector of pair objects as a lookup vector\r
-// LookupVector = vector< pair<Key, Value> > \r
-// ----------------------------------------------------------------------------------------------------------------------//\r
-\r
-// The following template classes allow a templatized comparison function for sorting & searching lookup vector\r
-\r
-// allows sorting by Key ( less<Key> )\r
-template <typename Key, typename Value>\r
-class LookupKeyCompare {\r
-\r
- typedef pair< Key, Value > LookupData;\r
- typedef typename LookupData::first_type Key_t;\r
- \r
- public:\r
- bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }\r
- bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }\r
- bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }\r
- private:\r
- bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }\r
-};\r
-\r
-// allows sorting by Value ( less<Value> )\r
-template <typename Key, typename Value>\r
-class LookupValueCompare {\r
-\r
- typedef pair< Key, Value > LookupData;\r
- typedef typename LookupData::second_type Value_t;\r
- \r
- public:\r
- bool operator() (const LookupData& lhs, const LookupData& rhs) const { return valueLess(lhs.second, rhs.second); }\r
- bool operator() (const LookupData& lhs, const Value_t& k) const { return valueLess(lhs.second, k); }\r
- bool operator() (const Value_t& k, const LookupData& rhs) const { return valueLess(k, rhs.second); }\r
- private:\r
- bool valueLess(const Value_t& k1, const Value_t& k2) const { return k1 < k2; }\r
-};\r
-\r
-// The following template functions/structs allow you to pull all keys or all values from this lookup structure\r
-\r
-// pull Key from a data pair\r
-template<typename Key, typename Value>\r
-struct RipKey:\r
- public unary_function< pair<Key,Value>, Key> {\r
- Key operator() (pair<Key,Value> dataPair) { \r
- return dataPair.first; \r
- }\r
-};\r
-\r
-// pull Value from a data pair\r
-template<typename Key, typename Value>\r
-struct RipValue:\r
- public unary_function< pair<Key,Value>, Value> {\r
- Value operator() (pair<Key,Value> dataPair) { \r
- return dataPair.second; \r
- }\r
-};\r
-\r
-// pull all Keys from lookup, store in dest (overwrite contents of dest)\r
-template <typename Key, typename Value>\r
-void RipKeys( vector< pair<Key,Value> >& lookup, vector<Key>& dest) {\r
- dest.clear();\r
- dest.reserve(lookup.size());\r
- transform(lookup.begin(), lookup.end(), back_inserter(dest), RipKey<Key,Value>());\r
-}\r
-\r
-// pull all Values from lookup, store in dest (overwrite contents of dest)\r
-template <typename Key, typename Value>\r
-void RipValues( vector< pair<Key,Value> >& lookup, vector<Value>& dest) {\r
- dest.clear();\r
- dest.reserve(lookup.size());\r
- transform(lookup.begin(), lookup.end(), back_inserter(dest), RipValue<Key,Value>());\r
-}\r
-\r
-#endif /* STL_UTILITIES_H */\r
+++ /dev/null
-/*
- * The Broad Institute
- * SOFTWARE COPYRIGHT NOTICE AGREEMENT
- * This software and its documentation are copyright 2008 by the
- * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
- *
- * This software is supplied without any warranty or guaranteed support whatsoever.
- * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
- * or functionality.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "bgzf.h"
-
-#ifdef WIN32
-
-#else
-#include <unistd.h>
-extern off_t ftello(FILE *stream);
-extern int fseeko(FILE *stream, off_t offset, int whence);
-#endif
-
-typedef int8_t byte;
-
-static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
-static const int MAX_BLOCK_SIZE = 64 * 1024;
-
-static const int BLOCK_HEADER_LENGTH = 18;
-static const int BLOCK_FOOTER_LENGTH = 8;
-
-static const int GZIP_ID1 = 31;
-static const int GZIP_ID2 = 139;
-static const int CM_DEFLATE = 8;
-static const int FLG_FEXTRA = 4;
-static const int OS_UNKNOWN = 255;
-static const int BGZF_ID1 = 66; // 'B'
-static const int BGZF_ID2 = 67; // 'C'
-static const int BGZF_LEN = 2;
-static const int BGZF_XLEN = 6; // BGZF_LEN+4
-
-static const int GZIP_WINDOW_BITS = -15; // no zlib header
-static const int Z_DEFAULT_MEM_LEVEL = 8;
-
-
-inline
-void
-packInt16(uint8_t* buffer, uint16_t value)
-{
- buffer[0] = value;
- buffer[1] = value >> 8;
-}
-
-inline
-int
-unpackInt16(const uint8_t* buffer)
-{
- return (buffer[0] | (buffer[1] << 8));
-}
-
-inline
-void
-packInt32(uint8_t* buffer, uint32_t value)
-{
- buffer[0] = value;
- buffer[1] = value >> 8;
- buffer[2] = value >> 16;
- buffer[3] = value >> 24;
-}
-
-inline
-int
-min(int x, int y)
-{
- return (x < y) ? x : y;
-}
-
-static
-void
-report_error(BGZF* fp, const char* message) {
- fp->error = message;
-}
-
-static
-BGZF*
-open_read(int fd)
-{
- FILE* file = fdopen(fd, "r");
- BGZF* fp = (BGZF*)malloc(sizeof(BGZF));
- fp->file_descriptor = fd;
- fp->open_mode = 'r';
- fp->owned_file = 0;
- fp->file = file;
- fp->uncompressed_block_size = MAX_BLOCK_SIZE;
- fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
- fp->compressed_block_size = MAX_BLOCK_SIZE;
- fp->compressed_block = malloc(MAX_BLOCK_SIZE);
- fp->block_address = 0;
- fp->block_offset = 0;
- fp->block_length = 0;
- fp->error = NULL;
- return fp;
-}
-
-static
-BGZF*
-open_write(int fd)
-{
- FILE* file = fdopen(fd, "w");
- BGZF* fp = (BGZF*)malloc(sizeof(BGZF));
- fp->file_descriptor = fd;
- fp->open_mode = 'w';
- fp->owned_file = 0;
- fp->file = file;
- fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
- fp->uncompressed_block = NULL;
- fp->compressed_block_size = MAX_BLOCK_SIZE;
- fp->compressed_block = malloc(MAX_BLOCK_SIZE);
- fp->block_address = 0;
- fp->block_offset = 0;
- fp->block_length = 0;
- fp->error = NULL;
- return fp;
-}
-
-BGZF*
-bgzf_open(const char* __restrict path, const char* __restrict mode)
-{
- BGZF* fp = NULL;
- if (strcasecmp(mode, "r") == 0) {
- int oflag = O_RDONLY;
- int fd = open(path, oflag);
- fp = open_read(fd);
- } else if (strcasecmp(mode, "w") == 0) {
- int oflag = O_WRONLY | O_CREAT | O_TRUNC;
- int fd = open(path, oflag, 0644);
- fp = open_write(fd);
- }
- if (fp != NULL) {
- fp->owned_file = 1;
- }
- return fp;
-}
-
-BGZF*
-bgzf_fdopen(int fd, const char * __restrict mode)
-{
- if (strcasecmp(mode, "r") == 0) {
- return open_read(fd);
- } else if (strcasecmp(mode, "w") == 0) {
- return open_write(fd);
- } else {
- return NULL;
- }
-}
-
-static
-int
-deflate_block(BGZF* fp, int block_length)
-{
- // Deflate the block in fp->uncompressed_block into fp->compressed_block.
- // Also adds an extra field that stores the compressed block length.
-
- byte* buffer = (byte*)fp->compressed_block;
- int buffer_size = fp->compressed_block_size;
-
- // Init gzip header
- buffer[0] = GZIP_ID1;
- buffer[1] = GZIP_ID2;
- buffer[2] = CM_DEFLATE;
- buffer[3] = FLG_FEXTRA;
- buffer[4] = 0; // mtime
- buffer[5] = 0;
- buffer[6] = 0;
- buffer[7] = 0;
- buffer[8] = 0;
- buffer[9] = OS_UNKNOWN;
- buffer[10] = BGZF_XLEN;
- buffer[11] = 0;
- buffer[12] = BGZF_ID1;
- buffer[13] = BGZF_ID2;
- buffer[14] = BGZF_LEN;
- buffer[15] = 0;
- buffer[16] = 0; // placeholder for block length
- buffer[17] = 0;
-
- // loop to retry for blocks that do not compress enough
- int input_length = block_length;
- int compressed_length = 0;
- while (1) {
-
- z_stream zs;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = (Bytef*)fp->uncompressed_block;
- zs.avail_in = input_length;
- zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
- zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
-
- int status = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
- GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
- if (status != Z_OK) {
- report_error(fp, "deflate init failed");
- return -1;
- }
- status = deflate(&zs, Z_FINISH);
- if (status != Z_STREAM_END) {
- deflateEnd(&zs);
- if (status == Z_OK) {
- // Not enough space in buffer.
- // Can happen in the rare case the input doesn't compress enough.
- // Reduce the amount of input until it fits.
- input_length -= 1024;
- if (input_length <= 0) {
- // should never happen
- report_error(fp, "input reduction failed");
- return -1;
- }
- continue;
- }
- report_error(fp, "deflate failed");
- return -1;
- }
- status = deflateEnd(&zs);
- if (status != Z_OK) {
- report_error(fp, "deflate end failed");
- return -1;
- }
- compressed_length = zs.total_out;
- compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
- if (compressed_length > MAX_BLOCK_SIZE) {
- // should never happen
- report_error(fp, "deflate overflow");
- return -1;
- }
- break;
- }
-
- packInt16((uint8_t*)&buffer[16], compressed_length-1);
- uint32_t crc = crc32(0L, NULL, 0L);
- crc = crc32(crc, (Bytef*)fp->uncompressed_block, input_length);
- packInt32((uint8_t*)&buffer[compressed_length-8], crc);
- packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
-
- int remaining = block_length - input_length;
- if (remaining > 0) {
- if (remaining > input_length) {
- // should never happen (check so we can use memcpy)
- report_error(fp, "remainder too large");
- return -1;
- }
- memcpy(fp->uncompressed_block,
- (char*)fp->uncompressed_block + input_length,
- remaining);
- }
- fp->block_offset = remaining;
- return compressed_length;
-}
-
-static
-int
-inflate_block(BGZF* fp, int block_length)
-{
- // Inflate the block in fp->compressed_block into fp->uncompressed_block
-
- z_stream zs;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = (Bytef*)fp->compressed_block + 18;
- zs.avail_in = block_length - 16;
- zs.next_out = (Bytef*)fp->uncompressed_block;
- zs.avail_out = fp->uncompressed_block_size;
-
- int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
- if (status != Z_OK) {
- report_error(fp, "inflate init failed");
- return -1;
- }
- status = inflate(&zs, Z_FINISH);
- if (status != Z_STREAM_END) {
- inflateEnd(&zs);
- report_error(fp, "inflate failed");
- return -1;
- }
- status = inflateEnd(&zs);
- if (status != Z_OK) {
- report_error(fp, "inflate failed");
- return -1;
- }
- return zs.total_out;
-}
-
-static
-int
-check_header(const byte* header)
-{
- return (header[0] == GZIP_ID1 &&
- header[1] == (byte) GZIP_ID2 &&
- header[2] == Z_DEFLATED &&
- (header[3] & FLG_FEXTRA) != 0 &&
- unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
- header[12] == BGZF_ID1 &&
- header[13] == BGZF_ID2 &&
- unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
-}
-
-static
-int
-read_block(BGZF* fp)
-{
- byte header[BLOCK_HEADER_LENGTH];
- int64_t block_address = ftello(fp->file);
- int count = fread(header, 1, sizeof(header), fp->file);
- if (count == 0) {
- fp->block_length = 0;
- return 0;
- }
- if (count != sizeof(header)) {
- report_error(fp, "read failed");
- return -1;
- }
- if (!check_header(header)) {
- report_error(fp, "invalid block header");
- return -1;
- }
- int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
- byte* compressed_block = (byte*) fp->compressed_block;
- memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
- int remaining = block_length - BLOCK_HEADER_LENGTH;
- count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
- if (count != remaining) {
- report_error(fp, "read failed");
- return -1;
- }
- count = inflate_block(fp, block_length);
- if (count < 0) {
- return -1;
- }
- if (fp->block_length != 0) {
- // Do not reset offset if this read follows a seek.
- fp->block_offset = 0;
- }
- fp->block_address = block_address;
- fp->block_length = count;
- return 0;
-}
-
-int
-bgzf_read(BGZF* fp, void* data, int length)
-{
- if (length <= 0) {
- return 0;
- }
- if (fp->open_mode != 'r') {
- report_error(fp, "file not open for reading");
- return -1;
- }
-
- int bytes_read = 0;
- byte* output = (byte*)data;
- while (bytes_read < length) {
- int available = fp->block_length - fp->block_offset;
- if (available <= 0) {
- if (read_block(fp) != 0) {
- return -1;
- }
- available = fp->block_length - fp->block_offset;
- if (available <= 0) {
- break;
- }
- }
- int copy_length = min(length-bytes_read, available);
- byte* buffer = (byte*)fp->uncompressed_block;
- memcpy(output, buffer + fp->block_offset, copy_length);
- fp->block_offset += copy_length;
- output += copy_length;
- bytes_read += copy_length;
- }
- if (fp->block_offset == fp->block_length) {
- fp->block_address = ftello(fp->file);
- fp->block_offset = 0;
- fp->block_length = 0;
- }
- return bytes_read;
-}
-
-static
-int
-flush_block(BGZF* fp)
-{
- while (fp->block_offset > 0) {
- int block_length = deflate_block(fp, fp->block_offset);
- if (block_length < 0) {
- return -1;
- }
- int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
- if (count != block_length) {
- report_error(fp, "write failed");
- return -1;
- }
- fp->block_address += block_length;
- }
- return 0;
-}
-
-int
-bgzf_write(BGZF* fp, const void* data, int length)
-{
- if (fp->open_mode != 'w') {
- report_error(fp, "file not open for writing");
- return -1;
- }
-
- if (fp->uncompressed_block == NULL) {
- fp->uncompressed_block = malloc(fp->uncompressed_block_size);
- }
-
- const byte* input = (byte*)data;
- int block_length = fp->uncompressed_block_size;
- int bytes_written = 0;
- while (bytes_written < length) {
- int copy_length = min(block_length - fp->block_offset, length - bytes_written);
- byte* buffer = (byte*)fp->uncompressed_block;
- memcpy(buffer + fp->block_offset, input, copy_length);
- fp->block_offset += copy_length;
- input += copy_length;
- bytes_written += copy_length;
- if (fp->block_offset == block_length) {
- if (flush_block(fp) != 0) {
- break;
- }
- }
- }
- return bytes_written;
-}
-
-int
-bgzf_close(BGZF* fp)
-{
- if (fp->open_mode == 'w') {
- if (flush_block(fp) != 0) {
- return -1;
- }
- if (fflush(fp->file) != 0) {
- report_error(fp, "flush failed");
- return -1;
- }
- }
- if (fp->owned_file) {
- if (fclose(fp->file) != 0) {
- return -1;
- }
- }
- free(fp->uncompressed_block);
- free(fp->compressed_block);
- free(fp);
- return 0;
-}
-
-int64_t
-bgzf_tell(BGZF* fp)
-{
- return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF));
-}
-
-int64_t
-bgzf_seek(BGZF* fp, int64_t pos, int where)
-{
- if (fp->open_mode != 'r') {
- report_error(fp, "file not open for read");
- return -1;
- }
- if (where != SEEK_SET) {
- report_error(fp, "unimplemented seek option");
- return -1;
- }
- int block_offset = pos & 0xFFFF;
- int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
- if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
- report_error(fp, "seek failed");
- return -1;
- }
- fp->block_length = 0; // indicates current block is not loaded
- fp->block_address = block_address;
- fp->block_offset = block_offset;
- return 0;
-}
-
+++ /dev/null
-/*
- * The Broad Institute
- * SOFTWARE COPYRIGHT NOTICE AGREEMENT
- * This software and its documentation are copyright 2008 by the
- * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
- *
- * This software is supplied without any warranty or guaranteed support whatsoever.
- * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
- * or functionality.
- */
-
-#ifndef __BGZF_H
-#define __BGZF_H
-
-#include <stdio.h>
-#include "zlib.h"
-
-#ifdef WIN32
-#include <io.h>
-
-typedef char int8_t;
-typedef unsigned char uint8_t;
-typedef short int16_t;
-typedef unsigned short uint16_t;
-typedef int int32_t;
-typedef unsigned int uint32_t;
-typedef long long int64_t;
-typedef unsigned long long uint64_t;
-
-#define ftello(a) _ftelli64(a)
-#define fseeko(a,b,c) _fseeki64(a,b,c)
-#define strcasecmp _stricmp
-#define open _open
-#define fdopen _fdopen
-#else
-#include <stdint.h>
-#include <stdbool.h>
-#endif
-
-typedef struct {
- int file_descriptor;
- char open_mode; // 'r' or 'w'
- bool owned_file;
- FILE* file;
- int uncompressed_block_size;
- int compressed_block_size;
- void* uncompressed_block;
- void* compressed_block;
- int64_t block_address;
- int block_length;
- int block_offset;
- const char* error;
-} BGZF;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Open an existing file descriptor for reading or writing.
- * Mode must be either "r" or "w".
- * A subsequent bgzf_close will not close the file descriptor.
- * Returns null on error.
- */
-BGZF* bgzf_fdopen(int fd, const char* __restrict mode);
-
-/*
- * Open the specified file for reading or writing.
- * Mode must be either "r" or "w".
- * Returns null on error.
- */
-BGZF* bgzf_open(const char* path, const char* __restrict mode);
-
-/*
- * Close the BGZ file and free all associated resources.
- * Does not close the underlying file descriptor if created with bgzf_fdopen.
- * Returns zero on success, -1 on error.
- */
-int bgzf_close(BGZF* fp);
-
-/*
- * Read up to length bytes from the file storing into data.
- * Returns the number of bytes actually read.
- * Returns zero on end of file.
- * Returns -1 on error.
- */
-int bgzf_read(BGZF* fp, void* data, int length);
-
-/*
- * Write length bytes from data to the file.
- * Returns the number of bytes written.
- * Returns -1 on error.
- */
-int bgzf_write(BGZF* fp, const void* data, int length);
-
-/*
- * Return a virtual file pointer to the current location in the file.
- * No interpetation of the value should be made, other than a subsequent
- * call to bgzf_seek can be used to position the file at the same point.
- * Return value is non-negative on success.
- * Returns -1 on error.
- */
-int64_t bgzf_tell(BGZF* fp);
-
-/*
- * Set the file to read from the location specified by pos, which must
- * be a value previously returned by bgzf_tell for this file (but not
- * necessarily one returned by this file handle).
- * The where argument must be SEEK_SET.
- * Seeking on a file opened for write is not supported.
- * Returns zero on success, -1 on error.
- */
-int64_t bgzf_seek(BGZF* fp, int64_t pos, int where);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif