From e38ea1095fe8f77bb1fa75a1e8c13c4b904280ae Mon Sep 17 00:00:00 2001 From: barnett Date: Mon, 18 May 2009 16:25:20 +0000 Subject: [PATCH] Major overhaul of BamReader. No longer relying on bgzf.* API. Sped up random-access. STLUtilities.h no longer needed git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@19 9efb377e-2e27-44b9-b91a-ec4abb80ed8b --- BamConversionMain.cpp | 9 +- BamReader.cpp | 866 +++++++++++++++++++++++------------------- BamReader.h | 501 ++++++++++++------------ BamReaderMain.cpp | 250 +++--------- BamWriter.h | 3 + Makefile | 41 +- README | 25 +- STLUtilities.h | 146 ------- bgzf.c | 493 ------------------------ bgzf.h | 119 ------ 10 files changed, 816 insertions(+), 1637 deletions(-) delete mode 100644 STLUtilities.h delete mode 100644 bgzf.c delete mode 100644 bgzf.h diff --git a/BamConversionMain.cpp b/BamConversionMain.cpp index 457e58e..0e6333b 100644 --- a/BamConversionMain.cpp +++ b/BamConversionMain.cpp @@ -17,12 +17,7 @@ int main(int argc, char* argv[]) { // open our BAM reader BamReader reader; - reader.SetFilename(inputFilename); - - if(!reader.Open()) { - cout << "ERROR: Unable to open the BAM file (" << inputFilename << ")." << endl; - exit(1); - } + reader.Open(inputFilename); // retrieve the SAM header text string samHeader = reader.GetHeaderText(); @@ -43,4 +38,4 @@ int main(int argc, char* argv[]) { writer.Close(); return 0; -} \ No newline at end of file +} diff --git a/BamReader.cpp b/BamReader.cpp index f1c7061..b5f15ef 100644 --- a/BamReader.cpp +++ b/BamReader.cpp @@ -1,10 +1,16 @@ -// BamReader.cpp - -// Derek Barnett -// Marth Lab, Boston College -// Last modified: 23 April 2009 +// *************************************************************************** +// BamReader (c) 2009 Derek Barnett +// Marth Lab, Deptartment of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** #include "BamReader.h" +//#include "STLUtilities.h" + +#include + #include using std::cerr; using std::endl; @@ -13,194 +19,185 @@ using std::endl; const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; const char* BamReader::CIGAR_LOOKUP = "MIDNSHP"; -BamReader::BamReader(const char* filename, const char* indexFilename) - : m_filename( (char*)filename ) - , m_indexFilename( (char*)indexFilename ) - , m_file(0) - , m_index(NULL) - , m_headerText("") - , m_isOpen(false) +// constructor +BamReader::BamReader(void) + : m_index(NULL) , m_isIndexLoaded(false) + , m_alignmentsBeginOffset(0) , m_isRegionSpecified(false) - , m_isCalculateAlignedBases(true) , m_currentRefID(0) , m_currentLeft(0) - , m_alignmentsBeginOffset(0) -{ - Open(); -} +{ } +// destructor BamReader::~BamReader(void) { - - // close file - if ( m_isOpen ) { Close(); } + m_headerText.clear(); + ClearIndex(); + Close(); } -// open BAM file -bool BamReader::Open(void) { - - if (!m_isOpen && m_filename != NULL ) { - - // open file - m_file = bam_open(m_filename, "r"); - - // get header info && index info - if ( (m_file != NULL) && LoadHeader() ) { - - // save file offset where alignments start - m_alignmentsBeginOffset = bam_tell(m_file); - - // set open flag - m_isOpen = true; - } - - // try to open (and load) index data, if index file given - if ( m_indexFilename != NULL ) { - OpenIndex(); - } - } - - return m_isOpen; +// checks BGZF block header +bool BamReader::BgzfCheckBlockHeader(char* header) { + + return (header[0] == GZIP_ID1 && + header[1] == (char)GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & FLG_FEXTRA) != 0 && + BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN && + header[12] == BGZF_ID1 && + header[13] == BGZF_ID2 && + BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN); } -bool BamReader::OpenIndex(void) { - - if ( m_indexFilename && !m_isIndexLoaded ) { - m_isIndexLoaded = LoadIndex(); - } - return m_isIndexLoaded; +// closes the BAM file +void BamReader::BgzfClose(void) { + m_BGZF.IsOpen = false; + fclose(m_BGZF.Stream); } -// close BAM file -bool BamReader::Close(void) { +// de-compresses the current block +int BamReader::BgzfInflateBlock(int blockLength) { - if (m_isOpen) { + // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)m_BGZF.CompressedBlock + 18; + zs.avail_in = blockLength - 16; + zs.next_out = (Bytef*)m_BGZF.UncompressedBlock; + zs.avail_out = m_BGZF.UncompressedBlockSize; + + int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + if (status != Z_OK) { + printf("inflateInit failed\n"); + exit(1); + } - // close file - int ret = bam_close(m_file); - - // delete index info - if ( m_index != NULL) { delete m_index; } + status = inflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + inflateEnd(&zs); + printf("inflate failed\n"); + exit(1); + } - // clear open flag - m_isOpen = false; + status = inflateEnd(&zs); + if (status != Z_OK) { + printf("inflateEnd failed\n"); + exit(1); + } - // clear index flag - m_isIndexLoaded = false; + return zs.total_out; +} - // clear region flag - m_isRegionSpecified = false; +// opens the BAM file for reading +void BamReader::BgzfOpen(const string& filename) { - // return success/fail of bam_close - return (ret == 0); - } + m_BGZF.Stream = fopen(filename.c_str(), "rb"); + if(!m_BGZF.Stream) { + cerr << "ERROR: Unable to open the BAM file " << filename << "for reading." << endl; + exit(1); + } - return true; + m_BGZF.IsOpen = true; } -// get BAM filename -const char* BamReader::Filename(void) const { - return (const char*)m_filename; -} +// reads BGZF data into buffer +unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) { -// set BAM filename -void BamReader::SetFilename(const char* filename) { - m_filename = (char*)filename; -} + if (dataLength == 0) { return 0; } -// get BAM Index filename -const char* BamReader::IndexFilename(void) const { - return (const char*)m_indexFilename; -} + char* output = data; + unsigned int numBytesRead = 0; + while (numBytesRead < dataLength) { -// set BAM Index filename -void BamReader::SetIndexFilename(const char* indexFilename) { - m_indexFilename = (char*)indexFilename; -} + int bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset; + if (bytesAvailable <= 0) { + if ( BgzfReadBlock() != 0 ) { return -1; } + bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset; + if ( bytesAvailable <= 0 ) { break; } + } -// return full header text -const string BamReader::GetHeaderText(void) const { - return m_headerText; -} + char* buffer = m_BGZF.UncompressedBlock; + int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); + memcpy(output, buffer + m_BGZF.BlockOffset, copyLength); -// return number of reference sequences in BAM file -const int BamReader::GetReferenceCount(void) const { - return m_references.size(); -} + m_BGZF.BlockOffset += copyLength; + output += copyLength; + numBytesRead += copyLength; + } -// get RefID from reference name -const int BamReader::GetRefID(string refName) const { - - vector refNames; - RefVector::const_iterator refIter = m_references.begin(); - RefVector::const_iterator refEnd = m_references.end(); - for ( ; refIter != refEnd; ++refIter) { - refNames.push_back( (*refIter).RefName ); + if ( m_BGZF.BlockOffset == m_BGZF.BlockLength ) { + m_BGZF.BlockAddress = ftello64(m_BGZF.Stream); + m_BGZF.BlockOffset = 0; + m_BGZF.BlockLength = 0; } - // return 'index-of' refName (if not found, returns refNames.size()) - return Index( refNames.begin(), refNames.end(), refName ); + return numBytesRead; } -const RefVector BamReader::GetReferenceData(void) const { - return m_references; -} +int BamReader::BgzfReadBlock(void) { -bool BamReader::Jump(int refID, unsigned int left) { + char header[BLOCK_HEADER_LENGTH]; + int64_t blockAddress = ftello(m_BGZF.Stream); - // if index available, and region is valid - if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { - m_currentRefID = refID; - m_currentLeft = left; - m_isRegionSpecified = true; - - int64_t offset = GetOffset(m_currentRefID, m_currentLeft); - if ( offset == -1 ) { return false; } - else { return ( bam_seek(m_file, offset, SEEK_SET) == 0 ); } - } - return false; -} + int count = fread(header, 1, sizeof(header), m_BGZF.Stream); + if (count == 0) { + m_BGZF.BlockLength = 0; + return 0; + } -bool BamReader::Rewind(void) { + if (count != sizeof(header)) { + printf("read block failed - count != sizeof(header)\n"); + return -1; + } - int refID = 0; - int refCount = m_references.size(); - for ( ; refID < refCount; ++refID ) { - if ( m_references.at(refID).RefHasAlignments ) { break; } - } + if (!BgzfCheckBlockHeader(header)) { + printf("read block failed - CheckBgzfBlockHeader() returned false\n"); + return -1; + } - m_currentRefID = refID; - m_currentLeft = 0; - m_isRegionSpecified = false; + int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1; + char* compressedBlock = m_BGZF.CompressedBlock; + memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH); + int remaining = blockLength - BLOCK_HEADER_LENGTH; - return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 ); -} + count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF.Stream); + if (count != remaining) { + printf("read block failed - count != remaining\n"); + return -1; + } -// get next alignment from specified region -bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { + count = BgzfInflateBlock(blockLength); + if (count < 0) { return -1; } - // try to load 'next' read - if ( LoadNextAlignment(bAlignment) ) { + if (m_BGZF.BlockLength != 0) { + m_BGZF.BlockOffset = 0; + } - // if specified region, check for region overlap - if ( m_isRegionSpecified ) { + m_BGZF.BlockAddress = blockAddress; + m_BGZF.BlockLength = count; + return 0; +} - // if overlap, return true - if ( IsOverlap(bAlignment) ) { return true; } - // if not, try the next alignment - else { return GetNextAlignment(bAlignment); } - } +// move file pointer to specified offset +bool BamReader::BgzfSeek(int64_t position) { - // not using region, valid read detected, return success - else { return true; } - } + int blockOffset = (position & 0xFFFF); + int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; + if (fseeko(m_BGZF.Stream, blockAddress, SEEK_SET) != 0) { + printf("ERROR: Unable to seek in BAM file\n"); + exit(1); + } - // no valid alignment to load - return false; + m_BGZF.BlockLength = 0; + m_BGZF.BlockAddress = blockAddress; + m_BGZF.BlockOffset = blockOffset; + return true; } -void BamReader::SetCalculateAlignedBases(bool flag) { - m_isCalculateAlignedBases = flag; +// get file position in BAM file +int64_t BamReader::BgzfTell(void) { + return ( (m_BGZF.BlockAddress << 16) | (m_BGZF.BlockOffset & 0xFFFF) ); } int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) { @@ -225,108 +222,152 @@ int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BI return i; } -uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector& cigarData) { +unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector& cigarData) { // initialize alignment end to starting position - uint32_t alignEnd = position; + unsigned int alignEnd = position; // iterate over cigar operations vector::const_iterator cigarIter = cigarData.begin(); vector::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter) { - if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') { + char cigarType = (*cigarIter).Type; + if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) { alignEnd += (*cigarIter).Length; } } return alignEnd; } -int64_t BamReader::GetOffset(int refID, unsigned int left) { +void BamReader::ClearIndex(void) { + + if ( m_index ) { + // iterate over references + vector::iterator refIter = m_index->begin(); + vector::iterator refEnd = m_index->end(); + for ( ; refIter != refEnd; ++refIter) { + RefIndex* aRef = (*refIter); + if ( aRef ) { + // clear out BAM bins + if ( aRef->first ) { + BinVector::iterator binIter = (aRef->first)->begin(); + BinVector::iterator binEnd = (aRef->first)->end(); + for ( ; binIter != binEnd; ++binIter ) { + ChunkVector* chunks = (*binIter).second; + if ( chunks ) { delete chunks; } + } + delete aRef->first; + } + // clear BAM linear offsets + if ( aRef->second ) { delete aRef->second; } + delete aRef; + } + } + delete m_index; + } +} - // make space for bins - uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); - - // returns number of bins overlapping (left, right) - // stores indices of those bins in 'bins' - int numBins = BinsFromRegion(refID, left, bins); +// closes the BAM file +void BamReader::Close(void) { + if(m_BGZF.IsOpen) { BgzfClose(); } + if(m_index) { delete m_index; m_index = NULL; } + m_isRegionSpecified = false; +} - // access index data for refID - RefIndex* refIndex = m_index->at(refID); +const string BamReader::GetHeaderText(void) const { + return m_headerText; +} - // get list of BAM bins for this reference sequence - BinVector* refBins = refIndex->first; +const int BamReader::GetReferenceCount(void) const { + return m_references.size(); +} - sort( refBins->begin(), refBins->end(), LookupKeyCompare() ); +const RefVector BamReader::GetReferenceData(void) const { + return m_references; +} - // declare ChunkVector - ChunkVector regionChunks; +const int BamReader::GetReferenceID(const string& refName) const { - // declaure LinearOffsetVector - LinearOffsetVector* linearOffsets = refIndex->second; + // retrieve names from reference data + vector refNames; + RefVector::const_iterator refIter = m_references.begin(); + RefVector::const_iterator refEnd = m_references.end(); + for ( ; refIter != refEnd; ++refIter) { + refNames.push_back( (*refIter).RefName ); + } - // some sort of linear offset vs bin index voodoo... not completely sure what's going here - uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT); + // return 'index-of' refName ( if not found, returns refNames.size() ) + return Index( refNames.begin(), refNames.end(), refName ); +} - BinVector::iterator binBegin = refBins->begin(); - BinVector::iterator binEnd = refBins->end(); +// get next alignment (from specified region, if given) +bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { - // iterate over bins overlapping region, count chunks - for (int i = 0; i < numBins; ++i) { + // if valid alignment available + if ( LoadNextAlignment(bAlignment) ) { - // look for bin with ID=bin[i] - BinVector::iterator binIter = binBegin; - - for ( ; binIter != binEnd; ++binIter ) { + // if region not specified, return success + if ( !m_isRegionSpecified ) { return true; } - // if found, increment n_off by number of chunks for each bin - if ( (*binIter).first == (uint32_t)bins[i] ) { - - // iterate over chunks in that bin - ChunkVector* binChunks = (*binIter).second; - - ChunkVector::iterator chunkIter = binChunks->begin(); - ChunkVector::iterator chunkEnd = binChunks->end(); - for ( ; chunkIter != chunkEnd; ++chunkIter) { - - // if right bound of pair is greater than min_off (linear offset value), store pair - if ( (*chunkIter).second > minOffset) { - regionChunks.push_back( (*chunkIter) ); - } - } - } + // load next alignment until region overlap is found + while ( !IsOverlap(bAlignment) ) { + // if no valid alignment available (likely EOF) return failure + if ( !LoadNextAlignment(bAlignment) ) { return false; } } - } + + // return success (alignment found that overlaps region) + return true; + } + + // no valid alignment + else { return false; } +} - // clean up temp array - free(bins); +int64_t BamReader::GetOffset(int refID, unsigned int left) { - // there should be at least 1 - if(regionChunks.size() == 0) { return -1; } + // calculate which bins overlap this region + uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); + int numBins = BinsFromRegion(refID, left, bins); - // sort chunks by start position - sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare() ); + // get bins for this reference + RefIndex* refIndex = m_index->at(refID); + BinVector* refBins = refIndex->first; - // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing - int numOffsets = regionChunks.size(); - for (int i = 1; i < numOffsets; ++i) { - if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) { - regionChunks.at(i-1).second = regionChunks.at(i).first; - } - } + // get minimum offset to consider + LinearOffsetVector* linearOffsets = refIndex->second; + uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT); + + // store offsets to beginning of alignment 'chunks' + std::vector chunkStarts; - // merge adjacent chunks - int l = 0; - for (int i = 1; i < numOffsets; ++i) { - // if adjacent, expand boundaries of (merged) chunk - if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) { - regionChunks.at(l).second = regionChunks.at(i).second; + // reference bin iterators + BinVector::const_iterator binIter; + BinVector::const_iterator binBegin = refBins->begin(); + BinVector::const_iterator binEnd = refBins->end(); + + // store all alignment 'chunk' starts for bins in this region + for (int i = 0; i < numBins; ++i ) { + binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare() ); + if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) { + ChunkVector* binChunks = (*binIter).second; + ChunkVector::const_iterator chunkIter = binChunks->begin(); + ChunkVector::const_iterator chunkEnd = binChunks->end(); + for ( ; chunkIter != chunkEnd; ++chunkIter) { + if ( (*chunkIter).second > minOffset ) { + chunkStarts.push_back( (*chunkIter).first ); + } + } } - // else, move on to next (merged) chunk index - else { regionChunks.at(++l) = regionChunks.at(i); } } - - // return beginning file offset of first chunk for region - return (int64_t)regionChunks.at(0).first; + + // clean up memory + free(bins); + + // if no alignments found + if ( chunkStarts.empty() ) { return -1; } + + // else return smallest offset for alignment starts + else { return *min_element(chunkStarts.begin(), chunkStarts.end()); } } bool BamReader::IsOverlap(BamAlignment& bAlignment) { @@ -337,92 +378,65 @@ bool BamReader::IsOverlap(BamAlignment& bAlignment) { // read starts after left boundary if ( bAlignment.Position >= m_currentLeft) { return true; } - // get alignment end - uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData); - // return whether alignment end overlaps left boundary - return ( alignEnd >= m_currentLeft ); + return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft ); } -bool BamReader::LoadHeader(void) { +bool BamReader::Jump(int refID, unsigned int left) { + + // if index available, and region is valid + if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { + m_currentRefID = refID; + m_currentLeft = left; + m_isRegionSpecified = true; + + int64_t offset = GetOffset(m_currentRefID, m_currentLeft); + if ( offset == -1 ) { return false; } + else { return BgzfSeek(offset); } + } + return false; +} +void BamReader::LoadHeaderData(void) { + // check to see if proper BAM header - char buf[4]; - if (bam_read(m_file, buf, 4) != 4) { return false; } - if (strncmp(buf, "BAM\001", 4)) { - fprintf(stderr, "wrong header type!\n"); - return false; + char buffer[4]; + if (BgzfRead(buffer, 4) != 4) { + printf("Could not read header type\n"); + exit(1); + } + if (strncmp(buffer, "BAM\001", 4)) { + printf("wrong header type!\n"); + exit(1); } // get BAM header text length - int32_t headerTextLength; - bam_read(m_file, &headerTextLength, 4); + BgzfRead(buffer, 4); + const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer); // get BAM header text char* headerText = (char*)calloc(headerTextLength + 1, 1); - bam_read(m_file, headerText, headerTextLength); + BgzfRead(headerText, headerTextLength); m_headerText = (string)((const char*)headerText); // clean up calloc-ed temp variable free(headerText); - - // get number of reference sequences - int32_t numberRefSeqs; - bam_read(m_file, &numberRefSeqs, 4); - if (numberRefSeqs == 0) { return false; } - - m_references.reserve((int)numberRefSeqs); - - // reference variables - int32_t refNameLength; - char* refName; - uint32_t refLength; - - // iterate over all references in header - for (int i = 0; i != numberRefSeqs; ++i) { - - // get length of reference name - bam_read(m_file, &refNameLength, 4); - refName = (char*)calloc(refNameLength, 1); - - // get reference name and reference sequence length - bam_read(m_file, refName, refNameLength); - bam_read(m_file, &refLength, 4); - - // store data for reference - RefData aReference; - aReference.RefName = (string)((const char*)refName); - aReference.RefLength = refLength; - m_references.push_back(aReference); - - // clean up calloc-ed temp variable - free(refName); - } - - return true; } -bool BamReader::LoadIndex(void) { - - // check to see if index file exists - FILE* indexFile; - if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) { - fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n"); - return false; - } +void BamReader::LoadIndexData(FILE* indexStream) { // see if index is valid BAM index char magic[4]; - fread(magic, 1, 4, indexFile); + fread(magic, 1, 4, indexStream); if (strncmp(magic, "BAI\1", 4)) { - fprintf(stderr, "Problem with index - wrong \'magic\' number.\n"); - fclose(indexFile); - return false; + printf("Problem with index file - invalid format.\n"); + fclose(indexStream); + exit(1); } // get number of reference sequences uint32_t numRefSeqs; - fread(&numRefSeqs, 4, 1, indexFile); + fread(&numRefSeqs, 4, 1, indexStream); // intialize BamIndex data structure m_index = new BamIndex; @@ -433,7 +447,7 @@ bool BamReader::LoadIndex(void) { // get number of bins for this reference sequence int32_t numBins; - fread(&numBins, 4, 1, indexFile); + fread(&numBins, 4, 1, indexStream); if (numBins > 0) { m_references.at(i).RefHasAlignments = true; } @@ -446,11 +460,11 @@ bool BamReader::LoadIndex(void) { // get binID uint32_t binID; - fread(&binID, 4, 1, indexFile); + fread(&binID, 4, 1, indexStream); // get number of regionChunks in this bin uint32_t numChunks; - fread(&numChunks, 4, 1, indexFile); + fread(&numChunks, 4, 1, indexStream); // intialize ChunkVector ChunkVector* regionChunks = new ChunkVector; @@ -462,22 +476,28 @@ bool BamReader::LoadIndex(void) { // get chunk boundaries (left, right) uint64_t left; uint64_t right; - fread(&left, 8, 1, indexFile); - fread(&right, 8, 1, indexFile); + fread(&left, 8, 1, indexStream); + fread(&right, 8, 1, indexStream); // save ChunkPair regionChunks->push_back( ChunkPair(left, right) ); } + // sort chunks for this bin + sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare() ); + // save binID, chunkVector for this bin bins->push_back( BamBin(binID, regionChunks) ); } + // sort bins by binID + sort(bins->begin(), bins->end(), LookupKeyCompare() ); + // load linear index for this reference sequence // get number of linear offsets int32_t numLinearOffsets; - fread(&numLinearOffsets, 4, 1, indexFile); + fread(&numLinearOffsets, 4, 1, indexStream); // intialize LinearOffsetVector LinearOffsetVector* linearOffsets = new LinearOffsetVector; @@ -487,179 +507,233 @@ bool BamReader::LoadIndex(void) { for (int j = 0; j < numLinearOffsets; ++j) { // get a linear offset uint64_t linearOffset; - fread(&linearOffset, 8, 1, indexFile); + fread(&linearOffset, 8, 1, indexStream); // store linear offset linearOffsets->push_back(linearOffset); } + // sort linear offsets + sort( linearOffsets->begin(), linearOffsets->end() ); + // store index data for that reference sequence m_index->push_back( new RefIndex(bins, linearOffsets) ); } // close index file (.bai) and return - fclose(indexFile); - return true; + fclose(indexStream); } bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) { - // check valid alignment block header data - int32_t block_len; - int32_t ret; - uint32_t x[8]; - - int32_t bytesRead = 0; - // read in the 'block length' value, make sure it's not zero - if ( (ret = bam_read(m_file, &block_len, 4)) == 0 ) { return false; } - bytesRead += 4; + char buffer[4]; + BgzfRead(buffer, 4); + const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer); + if ( blockLength == 0 ) { return false; } + + // keep track of bytes read as method progresses + int bytesRead = 4; - // read in core alignment data, make the right size of data was read - if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } + // read in core alignment data, make sure the right size of data was read + char x[BAM_CORE_SIZE]; + if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } bytesRead += BAM_CORE_SIZE; - // set BamAlignment 'core' data - bAlignment.RefID = x[0]; - bAlignment.Position = x[1]; - bAlignment.Bin = x[2]>>16; - bAlignment.MapQuality = x[2]>>8&0xff; - bAlignment.AlignmentFlag = x[3]>>16; - bAlignment.MateRefID = x[5]; - bAlignment.MatePosition = x[6]; - bAlignment.InsertSize = x[7]; - - // fetch & store often-used lengths for character data parsing - unsigned int queryNameLength = x[2]&0xff; - unsigned int numCigarOperations = x[3]&0xffff; - unsigned int querySequenceLength = x[4]; + // set BamAlignment 'core' data and character data lengths + unsigned int tempValue; + unsigned int queryNameLength; + unsigned int numCigarOperations; + unsigned int querySequenceLength; + + bAlignment.RefID = BgzfUnpackUnsignedInt(&x[0]); + bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]); + + tempValue = BgzfUnpackUnsignedInt(&x[8]); + bAlignment.Bin = tempValue >> 16; + bAlignment.MapQuality = tempValue >> 8 & 0xff; + queryNameLength = tempValue & 0xff; + + tempValue = BgzfUnpackUnsignedInt(&x[12]); + bAlignment.AlignmentFlag = tempValue >> 16; + numCigarOperations = tempValue & 0xffff; + + querySequenceLength = BgzfUnpackUnsignedInt(&x[16]); + bAlignment.MateRefID = BgzfUnpackUnsignedInt(&x[20]); + bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]); + bAlignment.InsertSize = BgzfUnpackUnsignedInt(&x[28]); + + // calculate lengths/offsets + const unsigned int dataLength = blockLength - BAM_CORE_SIZE; + const unsigned int cigarDataOffset = queryNameLength; + const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4); + const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2; + const unsigned int tagDataOffset = qualDataOffset + querySequenceLength; + const unsigned int tagDataLen = dataLength - tagDataOffset; - // get length of character data - int dataLength = block_len - BAM_CORE_SIZE; - // set up destination buffers for character data - uint8_t* allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength); - uint32_t* cigarData = (uint32_t*)(allCharData+queryNameLength); - - const unsigned int tagDataOffset = (numCigarOperations * 4) + queryNameLength + (querySequenceLength + 1) / 2 + querySequenceLength; - const unsigned int tagDataLen = dataLength - tagDataOffset; - char* tagData = ((char*)allCharData) + tagDataOffset; + char* allCharData = (char*)calloc(sizeof(char), dataLength); + uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); + char* seqData = ((char*)allCharData) + seqDataOffset; + char* qualData = ((char*)allCharData) + qualDataOffset; + char* tagData = ((char*)allCharData) + tagDataOffset; // get character data - make sure proper data size was read - if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; } + if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; } else { - + bytesRead += dataLength; - - // clear out bases, qualities, aligned bases, CIGAR, and tag data + + // clear out any previous string data + bAlignment.Name.clear(); bAlignment.QueryBases.clear(); bAlignment.Qualities.clear(); bAlignment.AlignedBases.clear(); bAlignment.CigarData.clear(); bAlignment.TagData.clear(); - + // save name bAlignment.Name = (string)((const char*)(allCharData)); - // save bases - char singleBase[2]; - uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength); - for (unsigned int i = 0; i < querySequenceLength; ++i) { - // numeric to char conversion - sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] ); - // append character data to Bases - bAlignment.QueryBases.append( (const char*)singleBase ); + // save query sequence + for (unsigned int i = 0; i < querySequenceLength; ++i) { + char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; + bAlignment.QueryBases.append( 1, singleBase ); } // save sequence length bAlignment.Length = bAlignment.QueryBases.length(); // save qualities - char singleQuality[4]; - uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2); for (unsigned int i = 0; i < querySequenceLength; ++i) { - // numeric to char conversion - sprintf( singleQuality, "%c", ( t[i]+33 ) ); - // append character data to Qualities - bAlignment.Qualities.append( (const char*)singleQuality ); + char singleQuality = (char)(qualData[i]+33); // conversion from QV to FASTQ character + bAlignment.Qualities.append( 1, singleQuality ); } // save CIGAR-related data; int k = 0; for (unsigned int i = 0; i < numCigarOperations; ++i) { - + // build CigarOp struct CigarOp op; op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; - + // save CigarOp bAlignment.CigarData.push_back(op); - - // can skip this step if user wants to ignore - if (m_isCalculateAlignedBases) { - - // build AlignedBases string - switch (op.Type) { - - case ('M') : - case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases - case ('S') : k += op.Length; // for 'S' - skip over query bases - break; - - case ('D') : - case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding; - break; - - case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence - k += op.Length; - break; - - case ('H') : break; // for 'H' - do nothing, move to next op - - default : assert(false); // shouldn't get here - break; - } + + // build AlignedBases string + switch (op.Type) { + + case ('M') : + case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases + case ('S') : k += op.Length; // for 'S' - skip over query bases + break; + + case ('D') : + case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding; + break; + + case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence + k += op.Length; + break; + + case ('H') : break; // for 'H' - do nothing, move to next op + + default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here + exit(1); } } - + // read in the tag data bAlignment.TagData.resize(tagDataLen); memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen); } + free(allCharData); + return true; +} -/* - // (optional) read tag parsing data - string tag; - char data; - int i = 0; +void BamReader::LoadReferenceData(void) { - // still data to read (auxiliary tags) - while ( bytesRead < block_len ) { + // get number of reference sequences + char buffer[4]; + BgzfRead(buffer, 4); + const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer); + if (numberRefSeqs == 0) { return; } + m_references.reserve((int)numberRefSeqs); + + // iterate over all references in header + for (unsigned int i = 0; i != numberRefSeqs; ++i) { - if ( bam_read(m_file, &data, 1) == 1 ) { - - ++bytesRead; + // get length of reference name + BgzfRead(buffer, 4); + const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer); + char* refName = (char*)calloc(refNameLength, 1); + + // get reference name and reference sequence length + BgzfRead(refName, refNameLength); + BgzfRead(buffer, 4); + const unsigned int refLength = BgzfUnpackUnsignedInt(buffer); + + // store data for reference + RefData aReference; + aReference.RefName = (string)((const char*)refName); + aReference.RefLength = refLength; + m_references.push_back(aReference); + + // clean up calloc-ed temp variable + free(refName); + } +} - if (bytesRead == block_len && data != '\0') { - fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n"); - exit(1); - } +// opens BAM file (and index) +void BamReader::Open(const string& filename, const string& indexFilename) { - tag.append(1, data); - if ( data == '\0' ) { - bAlignment.Tags.push_back(tag); - tag = ""; - i = 0; - } else { - if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); } - ++i; - } - } else { - fprintf(stderr, "ERROR: Could not read tag info.\n"); + // open the BGZF file for reading, retrieve header text & reference data + BgzfOpen(filename); + LoadHeaderData(); + LoadReferenceData(); + + // store file offset of first alignment + m_alignmentsBeginOffset = BgzfTell(); + + // open index file & load index data (if exists) + OpenIndex(indexFilename); +} + +void BamReader::OpenIndex(const string& indexFilename) { + + // if index file exists + if (!indexFilename.empty()) { + + // open index + FILE* indexStream = fopen(indexFilename.c_str(), "rb"); + + // abort on error + if(!indexStream) { + cerr << "ERROR: Unable to open the BAM index file " << indexFilename << "for reading." << endl; exit(1); } + + // build up index data structure + LoadIndexData(indexStream); } -*/ - return true; } + +bool BamReader::Rewind(void) { + + // find first reference that has alignments in the BAM file + int refID = 0; + int refCount = m_references.size(); + for ( ; refID < refCount; ++refID ) { + if ( m_references.at(refID).RefHasAlignments ) { break; } + } + + // store default bounds for first alignment + m_currentRefID = refID; + m_currentLeft = 0; + m_isRegionSpecified = false; + + // return success/failure of seek + return BgzfSeek(m_alignmentsBeginOffset); +} diff --git a/BamReader.h b/BamReader.h index e2b4598..17cb86a 100644 --- a/BamReader.h +++ b/BamReader.h @@ -1,242 +1,259 @@ -// BamReader.h - -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - Implementation of BAM-parsing was translated to C++ directly from Heng Li's SAMtools package - (thus the carryover of above MIT license) - Contact: Derek Barnett -*/ - -// Derek Barnett -// Marth Lab, Boston College -// Last modified: 23 April 2009 - -#ifndef BAMREADER_H -#define BAMREADER_H - -// custom includes -#include "BamAlignment.h" -#include "STLUtilities.h" - -// C library includes -#include -#include -#include -#include - -#ifdef WIN32 -typedef char int8_t; -typedef unsigned char uint8_t; -typedef short int16_t; -typedef unsigned short uint16_t; -typedef int int32_t; -typedef unsigned int uint32_t; -typedef long long int64_t; -typedef unsigned long long uint64_t; -#else -#include -#endif - -// BGZF library includes/defines -#include "bgzf.h" -typedef BGZF* BamFile; -#define bam_open(f_name, mode) bgzf_open(f_name, mode) -#define bam_close(f_ptr) bgzf_close(f_ptr) -#define bam_read(f_ptr, buf, size) bgzf_read(f_ptr, buf, size) -#define bam_write(f_ptr, buf, size) bgzf_write(f_ptr, buf, size) -#define bam_tell(f_ptr) bgzf_tell(f_ptr) -#define bam_seek(f_ptr, pos, dir) bgzf_seek(f_ptr, pos, dir) - -// size of alignment data block in BAM file (bytes) -#define BAM_CORE_SIZE 32 - -// BAM indexing constants -#define MAX_BIN 37450 // =(8^6-1)/7+1 -#define BAM_MIN_CHUNK_GAP 32768 -#define BAM_LIDX_SHIFT 14 - -// CIGAR-retrieval mask/shift constants -#define BAM_CIGAR_SHIFT 4 -#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) - -// CIGAR-operation types -#define BAM_CMATCH 0 -#define BAM_CINS 1 -#define BAM_CDEL 2 -#define BAM_CREF_SKIP 3 -#define BAM_CSOFT_CLIP 4 -#define BAM_CHARD_CLIP 5 -#define BAM_CPAD 6 - -// --------------------------- // -// Bam header info -// --------------------------- // - -// --------------------------- // -// BamIndex-related typedefs -// --------------------------- // - -// offset for linear indexing -typedef vector LinearOffsetVector; - -// chunk boundaries -typedef pair ChunkPair; -// list of chunks in a BAM bin -typedef vector ChunkVector; - -// BAM bins for a reference sequence -// replaces khash - uint32_t is key, ChunkVector is value -typedef pair BamBin; -typedef vector BinVector; - -// each reference sequence has a BinVector and LinearOffsetVector -typedef pair RefIndex; - -// full BamIndex defined as: -typedef vector BamIndex; - -// ---------------------------------------------------------------------------// - -class BamReader { - - public: - // constructors - BamReader(const char* fileName = NULL, const char* indexFilename = NULL); - - public: - // destructor - ~BamReader(void); - - // BAM interface methods - public: - - // ----------------------- // - // File manipulation - // ----------------------- // - - // open BAM file (automatically opens index if provided) - bool Open(void); - - // open BAM index (allows index to be opened separately - i.e. sometime after the BAM file is opened) - bool OpenIndex(void); - - // close BAM file - bool Close(void); - - // get BAM filename - const char* Filename(void) const; - - // set BAM filename - void SetFilename(const char*); - - // get BAM Index filename - const char* IndexFilename(void) const; - - // set BAM Index filename - void SetIndexFilename(const char*); - - // ----------------------- // - // Access BAM header - // ----------------------- // - - // return full header text - const string GetHeaderText(void) const; - - // --------------------------------- // - // Access reference sequence info - // --------------------------------- // - - // return number of reference sequences in BAM file - const int GetReferenceCount(void) const; - - // return vector of RefData entries - const RefVector GetReferenceData(void) const; - - // get refID from reference name - const int GetRefID(string refName) const; - - // ----------------------------------------- // - // File position moving - // ----------------------------------------- // - - // jumps to 'left' position on refID - // actually jumps before position, so reads that overlap 'left' are included as well - // 'left' defaults to reference begin if not specified - bool Jump(int refID, unsigned int left = 0); - - // Jump to beginning of BAM file, clears any region previously set by Jump() - bool Rewind(void); - - // ------------------------------ // - // Access alignments - // ------------------------------ // - - // get next alignment - bool GetNextAlignment(BamAlignment& read); - - // allow user to specifiy whether 'AlignedBases' string is calculated when alignment is loaded - void SetCalculateAlignedBases(bool); - - // internal utility methods - private: - int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]); - uint32_t CalculateAlignmentEnd(const unsigned int&, const vector&); - int64_t GetOffset(int, unsigned int); - bool IsOverlap(BamAlignment&); - bool LoadHeader(void); - bool LoadIndex(void); - bool LoadNextAlignment(BamAlignment&); - - private: - // main BAM reader components - char* m_filename; - char* m_indexFilename; - BamFile m_file; - BamIndex* m_index; - RefVector m_references; - string m_headerText; - - // state flags - bool m_isOpen; // BAM file is open for processing - bool m_isIndexLoaded; // BAM Index data is loaded and available for processing - bool m_isRegionSpecified; // a region has been specified - specifically, a user has called Jump() - bool m_isCalculateAlignedBases; // build 'AlignedBases' string when getting an alignment, otherwise skip (default = true) - - // region values - int m_currentRefID; - unsigned int m_currentLeft; - - // file offset of 1st read in BAM file - int64_t m_alignmentsBeginOffset; - - private: - // BAM character constants - static const char* DNA_LOOKUP; - static const char* CIGAR_LOOKUP; -}; - -#endif /* BAMREADER_H */ +// *************************************************************************** +// BamReader (c) 2009 Derek Barnett +// Marth Lab, Deptartment of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +using namespace std; + +#include "BamAlignment.h" + +// our zlib constants +#define GZIP_ID1 31 +#define GZIP_ID2 139 +#define CM_DEFLATE 8 +#define FLG_FEXTRA 4 +#define OS_UNKNOWN 255 +#define BGZF_XLEN 6 +#define BGZF_ID1 66 +#define BGZF_ID2 67 +#define BGZF_LEN 2 +#define GZIP_WINDOW_BITS -15 +#define Z_DEFAULT_MEM_LEVEL 8 + +// our BZGF constants +#define BLOCK_HEADER_LENGTH 18 +#define BLOCK_FOOTER_LENGTH 8 +#define MAX_BLOCK_SIZE 65536 +#define DEFAULT_BLOCK_SIZE 65536 + +// our BAM constants +#define BAM_CORE_SIZE 32 +#define BAM_CMATCH 0 +#define BAM_CINS 1 +#define BAM_CDEL 2 +#define BAM_CREF_SKIP 3 +#define BAM_CSOFT_CLIP 4 +#define BAM_CHARD_CLIP 5 +#define BAM_CPAD 6 +#define BAM_CIGAR_SHIFT 4 +#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) + +// BAM indexing constants +#define MAX_BIN 37450 // =(8^6-1)/7+1 +#define BAM_MIN_CHUNK_GAP 32768 +#define BAM_LIDX_SHIFT 14 + +// our variable sizes +#define SIZEOF_INT 4 + +// define our BZGF structure +#ifndef BGZF_DATA +#define BGZF_DATA +struct BgzfData { + unsigned int UncompressedBlockSize; + unsigned int CompressedBlockSize; + unsigned int BlockLength; + unsigned int BlockOffset; + uint64_t BlockAddress; + bool IsOpen; + FILE* Stream; + char* UncompressedBlock; + char* CompressedBlock; + + // constructor + BgzfData(void) + : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) + , CompressedBlockSize(MAX_BLOCK_SIZE) + , BlockLength(0) + , BlockOffset(0) + , BlockAddress(0) + , IsOpen(false) + , Stream(NULL) + , UncompressedBlock(NULL) + , CompressedBlock(NULL) + { + try { + CompressedBlock = new char[CompressedBlockSize]; + UncompressedBlock = new char[UncompressedBlockSize]; + } catch(bad_alloc&) { + printf("ERROR: Unable to allocate memory for our BGZF object.\n"); + exit(1); + } + } + + // destructor + ~BgzfData(void) { + if(CompressedBlock) delete [] CompressedBlock; + if(UncompressedBlock) delete [] UncompressedBlock; + } +}; +#endif // BGZF_DATA + + +// --------------------------- // +// BamIndex-related typedefs +// --------------------------- // + +// offset for linear indexing +typedef vector LinearOffsetVector; + +// alignment 'chunk' boundaries +typedef pair ChunkPair; +typedef vector ChunkVector; + +// BAM bins.. each contains (binID, alignment 'chunks') +typedef pair BamBin; +typedef vector BinVector; + +// each reference sequence has a BinVector and LinearOffsetVector +typedef pair RefIndex; + +// full BamIndex defined as: +typedef vector BamIndex; + +class BamReader { + + // constructor/destructor + public: + BamReader(void); + ~BamReader(void); + + // public interface + public: + // closes the BAM file + void Close(void); + // retrieves header text + const string GetHeaderText(void) const; + // saves the alignment to the alignment archive + bool GetNextAlignment(BamAlignment& bAlignment); + // return number of reference sequences in BAM file + const int GetReferenceCount(void) const; + // return vector of RefData entries + const RefVector GetReferenceData(void) const; + // get refID from reference name + const int GetReferenceID(const string& refName) const; + // jumps to 'left' position on refID + bool Jump(int refID, unsigned int left = 0); + // opens the BAM file + void Open(const string& filename, const string& indexFilename = ""); + // move file pointer back to first alignment + bool Rewind(void); + + // internal methods + private: + // checks BGZF block header + bool BgzfCheckBlockHeader(char* header); + // closes the BAM file + void BgzfClose(void); + // de-compresses the current block + int BgzfInflateBlock(int blockLength); + // opens the BAM file for reading + void BgzfOpen(const string& filename); + // reads BGZF data into a byte buffer + unsigned int BgzfRead(char* data, const unsigned int dataLen); + // reads BGZF block + int BgzfReadBlock(void); + // seek to position in BAM file + bool BgzfSeek(int64_t position); + // get file position in BAM file + int64_t BgzfTell(void); + // unpacks a buffer into an unsigned int + static inline unsigned int BgzfUnpackUnsignedInt(char* buffer); + // unpacks a buffer into an unsigned short + static inline unsigned short BgzfUnpackUnsignedShort(char* buffer); + // calculate bins that overlap region ( left to reference end for now ) + int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]); + // calculates alignment end position based on starting position and provided CIGAR operations + unsigned int CalculateAlignmentEnd(const unsigned int& position, const vector& cigarData); + // clear out (delete pointers in) index data structure + void ClearIndex(void); + // calculate file offset for first alignment chunk overlapping 'left' + int64_t GetOffset(int refID, unsigned int left); + // checks to see if alignment overlaps current region + bool IsOverlap(BamAlignment& bAlignment); + // retrieves header text from BAM file + void LoadHeaderData(void); + // builds BamIndex data structure from BAM index file + void LoadIndexData(FILE* indexStream); + // retrieves BAM alignment under file pointer + bool LoadNextAlignment(BamAlignment& bAlignment); + // builds reference data structure from BAM file + void LoadReferenceData(void); + // open BAM index file (if successful, loads index) + void OpenIndex(const string& indexFilename); + + // aligment file & index file data + private: + BgzfData m_BGZF; + string m_headerText; + BamIndex* m_index; + RefVector m_references; + bool m_isIndexLoaded; + int64_t m_alignmentsBeginOffset; + + // user-specified region values + private: + bool m_isRegionSpecified; + int m_currentRefID; + unsigned int m_currentLeft; + + // BAM character constants + private: + static const char* DNA_LOOKUP; + static const char* CIGAR_LOOKUP; +}; + +// unpacks a buffer into an unsigned int +inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) { + union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +// unpacks a buffer into an unsigned short +inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) { + union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +// allows sorting/searching of a vector of pairs (instead of using maps) +template +class LookupKeyCompare { + + typedef pair< Key, Value > LookupData; + typedef typename LookupData::first_type Key_t; + + public: + bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); } + bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); } + bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); } + private: + bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; } +}; + +// return index of item if found, else return container.size() +template < typename InputIterator, typename EqualityComparable > +typename iterator_traits::difference_type +Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) { + return distance(begin, find(begin, end, item)); +} + diff --git a/BamReaderMain.cpp b/BamReaderMain.cpp index 5426763..d568fed 100644 --- a/BamReaderMain.cpp +++ b/BamReaderMain.cpp @@ -2,7 +2,7 @@ // Derek Barnett // Marth Lab, Boston College -// Last modified: 6 April 2009 +// Last modified: 12 May 2009 #include "BamReader.h" @@ -32,221 +32,73 @@ int main(int argc, char* argv[]) { const char* bamFilename = argv[1]; const char* bamIndexFilename = argv[2]; - string refName; - int refID; - - int alignmentCount; - - vector refNames; - + int alignmentsRead = 0; BamAlignment bAlignment; - BamAlignmentVector alignments; - - RefVector references; - - // ------------------------------------------------------------------------------------------------------ // - // Declare BamReader & open BAM file - automatically loads up header and index file (.bai) information - // ------------------------------------------------------------------------------------------------------ // - BamReader bReader(bamFilename, bamIndexFilename); + BamReader bReader; + cerr << endl << "Opening BAM file (and index)" << endl << endl; + bReader.Open(bamFilename, bamIndexFilename); - cerr << endl; - cerr << "Opening BamReader for BAM file: " << bamFilename << " ..... "; - - if ( bReader.Open() ) { - cerr << "ok" << endl; - } - else { - cerr << "error" << endl; - exit(1); + string header = bReader.GetHeaderText(); + cerr << "Printing header text..." << endl; + if ( header.empty() ) { + cerr << "No header provided." << endl << endl; + } else { + cerr << "----------------------" << endl; + cerr << "Header Text: " << endl; + cerr << "----------------------" << endl; + cerr << header << endl << endl; } - - // ------------------------------------------------------------ // - // Find out how many reference sequences are in BAM file - // ------------------------------------------------------------ // - - references = bReader.GetReferenceData(); - cerr << endl; - cerr << "Total number of ref seqs: " << references.size() << endl; - - // ---------------------------------------------------------------------------- // - // Get the names/lengths of all the reference sequences that have alignments - // ---------------------------------------------------------------------------- // - - cerr << endl; - cerr << "All ref sequences with alignments:" << endl; + RefVector references = bReader.GetReferenceData(); + cerr << "Printing references..." << endl; + RefVector::const_iterator refIter = references.begin(); + RefVector::const_iterator refEnd = references.end(); + for ( ; refIter != refEnd; ++refIter) { + cerr << "Reference entry: " << endl; + cerr << (*refIter).RefName << endl; + cerr << (*refIter).RefLength << endl; + cerr << "Has alignments? : " << ( ((*refIter).RefHasAlignments) ? "yes" : "no" ) << endl; + } cerr << endl; + alignmentsRead = 0; + while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) { + cerr << "Alignment " << alignmentsRead << endl; + cerr << bAlignment.Name << endl; + cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl; + ++alignmentsRead; + } - if ( !references.empty() ) { - - RefVector::iterator refIter = references.begin(); - RefVector::iterator refEnd = references.end(); - - refID = 0; - // iterate over reference names, print to STDERR - for( ; refIter != refEnd; ++refIter) { - - if ( (*refIter).RefHasAlignments ) { - cerr << "ID: " << refID << endl; - cerr << "Name: " << (*refIter).RefName << endl; - cerr << "Length: " << (*refIter).RefLength << endl; - cerr << endl; + cerr << "Jumping in BAM file" << endl; + if ( bReader.Jump(1, 5000) ) { + cerr << "Jumping seems ok - getting first 10 alignments that start at or after chr2:5000" << endl; + + alignmentsRead = 0; + while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) { + if ( bAlignment.Position >= 5000 ) { + cerr << "Alignment " << alignmentsRead << endl; + cerr << bAlignment.Name << endl; + cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl; + ++alignmentsRead; } - - ++refID; } } - // --------------------------------------------------------------------------------------------- // - // Get the SAM-style header text, if available (not available in the example file shown here) - // --------------------------------------------------------------------------------------------- // - - cerr << "------------------------------------" << endl; - cerr << "SAM header text: " << endl; - - // get (SAM) header text - string header = bReader.GetHeaderText(); - - cerr << ( (header.empty()) ? "no header data" : header ) << endl; - cerr << "------------------------------------" << endl; - - // --------------------------------------------------------------------------------------------- // - // Here we start accessing alignments - // The first method shows how to iterate over all reads in a BamFile - // This method will work on any BAM file (sorted/non-sorted, with/without an index) - // --------------------------------------------------------------------------------------------- // - - // Call Rewind() to make sure you're at the 1st alignment in the file - // Please note, however, it's not necessary in this case, since file pointer initially set to 1st alignment - // but this is probably a good habit to develop to ensure correctness and make your intent obvious in the code - - cerr << "Getting (up to) first 1000 alignments" << endl; - - // start at 1st alignment + cerr << "Rewinding BAM file" << endl; if ( bReader.Rewind() ) { + cerr << "Rewind seems to be ok - getting first 10 alignments" << endl; - alignmentCount = 0; - while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) { - - // disregard unmapped alignments - if ( bAlignment.IsMapped() ) { - - ++alignmentCount; - - cout << "----------------------------" << endl; - cout << "Alignment " << alignmentCount << endl; - cout << bAlignment.Name << endl; - cout << bAlignment.AlignedBases << endl; - cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl; - - cout << "Cigar Data: " << endl; - - vector::const_iterator cigarIter = bAlignment.CigarData.begin(); - vector::const_iterator cigarEnd = bAlignment.CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter ) { - cout << "Type: " << (*cigarIter).Type << "\tLength: " << (*cigarIter).Length << endl; - } - - if(!bAlignment.TagData.empty()) { - cout << "Tag data is present." << endl; - string readGroup; - if(bAlignment.GetReadGroup(readGroup)) { - cout << "- read group: " << readGroup << endl; - } - } - } + alignmentsRead = 0; + while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) { + cerr << "Alignment " << alignmentsRead << endl; + cerr << bAlignment.Name << endl; + cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl; + ++alignmentsRead; } - - cerr << "Found " << alignmentCount << " alignments." << endl; - } else { cerr << "Could not rewind" << endl; } - - // ---------------------------------------------------------------------------------------------------------- // - // You can iterate over each individual alignment that overlaps a specified region - // Set the refID & left boundary parameters using Jump(refID, left) - // Jump() actually positions the file pointer actually before the left boundary - // This ensures that reads beginning before, but overlapping 'left' are included as well - // Client is responsible for setting/checking right boundary - see below - // ---------------------------------------------------------------------------------------------------------- // -/* - cerr << "Jumping to region" << endl; - - // get refID using a known refName - refName = "seq1"; - refID = bReader.GetRefID(refName); - if (refID == (int)references.size()) { - cerr << "Reference " << refName << " not found." << endl; - exit(1); } - // set left boundary - unsigned int leftBound = 500; - - // set right boundary - either user-specified number to set a discrete region - // OR you can query the BamReader for the end of the reference - unsigned int rightBound = references.at(refID).RefLength; - - cerr << endl; - cerr << "Iterating over alignments on reference: " << refName << " from " << leftBound << " to ref end (" << rightBound << ")" << endl; - - // set region - specific region on reference - if ( bReader.Jump(refID, leftBound) ) { - - alignmentCount = 0; - while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= rightBound) ) { - - if ( bAlignment.IsMapped() ) { - - ++alignmentCount; - - if ( (alignmentCount % 100000) == 0) { cerr << "Retrieved " << alignmentCount << " so far..." << endl; } - - cout << "----------------------------" << endl; - cout << "Alignment " << alignmentCount << endl; - cout << bAlignment.Name << endl; - cout << bAlignment.AlignedBases << endl; - cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl; - } - } - - cerr << "Found " << alignmentCount << " alignments." << endl; - } else { cerr << "Could not jump to region specified" << endl; } - - // ----------------------------------------------------------------------------------------------------- // - // You can 'rewind' back to beginning of BAM file at any time - // ----------------------------------------------------------------------------------------------------- // - - cerr << endl; - cerr << "Rewinding BAM file... then getting first 1000 alignments" << endl; - - alignmentCount = 0; - if (bReader.Rewind() ) { - while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) { - - // disregard unmapped alignments - if ( bAlignment.IsMapped() ) { - - ++alignmentCount; - - cout << "----------------------------" << endl; - cout << "Alignment " << alignmentCount << endl; - cout << bAlignment.Name << endl; - cout << bAlignment.AlignedBases << endl; - cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl; - } - } - - cerr << "Found " << alignmentCount << " alignments." << endl; - } else { cerr << "Could not rewind" << endl; } -*/ - // ---------------------------------------------------------------------- // - // Close BamReader object (releases internal header/index data) and exit - // ---------------------------------------------------------------------- // - - cerr << endl; - cerr << "Closing BAM file: " << bamFilename << endl; - + cerr << "Closing BAM file..." << endl << endl; bReader.Close(); cerr << "Exiting..." << endl << endl; diff --git a/BamWriter.h b/BamWriter.h index d375612..18f8e0a 100644 --- a/BamWriter.h +++ b/BamWriter.h @@ -57,6 +57,8 @@ using namespace std; #define SIZEOF_INT 4 // define our BZGF structure +#ifndef BGZF_DATA +#define BGZF_DATA struct BgzfData { unsigned int UncompressedBlockSize; unsigned int CompressedBlockSize; @@ -95,6 +97,7 @@ struct BgzfData { if(UncompressedBlock) delete [] UncompressedBlock; } }; +#endif // BGZF_DATA class BamWriter { public: diff --git a/Makefile b/Makefile index 8862226..40e3ecd 100644 --- a/Makefile +++ b/Makefile @@ -1,39 +1,18 @@ -CC= gcc CXX= g++ -CFLAGS= -Wall -O3 -CXXFLAGS= $(CFLAGS) -DFLAGS= -D_IOLIB=2 #-D_NDEBUG -OBJS= BamReader.o bgzf.o -PROG= BamReaderTest -INCLUDES= -ARFLAGS= -crs +CXXFLAGS= -Wall -O3 +PROG= BamReaderTest BamConversion LIBS= -lz -SUBDIRS= . -.SUFFIXES:.c .cpp .o +all: $(PROG) -.c.o: - $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ +BamReaderTest: BamReader.o BamReaderMain.o + $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamReaderMain.o $(LIBS) -.cpp.o: - $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ - -all: $(PROG) BamConversion - -lib:libbambc.a - -libbambc.a:$(OBJS) - $(AR) $(ARFLAGS) $@ $(OBJS) - -BamReaderTest:lib BamReaderMain.o - $(CXX) $(CXXFLAGS) -o $@ BamReaderMain.o $(LIBS) -L. -lbambc - -BamConversion: lib BamWriter.o BamConversionMain.o - $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamConversionMain.o $(LIBS) -L. -lbambc - -BamMerge: lib BamMerge.o - $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamMerge.o $(LIBS) -L. -lbambc +BamConversion: BamReader.o BamWriter.o BamConversionMain.o + $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamConversionMain.o $(LIBS) +BamMerge: BamReader.o BamWriter.o BamMerge.o + $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamMerge.o $(LIBS) clean: - rm -fr gmon.out *.o *.a a.out $(PROG) BamConversion *~ + rm -fr gmon.out *.o *.a a.out *~ diff --git a/README b/README index 639bfd1..9553d1a 100644 --- a/README +++ b/README @@ -2,10 +2,27 @@ README ---------------- -Copy the 4 header files (*.h) and the library file (libbambc.a) to your directory. -When you compile your program, add this to the g++ command: +Significant re-write of BamReader API as of 5/18/2009: +------------------------------------------------------- +No longer using bgzf.h/.c +No longer using libbambc.a library +No longer using STL Utilities +Minor changes to API syntax in opening BAM files with BamReader - see BamReader.h for details - -lz -L. -lbambc -and you should be good to go. (Those are lower case l's, as in 'lion'.) +Compile instructions +---------------------- +Short version: +See Makefile for compilation example. + +Longer version: +Copy BamAlignment.h, BamReader.* and/or BamWriter.* to working directory. +#include "BamReader.h" and/or "BamWriter.h" +Use standard compile commands, including the "-lz" parameter to access zlib functionality. +(Note that "-L. -lbambc" are no longer necessary and WILL CAUSE PROBLEMS for compiler/linker ) + + +Feel free to email me or pop by my desk with any questions, comments, suggestions, etc. + - Derek + diff --git a/STLUtilities.h b/STLUtilities.h deleted file mode 100644 index 7a9fbc5..0000000 --- a/STLUtilities.h +++ /dev/null @@ -1,146 +0,0 @@ -// STLUtilities.h - -// Derek Barnett -// Marth Lab, Boston College -// Last modified: 19 March 2009 - -#ifndef STL_UTILITIES_H -#define STL_UTILITIES_H - -#include -using std::distance; -using std::find; -using std::transform; - -#include -using std::unary_function; - -#include -using std::iterator_traits; - -#include -using std::string; - -#include -using std::pair; - -#include -using std::vector; - -// -------------------------------------------------------------------------------- // -// This file contains a sample of STL tricks I've gathered along the way. -// -// Many thanks throughout to 'Effective' series by Scott Meyers. -// Some code is lifted (almost) verbatim from his texts where applicable. -// ------------------------------------------------------------------------------- // - -// --------------------------------------------------------------------------------------------------------------------------------- // -// STL containers will delete the values they hold when the container is deleted or goes out of scope -// *** If the container contains pointers, however, they WILL NOT delete the actual pointed-to objects *** -// This struct allows for easy algorithm processing (instead of pesky loops and iterator-boundary issues) to delete pointed-to objects (for any C++ type) that have been allocated with 'new') -// Usage example: for_each( container.begin(), container.end(), DeleteObject() ) -// -// Unless of course, you like quirky, hard-to-spot memory leaks... then feel free to disregard this little STL lesson =) -// --------------------------------------------------------------------------------------------------------------------------------- // -struct DeleteObject { - template - void operator() (const T* p) const { - delete p; - } -}; - -template -void ClearPointerVector(vector& v) { - if ( !v.empty() ) { - for_each(v.begin(), v.end(), DeleteObject()); - v.clear(); - } -} - -// --------------------------------------------------------------------------------------------------------------------------------- // -// Query a vector (other containers? havent tried) for an element -// Returns the index of that element -// Returns vector::size() if not found -// Works with reverse iterators as well... index is counted backward from last element though -// --------------------------------------------------------------------------------------------------------------------------------- // -template < typename InputIterator, typename EqualityComparable > -typename iterator_traits::difference_type -Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) { - return distance(begin, find(begin, end, item)); -} - -// ----------------------------------------------------------------------------------------------------------------------// -// This next section is a sort of work-around for the bulky associative maps -// The idea is to use a *SORTED* vector of pair objects as a lookup vector -// LookupVector = vector< pair > -// ----------------------------------------------------------------------------------------------------------------------// - -// The following template classes allow a templatized comparison function for sorting & searching lookup vector - -// allows sorting by Key ( less ) -template -class LookupKeyCompare { - - typedef pair< Key, Value > LookupData; - typedef typename LookupData::first_type Key_t; - - public: - bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); } - bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); } - bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); } - private: - bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; } -}; - -// allows sorting by Value ( less ) -template -class LookupValueCompare { - - typedef pair< Key, Value > LookupData; - typedef typename LookupData::second_type Value_t; - - public: - bool operator() (const LookupData& lhs, const LookupData& rhs) const { return valueLess(lhs.second, rhs.second); } - bool operator() (const LookupData& lhs, const Value_t& k) const { return valueLess(lhs.second, k); } - bool operator() (const Value_t& k, const LookupData& rhs) const { return valueLess(k, rhs.second); } - private: - bool valueLess(const Value_t& k1, const Value_t& k2) const { return k1 < k2; } -}; - -// The following template functions/structs allow you to pull all keys or all values from this lookup structure - -// pull Key from a data pair -template -struct RipKey: - public unary_function< pair, Key> { - Key operator() (pair dataPair) { - return dataPair.first; - } -}; - -// pull Value from a data pair -template -struct RipValue: - public unary_function< pair, Value> { - Value operator() (pair dataPair) { - return dataPair.second; - } -}; - -// pull all Keys from lookup, store in dest (overwrite contents of dest) -template -void RipKeys( vector< pair >& lookup, vector& dest) { - dest.clear(); - dest.reserve(lookup.size()); - transform(lookup.begin(), lookup.end(), back_inserter(dest), RipKey()); -} - -// pull all Values from lookup, store in dest (overwrite contents of dest) -template -void RipValues( vector< pair >& lookup, vector& dest) { - dest.clear(); - dest.reserve(lookup.size()); - transform(lookup.begin(), lookup.end(), back_inserter(dest), RipValue()); -} - -#endif /* STL_UTILITIES_H */ diff --git a/bgzf.c b/bgzf.c deleted file mode 100644 index 6481ee3..0000000 --- a/bgzf.c +++ /dev/null @@ -1,493 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ - -#include -#include -#include - -#include -#include -#include -#include "bgzf.h" - -#ifdef WIN32 - -#else -#include -extern off_t ftello(FILE *stream); -extern int fseeko(FILE *stream, off_t offset, int whence); -#endif - -typedef int8_t byte; - -static const int DEFAULT_BLOCK_SIZE = 64 * 1024; -static const int MAX_BLOCK_SIZE = 64 * 1024; - -static const int BLOCK_HEADER_LENGTH = 18; -static const int BLOCK_FOOTER_LENGTH = 8; - -static const int GZIP_ID1 = 31; -static const int GZIP_ID2 = 139; -static const int CM_DEFLATE = 8; -static const int FLG_FEXTRA = 4; -static const int OS_UNKNOWN = 255; -static const int BGZF_ID1 = 66; // 'B' -static const int BGZF_ID2 = 67; // 'C' -static const int BGZF_LEN = 2; -static const int BGZF_XLEN = 6; // BGZF_LEN+4 - -static const int GZIP_WINDOW_BITS = -15; // no zlib header -static const int Z_DEFAULT_MEM_LEVEL = 8; - - -inline -void -packInt16(uint8_t* buffer, uint16_t value) -{ - buffer[0] = value; - buffer[1] = value >> 8; -} - -inline -int -unpackInt16(const uint8_t* buffer) -{ - return (buffer[0] | (buffer[1] << 8)); -} - -inline -void -packInt32(uint8_t* buffer, uint32_t value) -{ - buffer[0] = value; - buffer[1] = value >> 8; - buffer[2] = value >> 16; - buffer[3] = value >> 24; -} - -inline -int -min(int x, int y) -{ - return (x < y) ? x : y; -} - -static -void -report_error(BGZF* fp, const char* message) { - fp->error = message; -} - -static -BGZF* -open_read(int fd) -{ - FILE* file = fdopen(fd, "r"); - BGZF* fp = (BGZF*)malloc(sizeof(BGZF)); - fp->file_descriptor = fd; - fp->open_mode = 'r'; - fp->owned_file = 0; - fp->file = file; - fp->uncompressed_block_size = MAX_BLOCK_SIZE; - fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); - fp->compressed_block_size = MAX_BLOCK_SIZE; - fp->compressed_block = malloc(MAX_BLOCK_SIZE); - fp->block_address = 0; - fp->block_offset = 0; - fp->block_length = 0; - fp->error = NULL; - return fp; -} - -static -BGZF* -open_write(int fd) -{ - FILE* file = fdopen(fd, "w"); - BGZF* fp = (BGZF*)malloc(sizeof(BGZF)); - fp->file_descriptor = fd; - fp->open_mode = 'w'; - fp->owned_file = 0; - fp->file = file; - fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; - fp->uncompressed_block = NULL; - fp->compressed_block_size = MAX_BLOCK_SIZE; - fp->compressed_block = malloc(MAX_BLOCK_SIZE); - fp->block_address = 0; - fp->block_offset = 0; - fp->block_length = 0; - fp->error = NULL; - return fp; -} - -BGZF* -bgzf_open(const char* __restrict path, const char* __restrict mode) -{ - BGZF* fp = NULL; - if (strcasecmp(mode, "r") == 0) { - int oflag = O_RDONLY; - int fd = open(path, oflag); - fp = open_read(fd); - } else if (strcasecmp(mode, "w") == 0) { - int oflag = O_WRONLY | O_CREAT | O_TRUNC; - int fd = open(path, oflag, 0644); - fp = open_write(fd); - } - if (fp != NULL) { - fp->owned_file = 1; - } - return fp; -} - -BGZF* -bgzf_fdopen(int fd, const char * __restrict mode) -{ - if (strcasecmp(mode, "r") == 0) { - return open_read(fd); - } else if (strcasecmp(mode, "w") == 0) { - return open_write(fd); - } else { - return NULL; - } -} - -static -int -deflate_block(BGZF* fp, int block_length) -{ - // Deflate the block in fp->uncompressed_block into fp->compressed_block. - // Also adds an extra field that stores the compressed block length. - - byte* buffer = (byte*)fp->compressed_block; - int buffer_size = fp->compressed_block_size; - - // Init gzip header - buffer[0] = GZIP_ID1; - buffer[1] = GZIP_ID2; - buffer[2] = CM_DEFLATE; - buffer[3] = FLG_FEXTRA; - buffer[4] = 0; // mtime - buffer[5] = 0; - buffer[6] = 0; - buffer[7] = 0; - buffer[8] = 0; - buffer[9] = OS_UNKNOWN; - buffer[10] = BGZF_XLEN; - buffer[11] = 0; - buffer[12] = BGZF_ID1; - buffer[13] = BGZF_ID2; - buffer[14] = BGZF_LEN; - buffer[15] = 0; - buffer[16] = 0; // placeholder for block length - buffer[17] = 0; - - // loop to retry for blocks that do not compress enough - int input_length = block_length; - int compressed_length = 0; - while (1) { - - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)fp->uncompressed_block; - zs.avail_in = input_length; - zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH]; - zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - - int status = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, - GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); - if (status != Z_OK) { - report_error(fp, "deflate init failed"); - return -1; - } - status = deflate(&zs, Z_FINISH); - if (status != Z_STREAM_END) { - deflateEnd(&zs); - if (status == Z_OK) { - // Not enough space in buffer. - // Can happen in the rare case the input doesn't compress enough. - // Reduce the amount of input until it fits. - input_length -= 1024; - if (input_length <= 0) { - // should never happen - report_error(fp, "input reduction failed"); - return -1; - } - continue; - } - report_error(fp, "deflate failed"); - return -1; - } - status = deflateEnd(&zs); - if (status != Z_OK) { - report_error(fp, "deflate end failed"); - return -1; - } - compressed_length = zs.total_out; - compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; - if (compressed_length > MAX_BLOCK_SIZE) { - // should never happen - report_error(fp, "deflate overflow"); - return -1; - } - break; - } - - packInt16((uint8_t*)&buffer[16], compressed_length-1); - uint32_t crc = crc32(0L, NULL, 0L); - crc = crc32(crc, (Bytef*)fp->uncompressed_block, input_length); - packInt32((uint8_t*)&buffer[compressed_length-8], crc); - packInt32((uint8_t*)&buffer[compressed_length-4], input_length); - - int remaining = block_length - input_length; - if (remaining > 0) { - if (remaining > input_length) { - // should never happen (check so we can use memcpy) - report_error(fp, "remainder too large"); - return -1; - } - memcpy(fp->uncompressed_block, - (char*)fp->uncompressed_block + input_length, - remaining); - } - fp->block_offset = remaining; - return compressed_length; -} - -static -int -inflate_block(BGZF* fp, int block_length) -{ - // Inflate the block in fp->compressed_block into fp->uncompressed_block - - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)fp->compressed_block + 18; - zs.avail_in = block_length - 16; - zs.next_out = (Bytef*)fp->uncompressed_block; - zs.avail_out = fp->uncompressed_block_size; - - int status = inflateInit2(&zs, GZIP_WINDOW_BITS); - if (status != Z_OK) { - report_error(fp, "inflate init failed"); - return -1; - } - status = inflate(&zs, Z_FINISH); - if (status != Z_STREAM_END) { - inflateEnd(&zs); - report_error(fp, "inflate failed"); - return -1; - } - status = inflateEnd(&zs); - if (status != Z_OK) { - report_error(fp, "inflate failed"); - return -1; - } - return zs.total_out; -} - -static -int -check_header(const byte* header) -{ - return (header[0] == GZIP_ID1 && - header[1] == (byte) GZIP_ID2 && - header[2] == Z_DEFLATED && - (header[3] & FLG_FEXTRA) != 0 && - unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && - header[12] == BGZF_ID1 && - header[13] == BGZF_ID2 && - unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); -} - -static -int -read_block(BGZF* fp) -{ - byte header[BLOCK_HEADER_LENGTH]; - int64_t block_address = ftello(fp->file); - int count = fread(header, 1, sizeof(header), fp->file); - if (count == 0) { - fp->block_length = 0; - return 0; - } - if (count != sizeof(header)) { - report_error(fp, "read failed"); - return -1; - } - if (!check_header(header)) { - report_error(fp, "invalid block header"); - return -1; - } - int block_length = unpackInt16((uint8_t*)&header[16]) + 1; - byte* compressed_block = (byte*) fp->compressed_block; - memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); - int remaining = block_length - BLOCK_HEADER_LENGTH; - count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); - if (count != remaining) { - report_error(fp, "read failed"); - return -1; - } - count = inflate_block(fp, block_length); - if (count < 0) { - return -1; - } - if (fp->block_length != 0) { - // Do not reset offset if this read follows a seek. - fp->block_offset = 0; - } - fp->block_address = block_address; - fp->block_length = count; - return 0; -} - -int -bgzf_read(BGZF* fp, void* data, int length) -{ - if (length <= 0) { - return 0; - } - if (fp->open_mode != 'r') { - report_error(fp, "file not open for reading"); - return -1; - } - - int bytes_read = 0; - byte* output = (byte*)data; - while (bytes_read < length) { - int available = fp->block_length - fp->block_offset; - if (available <= 0) { - if (read_block(fp) != 0) { - return -1; - } - available = fp->block_length - fp->block_offset; - if (available <= 0) { - break; - } - } - int copy_length = min(length-bytes_read, available); - byte* buffer = (byte*)fp->uncompressed_block; - memcpy(output, buffer + fp->block_offset, copy_length); - fp->block_offset += copy_length; - output += copy_length; - bytes_read += copy_length; - } - if (fp->block_offset == fp->block_length) { - fp->block_address = ftello(fp->file); - fp->block_offset = 0; - fp->block_length = 0; - } - return bytes_read; -} - -static -int -flush_block(BGZF* fp) -{ - while (fp->block_offset > 0) { - int block_length = deflate_block(fp, fp->block_offset); - if (block_length < 0) { - return -1; - } - int count = fwrite(fp->compressed_block, 1, block_length, fp->file); - if (count != block_length) { - report_error(fp, "write failed"); - return -1; - } - fp->block_address += block_length; - } - return 0; -} - -int -bgzf_write(BGZF* fp, const void* data, int length) -{ - if (fp->open_mode != 'w') { - report_error(fp, "file not open for writing"); - return -1; - } - - if (fp->uncompressed_block == NULL) { - fp->uncompressed_block = malloc(fp->uncompressed_block_size); - } - - const byte* input = (byte*)data; - int block_length = fp->uncompressed_block_size; - int bytes_written = 0; - while (bytes_written < length) { - int copy_length = min(block_length - fp->block_offset, length - bytes_written); - byte* buffer = (byte*)fp->uncompressed_block; - memcpy(buffer + fp->block_offset, input, copy_length); - fp->block_offset += copy_length; - input += copy_length; - bytes_written += copy_length; - if (fp->block_offset == block_length) { - if (flush_block(fp) != 0) { - break; - } - } - } - return bytes_written; -} - -int -bgzf_close(BGZF* fp) -{ - if (fp->open_mode == 'w') { - if (flush_block(fp) != 0) { - return -1; - } - if (fflush(fp->file) != 0) { - report_error(fp, "flush failed"); - return -1; - } - } - if (fp->owned_file) { - if (fclose(fp->file) != 0) { - return -1; - } - } - free(fp->uncompressed_block); - free(fp->compressed_block); - free(fp); - return 0; -} - -int64_t -bgzf_tell(BGZF* fp) -{ - return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)); -} - -int64_t -bgzf_seek(BGZF* fp, int64_t pos, int where) -{ - if (fp->open_mode != 'r') { - report_error(fp, "file not open for read"); - return -1; - } - if (where != SEEK_SET) { - report_error(fp, "unimplemented seek option"); - return -1; - } - int block_offset = pos & 0xFFFF; - int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; - if (fseeko(fp->file, block_address, SEEK_SET) != 0) { - report_error(fp, "seek failed"); - return -1; - } - fp->block_length = 0; // indicates current block is not loaded - fp->block_address = block_address; - fp->block_offset = block_offset; - return 0; -} - diff --git a/bgzf.h b/bgzf.h deleted file mode 100644 index 080db1d..0000000 --- a/bgzf.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ - -#ifndef __BGZF_H -#define __BGZF_H - -#include -#include "zlib.h" - -#ifdef WIN32 -#include - -typedef char int8_t; -typedef unsigned char uint8_t; -typedef short int16_t; -typedef unsigned short uint16_t; -typedef int int32_t; -typedef unsigned int uint32_t; -typedef long long int64_t; -typedef unsigned long long uint64_t; - -#define ftello(a) _ftelli64(a) -#define fseeko(a,b,c) _fseeki64(a,b,c) -#define strcasecmp _stricmp -#define open _open -#define fdopen _fdopen -#else -#include -#include -#endif - -typedef struct { - int file_descriptor; - char open_mode; // 'r' or 'w' - bool owned_file; - FILE* file; - int uncompressed_block_size; - int compressed_block_size; - void* uncompressed_block; - void* compressed_block; - int64_t block_address; - int block_length; - int block_offset; - const char* error; -} BGZF; - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Open an existing file descriptor for reading or writing. - * Mode must be either "r" or "w". - * A subsequent bgzf_close will not close the file descriptor. - * Returns null on error. - */ -BGZF* bgzf_fdopen(int fd, const char* __restrict mode); - -/* - * Open the specified file for reading or writing. - * Mode must be either "r" or "w". - * Returns null on error. - */ -BGZF* bgzf_open(const char* path, const char* __restrict mode); - -/* - * Close the BGZ file and free all associated resources. - * Does not close the underlying file descriptor if created with bgzf_fdopen. - * Returns zero on success, -1 on error. - */ -int bgzf_close(BGZF* fp); - -/* - * Read up to length bytes from the file storing into data. - * Returns the number of bytes actually read. - * Returns zero on end of file. - * Returns -1 on error. - */ -int bgzf_read(BGZF* fp, void* data, int length); - -/* - * Write length bytes from data to the file. - * Returns the number of bytes written. - * Returns -1 on error. - */ -int bgzf_write(BGZF* fp, const void* data, int length); - -/* - * Return a virtual file pointer to the current location in the file. - * No interpetation of the value should be made, other than a subsequent - * call to bgzf_seek can be used to position the file at the same point. - * Return value is non-negative on success. - * Returns -1 on error. - */ -int64_t bgzf_tell(BGZF* fp); - -/* - * Set the file to read from the location specified by pos, which must - * be a value previously returned by bgzf_tell for this file (but not - * necessarily one returned by this file handle). - * The where argument must be SEEK_SET. - * Seeking on a file opened for write is not supported. - * Returns zero on success, -1 on error. - */ -int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); - -#ifdef __cplusplus -} -#endif - -#endif -- 2.39.2