]> git.donarmstrong.com Git - bamtools.git/commitdiff
Major overhaul of BamReader. No longer relying on bgzf.* API. Sped up random-access...
authorbarnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Mon, 18 May 2009 16:25:20 +0000 (16:25 +0000)
committerbarnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Mon, 18 May 2009 16:25:20 +0000 (16:25 +0000)
git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@19 9efb377e-2e27-44b9-b91a-ec4abb80ed8b

BamConversionMain.cpp
BamReader.cpp
BamReader.h
BamReaderMain.cpp
BamWriter.h
Makefile
README
STLUtilities.h [deleted file]
bgzf.c [deleted file]
bgzf.h [deleted file]

index 457e58e36749bf4514779a3d0759e6ff25539643..0e6333b4cf5bee061335a428e2a89600b1f156be 100644 (file)
@@ -17,12 +17,7 @@ int main(int argc, char* argv[]) {
 
        // open our BAM reader
        BamReader reader;
-       reader.SetFilename(inputFilename);
-
-       if(!reader.Open()) {
-               cout << "ERROR: Unable to open the BAM file (" << inputFilename << ")." << endl;
-               exit(1);
-       }
+       reader.Open(inputFilename);
 
        // retrieve the SAM header text
        string samHeader = reader.GetHeaderText();
@@ -43,4 +38,4 @@ int main(int argc, char* argv[]) {
        writer.Close();
 
        return 0;
-}
\ No newline at end of file
+}
index f1c70611f3cc7c900b5301db92a43020ab518b97..b5f15ef33a1e65d40b84bb1a9a6de5dc959de0a6 100644 (file)
@@ -1,10 +1,16 @@
-// BamReader.cpp
-
-// Derek Barnett
-// Marth Lab, Boston College
-// Last modified: 23 April 2009
+// ***************************************************************************
+// BamReader (c) 2009 Derek Barnett
+// Marth Lab, Deptartment of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
 
 #include "BamReader.h"
+//#include "STLUtilities.h"
+
+#include <cstring>
+
 #include <iostream>
 using std::cerr;
 using std::endl;
@@ -13,194 +19,185 @@ using std::endl;
 const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";
 const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
 
-BamReader::BamReader(const char* filename, const char* indexFilename) 
-       : m_filename( (char*)filename )
-       , m_indexFilename( (char*)indexFilename )
-       , m_file(0)
-       , m_index(NULL)
-       , m_headerText("")
-       , m_isOpen(false)
+// constructor
+BamReader::BamReader(void)
+       : m_index(NULL)
        , m_isIndexLoaded(false)
+       , m_alignmentsBeginOffset(0)
        , m_isRegionSpecified(false)
-       , m_isCalculateAlignedBases(true)
        , m_currentRefID(0)
        , m_currentLeft(0)
-       , m_alignmentsBeginOffset(0)
-{
-       Open();
-}
+{ }
 
+// destructor
 BamReader::~BamReader(void) {
-
-       // close file
-       if ( m_isOpen ) { Close(); }
+       m_headerText.clear();
+       ClearIndex();
+       Close();
 }
 
-// open BAM file
-bool BamReader::Open(void) {
-
-       if (!m_isOpen && m_filename != NULL ) {
-
-               // open file
-               m_file = bam_open(m_filename, "r"); 
-               
-               // get header info && index info
-               if ( (m_file != NULL) && LoadHeader() ) {
-
-                       // save file offset where alignments start
-                       m_alignmentsBeginOffset = bam_tell(m_file);
-                       
-                       // set open flag
-                       m_isOpen = true;
-               }
-
-               // try to open (and load) index data, if index file given
-               if ( m_indexFilename != NULL ) {
-                       OpenIndex();
-               }
-       }
-
-       return m_isOpen;
+// checks BGZF block header
+bool BamReader::BgzfCheckBlockHeader(char* header) {
+
+       return (header[0] == GZIP_ID1 &&
+            header[1] == (char)GZIP_ID2 &&
+            header[2] == Z_DEFLATED &&
+            (header[3] & FLG_FEXTRA) != 0 &&
+            BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
+            header[12] == BGZF_ID1 &&
+            header[13] == BGZF_ID2 &&
+            BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN);
 }
 
-bool BamReader::OpenIndex(void) {
-
-       if ( m_indexFilename && !m_isIndexLoaded ) {
-               m_isIndexLoaded = LoadIndex();
-       }
-       return m_isIndexLoaded;
+// closes the BAM file
+void BamReader::BgzfClose(void) {
+       m_BGZF.IsOpen = false;
+       fclose(m_BGZF.Stream);
 }
 
-// close BAM file
-bool BamReader::Close(void) {
+// de-compresses the current block
+int BamReader::BgzfInflateBlock(int blockLength) {
        
-       if (m_isOpen) {
+       // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
+    z_stream zs;
+    zs.zalloc    = NULL;
+    zs.zfree     = NULL;
+    zs.next_in   = (Bytef*)m_BGZF.CompressedBlock + 18;
+    zs.avail_in  = blockLength - 16;
+    zs.next_out  = (Bytef*)m_BGZF.UncompressedBlock;
+    zs.avail_out = m_BGZF.UncompressedBlockSize;
+
+    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+    if (status != Z_OK) {
+        printf("inflateInit failed\n");
+        exit(1);
+    }
 
-               // close file
-               int ret = bam_close(m_file);
-               
-               // delete index info
-               if ( m_index != NULL) { delete m_index; }
+    status = inflate(&zs, Z_FINISH);
+    if (status != Z_STREAM_END) {
+        inflateEnd(&zs);
+        printf("inflate failed\n");
+        exit(1);
+    }
 
-               // clear open flag
-               m_isOpen = false;
+    status = inflateEnd(&zs);
+    if (status != Z_OK) {
+        printf("inflateEnd failed\n");
+        exit(1);
+    }
 
-               // clear index flag
-               m_isIndexLoaded = false;
+    return zs.total_out;
+}
 
-               // clear region flag
-               m_isRegionSpecified = false;
+// opens the BAM file for reading
+void BamReader::BgzfOpen(const string& filename) {
 
-               // return success/fail of bam_close
-               return (ret == 0);
-       } 
+       m_BGZF.Stream = fopen(filename.c_str(), "rb");
+       if(!m_BGZF.Stream) {
+               cerr << "ERROR: Unable to open the BAM file " << filename << "for reading." << endl;
+               exit(1);
+       }
 
-       return true;
+       m_BGZF.IsOpen = true;
 }
 
-// get BAM filename
-const char* BamReader::Filename(void) const { 
-       return (const char*)m_filename; 
-}
+// reads BGZF data into buffer
+unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) {
 
-// set BAM filename
-void BamReader::SetFilename(const char* filename) {
-       m_filename = (char*)filename;
-}
+    if (dataLength == 0) { return 0; }
 
-// get BAM Index filename
-const char* BamReader::IndexFilename(void) const { 
-       return (const char*)m_indexFilename; 
-}
+       char* output = data;
+    unsigned int numBytesRead = 0;
+    while (numBytesRead < dataLength) {
 
-// set BAM Index filename
-void BamReader::SetIndexFilename(const char* indexFilename) {
-       m_indexFilename = (char*)indexFilename;
-}
+        int bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
+        if (bytesAvailable <= 0) {
+            if ( BgzfReadBlock() != 0 ) { return -1; }
+            bytesAvailable = m_BGZF.BlockLength - m_BGZF.BlockOffset;
+            if ( bytesAvailable <= 0 ) { break; }
+        }
 
-// return full header text
-const string BamReader::GetHeaderText(void) const { 
-       return m_headerText; 
-}
+               char* buffer   = m_BGZF.UncompressedBlock;
+        int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
+        memcpy(output, buffer + m_BGZF.BlockOffset, copyLength);
 
-// return number of reference sequences in BAM file
-const int BamReader::GetReferenceCount(void) const { 
-       return m_references.size();
-}
+        m_BGZF.BlockOffset += copyLength;
+        output             += copyLength;
+        numBytesRead       += copyLength;
+    }
 
-// get RefID from reference name
-const int BamReader::GetRefID(string refName) const { 
-       
-       vector<string> refNames;
-       RefVector::const_iterator refIter = m_references.begin();
-    RefVector::const_iterator refEnd  = m_references.end();
-    for ( ; refIter != refEnd; ++refIter) {
-               refNames.push_back( (*refIter).RefName );
+    if ( m_BGZF.BlockOffset == m_BGZF.BlockLength ) {
+               m_BGZF.BlockAddress = ftello64(m_BGZF.Stream);                                          
+        m_BGZF.BlockOffset  = 0;
+        m_BGZF.BlockLength  = 0;
     }
 
-       // return 'index-of' refName (if not found, returns refNames.size())
-       return Index( refNames.begin(), refNames.end(), refName );
+       return numBytesRead;
 }
 
-const RefVector BamReader::GetReferenceData(void) const {
-       return m_references;
-}
+int BamReader::BgzfReadBlock(void) {
 
-bool BamReader::Jump(int refID, unsigned int left) {
+    char    header[BLOCK_HEADER_LENGTH];
+    int64_t blockAddress = ftello(m_BGZF.Stream);
 
-       // if index available, and region is valid
-       if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { 
-               m_currentRefID = refID;
-               m_currentLeft  = left;
-               m_isRegionSpecified = true;
-               
-               int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
-               if ( offset == -1 ) { return false; }
-               else { return ( bam_seek(m_file, offset, SEEK_SET) == 0 ); }
-       }
-       return false;
-}
+    int count = fread(header, 1, sizeof(header), m_BGZF.Stream);
+       if (count == 0) {
+        m_BGZF.BlockLength = 0;
+        return 0;
+    }
 
-bool BamReader::Rewind(void) {
+    if (count != sizeof(header)) {
+        printf("read block failed - count != sizeof(header)\n");
+        return -1;
+    }
 
-       int refID = 0;
-       int refCount = m_references.size();
-       for ( ; refID < refCount; ++refID ) {
-               if ( m_references.at(refID).RefHasAlignments ) { break; } 
-       }
+    if (!BgzfCheckBlockHeader(header)) {
+        printf("read block failed - CheckBgzfBlockHeader() returned false\n");
+        return -1;
+    }
 
-       m_currentRefID = refID;
-       m_currentLeft = 0;
-       m_isRegionSpecified = false;
+    int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1;
+    char* compressedBlock = m_BGZF.CompressedBlock;
+    memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
+    int remaining = blockLength - BLOCK_HEADER_LENGTH;
 
-       return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );
-}      
+    count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF.Stream);
+    if (count != remaining) {
+        printf("read block failed - count != remaining\n");
+        return -1;
+    }
 
-// get next alignment from specified region
-bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
+    count = BgzfInflateBlock(blockLength);
+    if (count < 0) { return -1; }
 
-       // try to load 'next' read
-       if ( LoadNextAlignment(bAlignment) ) {
+    if (m_BGZF.BlockLength != 0) {
+        m_BGZF.BlockOffset = 0;
+    }
 
-               // if specified region, check for region overlap
-               if ( m_isRegionSpecified ) {
+    m_BGZF.BlockAddress = blockAddress;
+    m_BGZF.BlockLength  = count;
+    return 0;
+}
 
-                       // if overlap, return true
-                       if ( IsOverlap(bAlignment) ) { return true; }
-                       // if not, try the next alignment
-                       else { return GetNextAlignment(bAlignment); }
-               } 
+// move file pointer to specified offset
+bool BamReader::BgzfSeek(int64_t position) {
 
-               // not using region, valid read detected, return success
-               else { return true; }
-       }
+       int     blockOffset  = (position & 0xFFFF);
+    int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
+    if (fseeko(m_BGZF.Stream, blockAddress, SEEK_SET) != 0) {
+        printf("ERROR: Unable to seek in BAM file\n");
+               exit(1);
+    }
 
-       // no valid alignment to load
-       return false;
+    m_BGZF.BlockLength  = 0;
+    m_BGZF.BlockAddress = blockAddress;
+    m_BGZF.BlockOffset  = blockOffset;
+       return true;
 }
 
-void BamReader::SetCalculateAlignedBases(bool flag) {
-       m_isCalculateAlignedBases = flag;
+// get file position in BAM file
+int64_t BamReader::BgzfTell(void) {
+       return ( (m_BGZF.BlockAddress << 16) | (m_BGZF.BlockOffset & 0xFFFF) );
 }
 
 int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
@@ -225,108 +222,152 @@ int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BI
        return i;
 }
 
-uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
+unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
 
        // initialize alignment end to starting position
-       uint32_t alignEnd = position;
+       unsigned int alignEnd = position;
 
        // iterate over cigar operations
        vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
        vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
        for ( ; cigarIter != cigarEnd; ++cigarIter) {
-               if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {
+               char cigarType = (*cigarIter).Type;
+               if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
                        alignEnd += (*cigarIter).Length;
                }
        }
        return alignEnd;
 }
 
-int64_t BamReader::GetOffset(int refID, unsigned int left) {
+void BamReader::ClearIndex(void) {
+
+       if ( m_index ) {
+               // iterate over references
+               vector<RefIndex*>::iterator refIter = m_index->begin();
+               vector<RefIndex*>::iterator refEnd  = m_index->end();
+               for ( ; refIter != refEnd; ++refIter) {
+                       RefIndex* aRef = (*refIter);
+                       if ( aRef ) {
+                               // clear out BAM bins
+                               if ( aRef->first ) {
+                                       BinVector::iterator binIter = (aRef->first)->begin();
+                                       BinVector::iterator binEnd  = (aRef->first)->end();
+                                       for ( ; binIter != binEnd; ++binIter ) {
+                                               ChunkVector* chunks = (*binIter).second;
+                                               if ( chunks ) { delete chunks; }
+                                       }
+                                       delete aRef->first;
+                               }
+                               // clear BAM linear offsets
+                               if ( aRef->second ) { delete aRef->second; }
+                               delete aRef;
+                       }
+               }
+               delete m_index;
+       }
+}
 
-       //  make space for bins
-       uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);                 
-       
-       // returns number of bins overlapping (left, right)
-       // stores indices of those bins in 'bins'
-       int numBins = BinsFromRegion(refID, left, bins);                                
+// closes the BAM file
+void BamReader::Close(void) {
+       if(m_BGZF.IsOpen) { BgzfClose(); }
+       if(m_index)       { delete m_index; m_index = NULL; }
+       m_isRegionSpecified = false;
+}
 
-       // access index data for refID
-       RefIndex* refIndex = m_index->at(refID);
+const string BamReader::GetHeaderText(void) const {
+       return m_headerText;
+}
 
-       // get list of BAM bins for this reference sequence
-       BinVector* refBins = refIndex->first;
+const int BamReader::GetReferenceCount(void) const {
+       return m_references.size();
+}
 
-       sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
+const RefVector BamReader::GetReferenceData(void) const {
+       return m_references;
+}
 
-       // declare ChunkVector
-       ChunkVector regionChunks;
+const int BamReader::GetReferenceID(const string& refName) const {
 
-       // declaure LinearOffsetVector
-       LinearOffsetVector* linearOffsets = refIndex->second;
+       // retrieve names from reference data
+       vector<string> refNames;
+       RefVector::const_iterator refIter = m_references.begin();
+    RefVector::const_iterator refEnd  = m_references.end();
+    for ( ; refIter != refEnd; ++refIter) {
+               refNames.push_back( (*refIter).RefName );
+    }
 
-       // some sort of linear offset vs bin index voodoo... not completely sure what's going here
-       uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
+       // return 'index-of' refName ( if not found, returns refNames.size() )
+       return Index( refNames.begin(), refNames.end(), refName );
+}
 
-       BinVector::iterator binBegin = refBins->begin();
-       BinVector::iterator binEnd   = refBins->end();
+// get next alignment (from specified region, if given)
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
 
-       // iterate over bins overlapping region, count chunks
-       for (int i = 0; i < numBins; ++i) {
+       // if valid alignment available
+       if ( LoadNextAlignment(bAlignment) ) {
                
-               // look for bin with ID=bin[i]
-               BinVector::iterator binIter = binBegin;
-
-               for ( ; binIter != binEnd; ++binIter ) {
+               // if region not specified, return success
+               if ( !m_isRegionSpecified ) { return true; }
                
-                       // if found, increment n_off by number of chunks for each bin
-                       if ( (*binIter).first == (uint32_t)bins[i] ) { 
-                               
-                               // iterate over chunks in that bin
-                               ChunkVector* binChunks = (*binIter).second;
-                               
-                               ChunkVector::iterator chunkIter = binChunks->begin();
-                               ChunkVector::iterator chunkEnd  = binChunks->end();
-                               for ( ; chunkIter != chunkEnd; ++chunkIter) {
-                               
-                                       // if right bound of pair is greater than min_off (linear offset value), store pair
-                                       if ( (*chunkIter).second > minOffset) { 
-                                               regionChunks.push_back( (*chunkIter) ); 
-                                       }
-                               }
-                       }
+               // load next alignment until region overlap is found
+               while ( !IsOverlap(bAlignment) ) {
+                       // if no valid alignment available (likely EOF) return failure
+                       if ( !LoadNextAlignment(bAlignment) ) { return false; }
                }
-       }
+               
+               // return success (alignment found that overlaps region)
+               return true;
+       } 
+       
+       // no valid alignment
+       else { return false; }
+}
 
-       // clean up temp array
-       free(bins);
+int64_t BamReader::GetOffset(int refID, unsigned int left) {
 
-       // there should be at least 1
-       if(regionChunks.size() == 0) { return -1; }
+       // calculate which bins overlap this region
+       uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
+       int numBins = BinsFromRegion(refID, left, bins);
 
-       // sort chunks by start position
-       sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );
+       // get bins for this reference
+       RefIndex* refIndex = m_index->at(refID);
+       BinVector* refBins = refIndex->first;
 
-       // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
-       int numOffsets = regionChunks.size();   
-       for (int i = 1; i < numOffsets; ++i) {
-               if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {
-                       regionChunks.at(i-1).second = regionChunks.at(i).first;
-               }
-       }
+       // get minimum offset to consider
+       LinearOffsetVector* linearOffsets = refIndex->second;
+       uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
+
+       // store offsets to beginning of alignment 'chunks' 
+       std::vector<int64_t> chunkStarts;
        
-       // merge adjacent chunks
-       int l = 0;
-       for (int i = 1; i < numOffsets; ++i) {
-               // if adjacent, expand boundaries of (merged) chunk
-               if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {
-                       regionChunks.at(l).second = regionChunks.at(i).second;
+       // reference bin iterators
+       BinVector::const_iterator binIter;
+       BinVector::const_iterator binBegin = refBins->begin();
+       BinVector::const_iterator binEnd   = refBins->end();
+       
+       // store all alignment 'chunk' starts for bins in this region
+       for (int i = 0; i < numBins; ++i ) {
+               binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare<uint32_t, ChunkVector*>() );
+               if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) {
+                       ChunkVector* binChunks = (*binIter).second;
+                       ChunkVector::const_iterator chunkIter = binChunks->begin();
+                       ChunkVector::const_iterator chunkEnd  = binChunks->end();
+                       for ( ; chunkIter != chunkEnd; ++chunkIter) {
+                               if ( (*chunkIter).second > minOffset ) {
+                                       chunkStarts.push_back( (*chunkIter).first );
+                               }       
+                       }
                }
-               // else, move on to next (merged) chunk index
-               else { regionChunks.at(++l) = regionChunks.at(i); }
        }
-
-       // return beginning file offset of first chunk for region
-       return (int64_t)regionChunks.at(0).first;
+       
+       // clean up memory
+       free(bins);
+       
+       // if no alignments found
+       if ( chunkStarts.empty() ) { return -1; }
+       
+       // else return smallest offset for alignment starts 
+       else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
 }
 
 bool BamReader::IsOverlap(BamAlignment& bAlignment) {
@@ -337,92 +378,65 @@ bool BamReader::IsOverlap(BamAlignment& bAlignment) {
        // read starts after left boundary
        if ( bAlignment.Position >= m_currentLeft) { return true; }
 
-       // get alignment end
-       uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);
-
        // return whether alignment end overlaps left boundary
-       return ( alignEnd >= m_currentLeft );
+       return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
 }
 
-bool BamReader::LoadHeader(void) {
+bool BamReader::Jump(int refID, unsigned int left) {
+
+       // if index available, and region is valid
+       if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { 
+               m_currentRefID = refID;
+               m_currentLeft  = left;
+               m_isRegionSpecified = true;
+               
+               int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
+               if ( offset == -1 ) { return false; }
+               else { return BgzfSeek(offset); }
+       }
+       return false;
+}
 
+void BamReader::LoadHeaderData(void) {
+       
        // check to see if proper BAM header
-       char buf[4];
-       if (bam_read(m_file, buf, 4) != 4) { return false; }
-       if (strncmp(buf, "BAM\001", 4)) {
-               fprintf(stderr, "wrong header type!\n");
-               return false;
+       char buffer[4];
+       if (BgzfRead(buffer, 4) != 4) { 
+               printf("Could not read header type\n");
+               exit(1); 
+       }
+       if (strncmp(buffer, "BAM\001", 4)) {
+               printf("wrong header type!\n");
+               exit(1);
        }
        
        // get BAM header text length
-       int32_t headerTextLength;
-       bam_read(m_file, &headerTextLength, 4);
+       BgzfRead(buffer, 4);
+       const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer);
 
        // get BAM header text
        char* headerText = (char*)calloc(headerTextLength + 1, 1);
-       bam_read(m_file, headerText, headerTextLength);
+       BgzfRead(headerText, headerTextLength);
        m_headerText = (string)((const char*)headerText);
        
        // clean up calloc-ed temp variable
        free(headerText);
-
-       // get number of reference sequences
-       int32_t numberRefSeqs;
-       bam_read(m_file, &numberRefSeqs, 4);
-       if (numberRefSeqs == 0) { return false; }
-
-       m_references.reserve((int)numberRefSeqs);
-       
-       // reference variables
-       int32_t  refNameLength;
-       char*    refName;
-       uint32_t refLength;
-
-       // iterate over all references in header
-       for (int i = 0; i != numberRefSeqs; ++i) {
-
-               // get length of reference name
-               bam_read(m_file, &refNameLength, 4);
-               refName = (char*)calloc(refNameLength, 1);
-
-               // get reference name and reference sequence length
-               bam_read(m_file, refName, refNameLength);
-               bam_read(m_file, &refLength, 4);
-
-               // store data for reference
-               RefData aReference;
-               aReference.RefName   = (string)((const char*)refName);
-               aReference.RefLength = refLength;
-               m_references.push_back(aReference);
-
-               // clean up calloc-ed temp variable
-               free(refName);
-       }
-       
-       return true;
 }
 
-bool BamReader::LoadIndex(void) {
-
-       // check to see if index file exists
-       FILE* indexFile;
-       if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {
-               fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");
-               return false;
-       }
+void BamReader::LoadIndexData(FILE* indexStream) {
 
        // see if index is valid BAM index
        char magic[4];
-       fread(magic, 1, 4, indexFile);
+       fread(magic, 1, 4, indexStream);
        if (strncmp(magic, "BAI\1", 4)) {
-               fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");
-               fclose(indexFile);
-               return false;
+               printf("Problem with index file - invalid format.\n");
+               fclose(indexStream);
+               exit(1);
        }
 
        // get number of reference sequences
        uint32_t numRefSeqs;
-       fread(&numRefSeqs, 4, 1, indexFile);
+       fread(&numRefSeqs, 4, 1, indexStream);
        
        // intialize BamIndex data structure
        m_index = new BamIndex;
@@ -433,7 +447,7 @@ bool BamReader::LoadIndex(void) {
                
                // get number of bins for this reference sequence
                int32_t numBins;
-               fread(&numBins, 4, 1, indexFile);
+               fread(&numBins, 4, 1, indexStream);
                
                if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
 
@@ -446,11 +460,11 @@ bool BamReader::LoadIndex(void) {
                        
                        // get binID 
                        uint32_t binID;
-                       fread(&binID, 4, 1, indexFile);
+                       fread(&binID, 4, 1, indexStream);
                        
                        // get number of regionChunks in this bin
                        uint32_t numChunks;
-                       fread(&numChunks, 4, 1, indexFile);
+                       fread(&numChunks, 4, 1, indexStream);
                        
                        // intialize ChunkVector
                        ChunkVector* regionChunks = new ChunkVector;
@@ -462,22 +476,28 @@ bool BamReader::LoadIndex(void) {
                                // get chunk boundaries (left, right) 
                                uint64_t left;
                                uint64_t right;
-                               fread(&left, 8, 1, indexFile);
-                               fread(&right, 8, 1, indexFile);
+                               fread(&left, 8, 1, indexStream);
+                               fread(&right, 8, 1, indexStream);
                                
                                // save ChunkPair
                                regionChunks->push_back( ChunkPair(left, right) );
                        }
                        
+                       // sort chunks for this bin
+                       sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare<uint64_t, uint64_t>() );
+
                        // save binID, chunkVector for this bin
                        bins->push_back( BamBin(binID, regionChunks) );
                }
                
+               // sort bins by binID
+               sort(bins->begin(), bins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
+
                // load linear index for this reference sequence
                
                // get number of linear offsets
                int32_t numLinearOffsets;
-               fread(&numLinearOffsets, 4, 1, indexFile);
+               fread(&numLinearOffsets, 4, 1, indexStream);
                
                // intialize LinearOffsetVector
                LinearOffsetVector* linearOffsets = new LinearOffsetVector;
@@ -487,179 +507,233 @@ bool BamReader::LoadIndex(void) {
                for (int j = 0; j < numLinearOffsets; ++j) {
                        // get a linear offset
                        uint64_t linearOffset;
-                       fread(&linearOffset, 8, 1, indexFile);
+                       fread(&linearOffset, 8, 1, indexStream);
                        // store linear offset
                        linearOffsets->push_back(linearOffset);
                }
                
+               // sort linear offsets
+               sort( linearOffsets->begin(), linearOffsets->end() );
+
                // store index data for that reference sequence
                m_index->push_back( new RefIndex(bins, linearOffsets) );
        }
        
        // close index file (.bai) and return
-       fclose(indexFile);
-       return true;
+       fclose(indexStream);
 }
 
 bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
 
-       // check valid alignment block header data
-       int32_t block_len;
-       int32_t ret;
-       uint32_t x[8];
-
-       int32_t bytesRead = 0;
-
        // read in the 'block length' value, make sure it's not zero
-       if ( (ret = bam_read(m_file, &block_len, 4)) == 0 )        { return false; }
-       bytesRead += 4;
+       char buffer[4];
+       BgzfRead(buffer, 4);
+       const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer);
+       if ( blockLength == 0 ) { return false; }
+
+       // keep track of bytes read as method progresses
+       int bytesRead = 4;
 
-       // read in core alignment data, make the right size of data was read 
-       if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
+       // read in core alignment data, make sure the right size of data was read 
+       char x[BAM_CORE_SIZE];
+       if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
        bytesRead += BAM_CORE_SIZE;
 
-       // set BamAlignment 'core' data
-       bAlignment.RefID         = x[0]; 
-       bAlignment.Position      = x[1];
-       bAlignment.Bin           = x[2]>>16; 
-       bAlignment.MapQuality    = x[2]>>8&0xff; 
-       bAlignment.AlignmentFlag = x[3]>>16; 
-       bAlignment.MateRefID     = x[5]; 
-       bAlignment.MatePosition  = x[6]; 
-       bAlignment.InsertSize    = x[7];
-
-       // fetch & store often-used lengths for character data parsing
-       unsigned int queryNameLength     = x[2]&0xff;
-       unsigned int numCigarOperations  = x[3]&0xffff;
-       unsigned int querySequenceLength = x[4];
+       // set BamAlignment 'core' data and character data lengths
+       unsigned int tempValue;
+       unsigned int queryNameLength;
+       unsigned int numCigarOperations;
+       unsigned int querySequenceLength;
+
+       bAlignment.RefID    = BgzfUnpackUnsignedInt(&x[0]);
+       bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]);
+
+       tempValue             = BgzfUnpackUnsignedInt(&x[8]);           
+       bAlignment.Bin        = tempValue >> 16;
+       bAlignment.MapQuality = tempValue >> 8 & 0xff;
+       queryNameLength       = tempValue & 0xff;
+
+       tempValue                = BgzfUnpackUnsignedInt(&x[12]);       
+       bAlignment.AlignmentFlag = tempValue >> 16;
+       numCigarOperations       = tempValue & 0xffff;
+
+       querySequenceLength     = BgzfUnpackUnsignedInt(&x[16]);
+       bAlignment.MateRefID    = BgzfUnpackUnsignedInt(&x[20]);
+       bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]);
+       bAlignment.InsertSize   = BgzfUnpackUnsignedInt(&x[28]);
+
+       // calculate lengths/offsets
+       const unsigned int dataLength      = blockLength - BAM_CORE_SIZE;
+       const unsigned int cigarDataOffset = queryNameLength;
+       const unsigned int seqDataOffset   = cigarDataOffset + (numCigarOperations * 4);
+       const unsigned int qualDataOffset  = seqDataOffset + (querySequenceLength+1)/2;
+       const unsigned int tagDataOffset   = qualDataOffset + querySequenceLength;
+       const unsigned int tagDataLen      = dataLength - tagDataOffset;
        
-       // get length of character data
-       int dataLength = block_len - BAM_CORE_SIZE;
-
        // set up destination buffers for character data
-       uint8_t*  allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);
-       uint32_t* cigarData   = (uint32_t*)(allCharData+queryNameLength);
-
-       const unsigned int tagDataOffset = (numCigarOperations * 4) + queryNameLength + (querySequenceLength + 1) / 2 + querySequenceLength;
-       const unsigned int tagDataLen = dataLength - tagDataOffset;
-       char* tagData = ((char*)allCharData) + tagDataOffset;
+       char* allCharData   = (char*)calloc(sizeof(char), dataLength);
+       uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
+       char* seqData       = ((char*)allCharData) + seqDataOffset;
+       char* qualData      = ((char*)allCharData) + qualDataOffset;
+       char* tagData       = ((char*)allCharData) + tagDataOffset;
        
        // get character data - make sure proper data size was read
-       if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }
+       if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; }
        else {
-
+               
                bytesRead += dataLength;
-
-               // clear out bases, qualities, aligned bases, CIGAR, and tag data
+               
+               // clear out any previous string data
+               bAlignment.Name.clear();
                bAlignment.QueryBases.clear();
                bAlignment.Qualities.clear();
                bAlignment.AlignedBases.clear();
                bAlignment.CigarData.clear();
                bAlignment.TagData.clear();
-
+               
                // save name
                bAlignment.Name = (string)((const char*)(allCharData));
                
-               // save bases
-               char singleBase[2];
-               uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);
-               for (unsigned int i = 0; i < querySequenceLength; ++i) { 
-                       // numeric to char conversion
-                       sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );
-                       // append character data to Bases
-                       bAlignment.QueryBases.append( (const char*)singleBase );
+               // save query sequence
+               for (unsigned int i = 0; i < querySequenceLength; ++i) {        
+                       char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
+                       bAlignment.QueryBases.append( 1, singleBase );
                }
                
                // save sequence length
                bAlignment.Length = bAlignment.QueryBases.length();
                
                // save qualities
-               char singleQuality[4];
-               uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);
                for (unsigned int i = 0; i < querySequenceLength; ++i) { 
-                       // numeric to char conversion
-                       sprintf( singleQuality, "%c", ( t[i]+33 ) ); 
-                       // append character data to Qualities
-                       bAlignment.Qualities.append( (const char*)singleQuality );
+                       char singleQuality = (char)(qualData[i]+33);                    // conversion from QV to FASTQ character
+                       bAlignment.Qualities.append( 1, singleQuality );
                }
                
                // save CIGAR-related data;
                int k = 0;
                for (unsigned int i = 0; i < numCigarOperations; ++i) {
-
+                       
                        // build CigarOp struct
                        CigarOp op;
                        op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
                        op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
-
+                       
                        // save CigarOp
                        bAlignment.CigarData.push_back(op);
-
-                       // can skip this step if user wants to ignore
-                       if (m_isCalculateAlignedBases) {
-
-                               // build AlignedBases string
-                               switch (op.Type) {
-                                       
-                                       case ('M') : 
-                                       case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) );      // for 'M', 'I' - write bases
-                                       case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases
-                                                                break;
-                                       
-                                       case ('D') : 
-                                       case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'D', 'P' - write padding;
-                                                                break;
-                                       
-                                       case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence
-                                                                k += op.Length;
-                                                                break;
-
-                                       case ('H') : break;                                                                                     // for 'H' - do nothing, move to next op
-                                       
-                                       default    : assert(false);     // shouldn't get here
-                                                                break;
-                               }
+                       
+                       // build AlignedBases string
+                       switch (op.Type) {
+                               
+                               case ('M') : 
+                               case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) );      // for 'M', 'I' - write bases
+                               case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases
+                                                        break;
+                                                        
+                               case ('D') : 
+                               case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'D', 'P' - write padding;
+                                                        break;
+                                                        
+                               case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence
+                                                        k += op.Length;
+                                                        break;
+                                                        
+                               case ('H') : break;                                                                                     // for 'H' - do nothing, move to next op
+                                                        
+                               default    : printf("ERROR: Invalid Cigar op type\n");                  // shouldn't get here
+                                                        exit(1);
                        }
                }
-
+               
                // read in the tag data
                bAlignment.TagData.resize(tagDataLen);
                memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
        }
+
        free(allCharData);
+       return true;
+}
 
-/*
-       // (optional) read tag parsing data
-       string tag;
-       char data;
-       int i = 0;
+void BamReader::LoadReferenceData(void) {
 
-       // still data to read (auxiliary tags)
-       while ( bytesRead < block_len ) {
+       // get number of reference sequences
+       char buffer[4];
+       BgzfRead(buffer, 4);
+       const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer);
+       if (numberRefSeqs == 0) { return; }
+       m_references.reserve((int)numberRefSeqs);
+       
+       // iterate over all references in header
+       for (unsigned int i = 0; i != numberRefSeqs; ++i) {
 
-               if ( bam_read(m_file, &data, 1) == 1 ) { 
-                       
-                       ++bytesRead;
+               // get length of reference name
+               BgzfRead(buffer, 4);
+               const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer);
+               char* refName = (char*)calloc(refNameLength, 1);
+               
+               // get reference name and reference sequence length
+               BgzfRead(refName, refNameLength);
+               BgzfRead(buffer, 4);
+               const unsigned int refLength = BgzfUnpackUnsignedInt(buffer);
+               
+               // store data for reference
+               RefData aReference;
+               aReference.RefName   = (string)((const char*)refName);
+               aReference.RefLength = refLength;
+               m_references.push_back(aReference);
+               
+               // clean up calloc-ed temp variable
+               free(refName);
+       }
+}
 
-                       if (bytesRead == block_len && data != '\0') {
-                               fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");
-                               exit(1);
-                       }
+// opens BAM file (and index)
+void BamReader::Open(const string& filename, const string& indexFilename) {
 
-                       tag.append(1, data);
-                       if ( data == '\0' ) {
-                               bAlignment.Tags.push_back(tag);
-                               tag = "";
-                               i = 0;
-                       } else {
-                               if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }
-                               ++i;
-                       }
-               } else {
-                       fprintf(stderr, "ERROR: Could not read tag info.\n");
+       // open the BGZF file for reading, retrieve header text & reference data
+       BgzfOpen(filename);
+       LoadHeaderData();       
+       LoadReferenceData();
+
+       // store file offset of first alignment
+       m_alignmentsBeginOffset = BgzfTell();
+
+       // open index file & load index data (if exists)
+       OpenIndex(indexFilename);
+}
+
+void BamReader::OpenIndex(const string& indexFilename) {
+
+       // if index file exists
+       if (!indexFilename.empty()) {
+
+               // open index
+               FILE* indexStream = fopen(indexFilename.c_str(), "rb");
+               
+               // abort on error
+               if(!indexStream) {
+                       cerr << "ERROR: Unable to open the BAM index file " << indexFilename << "for reading." << endl;
                        exit(1);
                }
+       
+               // build up index data structure
+               LoadIndexData(indexStream);
        }
-*/
-       return true;
 }
+
+bool BamReader::Rewind(void) {
+
+       // find first reference that has alignments in the BAM file
+       int refID = 0;
+       int refCount = m_references.size();
+       for ( ; refID < refCount; ++refID ) {
+               if ( m_references.at(refID).RefHasAlignments ) { break; } 
+       }
+
+       // store default bounds for first alignment
+       m_currentRefID = refID;
+       m_currentLeft = 0;
+       m_isRegionSpecified = false;
+
+       // return success/failure of seek
+       return BgzfSeek(m_alignmentsBeginOffset);
+}      
index e2b4598d75292aa95447f0e0599671ad2d4e7db5..17cb86ab5ad09421230cb6cf34741342082562cb 100644 (file)
-// BamReader.h\r
-\r
-/* The MIT License\r
-\r
-   Copyright (c) 2008 Genome Research Ltd (GRL).\r
-\r
-   Permission is hereby granted, free of charge, to any person obtaining\r
-   a copy of this software and associated documentation files (the\r
-   "Software"), to deal in the Software without restriction, including\r
-   without limitation the rights to use, copy, modify, merge, publish,\r
-   distribute, sublicense, and/or sell copies of the Software, and to\r
-   permit persons to whom the Software is furnished to do so, subject to\r
-   the following conditions:\r
-\r
-   The above copyright notice and this permission notice shall be\r
-   included in all copies or substantial portions of the Software.\r
-\r
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\r
-   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\r
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\r
-   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\r
-   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\r
-   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\r
-   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
-   SOFTWARE.\r
-*/\r
-\r
-/*\r
-       Implementation of BAM-parsing was translated to C++ directly from Heng Li's SAMtools package \r
-        (thus the carryover of above MIT license)\r
-       Contact: Derek Barnett <barnetde@bc.edu>\r
-*/\r
-\r
-// Derek Barnett\r
-// Marth Lab, Boston College\r
-// Last modified: 23 April 2009\r
-\r
-#ifndef BAMREADER_H\r
-#define BAMREADER_H\r
-\r
-// custom includes\r
-#include "BamAlignment.h"\r
-#include "STLUtilities.h"\r
-\r
-// C library includes\r
-#include <assert.h>\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#include <string.h>\r
-\r
-#ifdef WIN32\r
-typedef char                 int8_t;\r
-typedef unsigned char       uint8_t;\r
-typedef short               int16_t;\r
-typedef unsigned short     uint16_t;\r
-typedef int                 int32_t;\r
-typedef unsigned int       uint32_t;\r
-typedef long long           int64_t;\r
-typedef unsigned long long uint64_t;\r
-#else\r
-#include <stdint.h>\r
-#endif\r
-\r
-// BGZF library includes/defines\r
-#include "bgzf.h"\r
-typedef BGZF* BamFile;\r
-#define bam_open(f_name, mode)      bgzf_open(f_name, mode)\r
-#define bam_close(f_ptr)            bgzf_close(f_ptr)\r
-#define bam_read(f_ptr, buf, size)  bgzf_read(f_ptr, buf, size)\r
-#define bam_write(f_ptr, buf, size) bgzf_write(f_ptr, buf, size)\r
-#define bam_tell(f_ptr)             bgzf_tell(f_ptr)\r
-#define bam_seek(f_ptr, pos, dir)   bgzf_seek(f_ptr, pos, dir)\r
-\r
-// size of alignment data block in BAM file (bytes)\r
-#define BAM_CORE_SIZE 32\r
-\r
-// BAM indexing constants\r
-#define MAX_BIN           37450 // =(8^6-1)/7+1\r
-#define BAM_MIN_CHUNK_GAP 32768 \r
-#define BAM_LIDX_SHIFT    14\r
-\r
-// CIGAR-retrieval mask/shift constants\r
-#define BAM_CIGAR_SHIFT 4\r
-#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)\r
-\r
-// CIGAR-operation types\r
-#define BAM_CMATCH      0\r
-#define BAM_CINS        1\r
-#define BAM_CDEL        2\r
-#define BAM_CREF_SKIP   3\r
-#define BAM_CSOFT_CLIP  4\r
-#define BAM_CHARD_CLIP  5\r
-#define BAM_CPAD        6\r
-\r
-// --------------------------- //\r
-// Bam header info\r
-// --------------------------- //\r
-\r
-// --------------------------- //\r
-// BamIndex-related typedefs\r
-// --------------------------- //\r
-\r
-// offset for linear indexing\r
-typedef vector<uint64_t> LinearOffsetVector;\r
-\r
-// chunk boundaries\r
-typedef pair<uint64_t, uint64_t> ChunkPair;\r
-// list of chunks in a BAM bin\r
-typedef vector<ChunkPair> ChunkVector;\r
-\r
-// BAM bins for a reference sequence\r
-// replaces khash - uint32_t is key, ChunkVector is value\r
-typedef pair<uint32_t, ChunkVector*> BamBin;\r
-typedef vector<BamBin> BinVector;\r
-\r
-// each reference sequence has a BinVector and LinearOffsetVector\r
-typedef pair<BinVector*, LinearOffsetVector*> RefIndex;\r
-\r
-// full BamIndex defined as: \r
-typedef vector<RefIndex*> BamIndex;\r
-\r
-// ---------------------------------------------------------------------------//\r
-\r
-class BamReader {\r
-       \r
-       public:\r
-               // constructors\r
-               BamReader(const char* fileName = NULL, const char* indexFilename = NULL);\r
-\r
-       public:\r
-               // destructor\r
-               ~BamReader(void);\r
-       \r
-       // BAM interface methods\r
-       public:\r
-\r
-               // ----------------------- //\r
-               // File manipulation\r
-               // ----------------------- //\r
-               \r
-               // open BAM file (automatically opens index if provided)\r
-               bool Open(void);\r
-               \r
-               // open BAM index (allows index to be opened separately - i.e. sometime after the BAM file is opened)\r
-               bool OpenIndex(void);\r
-               \r
-               // close BAM file\r
-               bool Close(void);\r
-               \r
-               // get BAM filename\r
-               const char* Filename(void) const;\r
-       \r
-               // set BAM filename\r
-               void SetFilename(const char*);\r
-\r
-               // get BAM Index filename\r
-               const char* IndexFilename(void) const;\r
-               \r
-               // set BAM Index filename\r
-               void SetIndexFilename(const char*);\r
-\r
-               // ----------------------- //\r
-               // Access BAM header\r
-               // ----------------------- //\r
-               \r
-               // return full header text\r
-               const string GetHeaderText(void) const;\r
-               \r
-               // --------------------------------- //\r
-               // Access reference sequence info\r
-               // --------------------------------- //\r
-               \r
-               // return number of reference sequences in BAM file\r
-               const int GetReferenceCount(void) const;\r
-\r
-               // return vector of RefData entries\r
-               const RefVector GetReferenceData(void) const;\r
-\r
-               // get refID from reference name\r
-               const int GetRefID(string refName) const;               \r
-               \r
-               // ----------------------------------------- //\r
-               // File position moving\r
-               // ----------------------------------------- //\r
-\r
-               // jumps to 'left' position on refID\r
-               // actually jumps before position, so reads that overlap 'left' are included as well\r
-               // 'left' defaults to reference begin if not specified\r
-               bool Jump(int refID, unsigned int left = 0);\r
-\r
-               // Jump to beginning of BAM file, clears any region previously set by Jump()\r
-               bool Rewind(void);\r
-               \r
-               // ------------------------------ //\r
-               // Access alignments\r
-               // ------------------------------ //\r
-               \r
-               // get next alignment\r
-               bool GetNextAlignment(BamAlignment& read);\r
-\r
-               // allow user to specifiy whether 'AlignedBases' string is calculated when alignment is loaded\r
-               void SetCalculateAlignedBases(bool);\r
-\r
-       // internal utility methods\r
-       private:\r
-               int      BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);\r
-               uint32_t CalculateAlignmentEnd(const unsigned int&, const vector<CigarOp>&);\r
-               int64_t  GetOffset(int, unsigned int);\r
-               bool     IsOverlap(BamAlignment&);\r
-               bool     LoadHeader(void);\r
-               bool     LoadIndex(void);\r
-               bool     LoadNextAlignment(BamAlignment&);\r
-\r
-       private:        \r
-               // main BAM reader components\r
-               char*      m_filename;\r
-               char*      m_indexFilename;\r
-               BamFile    m_file;\r
-               BamIndex*  m_index;\r
-               RefVector  m_references;\r
-               string     m_headerText;\r
-\r
-               // state flags\r
-               bool m_isOpen;                           // BAM file is open for processing\r
-               bool m_isIndexLoaded;            // BAM Index data is loaded and available for processing\r
-               bool m_isRegionSpecified;        // a region has been specified - specifically, a user has called Jump()\r
-               bool m_isCalculateAlignedBases;  // build 'AlignedBases' string when getting an alignment, otherwise skip (default = true)\r
-\r
-               // region values\r
-               int m_currentRefID;\r
-               unsigned int m_currentLeft;\r
-\r
-               // file offset of 1st read in BAM file\r
-               int64_t m_alignmentsBeginOffset;\r
-\r
-       private:\r
-               // BAM character constants\r
-               static const char* DNA_LOOKUP;\r
-               static const char* CIGAR_LOOKUP;\r
-};\r
-\r
-#endif /* BAMREADER_H */\r
+// ***************************************************************************
+// BamReader (c) 2009 Derek Barnett
+// Marth Lab, Deptartment of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+using namespace std;
+
+#include "BamAlignment.h"
+
+// our zlib constants
+#define GZIP_ID1             31
+#define GZIP_ID2            139
+#define CM_DEFLATE            8
+#define FLG_FEXTRA            4
+#define OS_UNKNOWN          255
+#define BGZF_XLEN             6
+#define BGZF_ID1             66
+#define BGZF_ID2             67
+#define BGZF_LEN              2
+#define GZIP_WINDOW_BITS    -15
+#define Z_DEFAULT_MEM_LEVEL   8
+
+// our BZGF constants
+#define BLOCK_HEADER_LENGTH    18
+#define BLOCK_FOOTER_LENGTH     8
+#define MAX_BLOCK_SIZE      65536
+#define DEFAULT_BLOCK_SIZE  65536
+
+// our BAM constants
+#define BAM_CORE_SIZE  32
+#define BAM_CMATCH      0
+#define BAM_CINS        1
+#define BAM_CDEL        2
+#define BAM_CREF_SKIP   3
+#define BAM_CSOFT_CLIP  4
+#define BAM_CHARD_CLIP  5
+#define BAM_CPAD        6
+#define BAM_CIGAR_SHIFT 4
+#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
+
+// BAM indexing constants
+#define MAX_BIN           37450 // =(8^6-1)/7+1
+#define BAM_MIN_CHUNK_GAP 32768 
+#define BAM_LIDX_SHIFT    14
+
+// our variable sizes
+#define SIZEOF_INT 4
+
+// define our BZGF structure
+#ifndef BGZF_DATA
+#define BGZF_DATA
+struct BgzfData {
+       unsigned int UncompressedBlockSize;
+       unsigned int CompressedBlockSize;
+       unsigned int BlockLength;
+       unsigned int BlockOffset;
+       uint64_t BlockAddress;
+       bool IsOpen;
+       FILE* Stream;
+       char* UncompressedBlock;
+       char* CompressedBlock;
+
+       // constructor
+       BgzfData(void)
+               : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
+               , CompressedBlockSize(MAX_BLOCK_SIZE)
+               , BlockLength(0)
+               , BlockOffset(0)
+               , BlockAddress(0)
+               , IsOpen(false)
+               , Stream(NULL)
+               , UncompressedBlock(NULL)
+               , CompressedBlock(NULL)
+       {
+               try {
+                       CompressedBlock   = new char[CompressedBlockSize];
+                       UncompressedBlock = new char[UncompressedBlockSize];
+               } catch(bad_alloc&) {
+                       printf("ERROR: Unable to allocate memory for our BGZF object.\n");
+                       exit(1);
+               }
+       }
+
+       // destructor
+       ~BgzfData(void) {
+               if(CompressedBlock)   delete [] CompressedBlock;
+               if(UncompressedBlock) delete [] UncompressedBlock;
+       }
+};
+#endif // BGZF_DATA
+
+
+// --------------------------- //
+// BamIndex-related typedefs
+// --------------------------- //
+
+// offset for linear indexing
+typedef vector<uint64_t> LinearOffsetVector;
+
+// alignment 'chunk' boundaries
+typedef pair<uint64_t, uint64_t> ChunkPair;
+typedef vector<ChunkPair> ChunkVector;
+
+// BAM bins.. each contains (binID, alignment 'chunks')
+typedef pair<uint32_t, ChunkVector*> BamBin;
+typedef vector<BamBin> BinVector;
+
+// each reference sequence has a BinVector and LinearOffsetVector
+typedef pair<BinVector*, LinearOffsetVector*> RefIndex;
+
+// full BamIndex defined as: 
+typedef vector<RefIndex*> BamIndex;
+
+class BamReader {
+
+       // constructor/destructor
+       public:
+               BamReader(void);
+               ~BamReader(void);
+
+       // public interface
+       public:
+               // closes the BAM file
+               void Close(void);
+               // retrieves header text
+               const string GetHeaderText(void) const;
+               // saves the alignment to the alignment archive
+               bool GetNextAlignment(BamAlignment& bAlignment);
+               // return number of reference sequences in BAM file
+               const int GetReferenceCount(void) const;
+               // return vector of RefData entries
+               const RefVector GetReferenceData(void) const;
+               // get refID from reference name
+               const int GetReferenceID(const string& refName) const;
+               // jumps to 'left' position on refID
+               bool Jump(int refID, unsigned int left = 0);
+               // opens the BAM file
+               void Open(const string& filename, const string& indexFilename = "");
+               // move file pointer back to first alignment
+               bool Rewind(void);
+
+       // internal methods
+       private:
+               // checks BGZF block header
+               bool BgzfCheckBlockHeader(char* header);
+               // closes the BAM file
+               void BgzfClose(void);
+               // de-compresses the current block
+               int BgzfInflateBlock(int blockLength);
+               // opens the BAM file for reading
+               void BgzfOpen(const string& filename);
+               // reads BGZF data into a byte buffer
+               unsigned int BgzfRead(char* data, const unsigned int dataLen);
+               // reads BGZF block
+               int BgzfReadBlock(void);
+               // seek to position in BAM file
+               bool BgzfSeek(int64_t position);
+               // get file position in BAM file
+               int64_t BgzfTell(void);
+               // unpacks a buffer into an unsigned int
+               static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
+               // unpacks a buffer into an unsigned short
+               static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
+               //      calculate bins that overlap region ( left to reference end for now )
+               int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
+               // calculates alignment end position based on starting position and provided CIGAR operations
+               unsigned int CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData);
+               // clear out (delete pointers in) index data structure
+               void ClearIndex(void);
+               // calculate file offset for first alignment chunk overlapping 'left'
+               int64_t GetOffset(int refID, unsigned int left);
+               // checks to see if alignment overlaps current region
+               bool IsOverlap(BamAlignment& bAlignment);
+               // retrieves header text from BAM file
+               void LoadHeaderData(void);
+               // builds BamIndex data structure from BAM index file
+               void LoadIndexData(FILE* indexStream);
+               // retrieves BAM alignment under file pointer
+               bool LoadNextAlignment(BamAlignment& bAlignment);
+               // builds reference data structure from BAM file
+               void LoadReferenceData(void);
+               // open BAM index file (if successful, loads index)
+               void OpenIndex(const string& indexFilename);
+       
+       // aligment file & index file data
+       private:
+               BgzfData  m_BGZF;
+               string    m_headerText;
+               BamIndex* m_index;
+               RefVector m_references;
+               bool      m_isIndexLoaded;
+               int64_t   m_alignmentsBeginOffset;
+
+       // user-specified region values
+       private:
+               bool         m_isRegionSpecified;
+               int          m_currentRefID;
+               unsigned int m_currentLeft;
+
+       // BAM character constants
+       private:
+               static const char* DNA_LOOKUP;
+               static const char* CIGAR_LOOKUP;
+};
+
+// unpacks a buffer into an unsigned int
+inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
+       union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
+       un.valueBuffer[0] = buffer[0];
+       un.valueBuffer[1] = buffer[1];
+       un.valueBuffer[2] = buffer[2];
+       un.valueBuffer[3] = buffer[3];
+       return un.value;
+}
+
+// unpacks a buffer into an unsigned short
+inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
+       union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
+       un.valueBuffer[0] = buffer[0];
+       un.valueBuffer[1] = buffer[1];
+       return un.value;
+}
+
+// allows sorting/searching of a vector of pairs (instead of using maps)
+template <typename Key, typename Value>
+class LookupKeyCompare {
+
+       typedef pair< Key, Value > LookupData;
+       typedef typename LookupData::first_type Key_t;
+       
+       public:
+               bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
+               bool operator() (const LookupData& lhs, const Key_t& k) const        { return keyLess(lhs.first, k); }
+               bool operator() (const Key_t& k, const LookupData& rhs) const        { return keyLess(k, rhs.first); }
+       private:
+               bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
+};
+
+// return index of item if found, else return container.size()
+template < typename InputIterator, typename EqualityComparable >
+typename iterator_traits<InputIterator>::difference_type
+Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
+       return distance(begin, find(begin, end, item));
+}
+
index 5426763a76c20b9fb03ba95a36f135f35284a62a..d568fed2e86f3458647254f12907883283e596ad 100644 (file)
@@ -2,7 +2,7 @@
 \r
 // Derek Barnett\r
 // Marth Lab, Boston College\r
-// Last modified: 6 April 2009\r
+// Last modified: 12 May 2009\r
 \r
 #include "BamReader.h"\r
 \r
@@ -32,221 +32,73 @@ int main(int argc, char* argv[]) {
        const char* bamFilename = argv[1];\r
        const char* bamIndexFilename = argv[2];\r
 \r
-       string refName;\r
-       int refID;\r
-\r
-       int alignmentCount;\r
-\r
-       vector<string> refNames;\r
-\r
+       int alignmentsRead = 0;\r
        BamAlignment bAlignment;\r
-       BamAlignmentVector alignments;\r
-\r
-       RefVector references;\r
-\r
-       // ------------------------------------------------------------------------------------------------------ //\r
-       // Declare BamReader & open BAM file - automatically loads up header and index file (.bai) information\r
-       // ------------------------------------------------------------------------------------------------------ //\r
        \r
-       BamReader bReader(bamFilename, bamIndexFilename);\r
+       BamReader bReader;\r
+       cerr << endl << "Opening BAM file (and index)" << endl << endl;\r
+       bReader.Open(bamFilename, bamIndexFilename);\r
 \r
-       cerr << endl;\r
-       cerr << "Opening BamReader for BAM file: " << bamFilename << " ..... ";\r
-       \r
-       if ( bReader.Open() ) { \r
-               cerr << "ok" << endl; \r
-       }\r
-       else {\r
-               cerr << "error" << endl;\r
-               exit(1);\r
+       string header = bReader.GetHeaderText();\r
+       cerr << "Printing header text..." << endl;\r
+       if ( header.empty() ) {\r
+               cerr << "No header provided." << endl << endl;\r
+       } else {\r
+               cerr << "----------------------" << endl;\r
+               cerr << "Header Text: " << endl;\r
+               cerr << "----------------------" << endl;\r
+               cerr << header << endl << endl;\r
        }\r
-       \r
-       // ------------------------------------------------------------ //\r
-       // Find out how many reference sequences are in BAM file\r
-       // ------------------------------------------------------------ //\r
-       \r
-       references = bReader.GetReferenceData();\r
 \r
-       cerr << endl;\r
-       cerr << "Total number of ref seqs: " << references.size() << endl;\r
-       \r
-       // ---------------------------------------------------------------------------- //\r
-       // Get the names/lengths of all the reference sequences that have alignments\r
-       // ---------------------------------------------------------------------------- //\r
-       \r
-       cerr << endl;\r
-       cerr << "All ref sequences with alignments:" << endl;\r
+       RefVector references = bReader.GetReferenceData();\r
+       cerr << "Printing references..." << endl;\r
+       RefVector::const_iterator refIter = references.begin();\r
+       RefVector::const_iterator refEnd  = references.end();\r
+       for ( ; refIter != refEnd; ++refIter) {\r
+               cerr << "Reference entry: " << endl;\r
+               cerr << (*refIter).RefName << endl;\r
+               cerr << (*refIter).RefLength << endl;\r
+               cerr << "Has alignments? : " << ( ((*refIter).RefHasAlignments) ? "yes" : "no" ) << endl;\r
+       }\r
        cerr << endl;\r
 \r
+       alignmentsRead = 0;\r
+       while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) {\r
+               cerr << "Alignment " << alignmentsRead << endl;\r
+               cerr << bAlignment.Name << endl;\r
+               cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl;\r
+               ++alignmentsRead;\r
+       }\r
 \r
-       if ( !references.empty() ) {\r
-               \r
-               RefVector::iterator refIter = references.begin();\r
-               RefVector::iterator refEnd  = references.end();\r
-               \r
-               refID = 0;\r
-               // iterate over reference names, print to STDERR\r
-               for( ; refIter != refEnd; ++refIter) {\r
-\r
-                       if ( (*refIter).RefHasAlignments ) {\r
-                               cerr << "ID:     " << refID << endl;\r
-                               cerr << "Name:   " << (*refIter).RefName << endl;\r
-                               cerr << "Length: " << (*refIter).RefLength << endl;\r
-                               cerr << endl;\r
+       cerr << "Jumping in BAM file" << endl;\r
+       if ( bReader.Jump(1, 5000) ) {\r
+               cerr << "Jumping seems ok - getting first 10 alignments that start at or after chr2:5000" << endl;\r
+\r
+               alignmentsRead = 0;\r
+               while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) {\r
+                       if ( bAlignment.Position >= 5000 ) { \r
+                               cerr << "Alignment " << alignmentsRead << endl;\r
+                               cerr << bAlignment.Name << endl;\r
+                               cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl;\r
+                               ++alignmentsRead;\r
                        }\r
-\r
-                       ++refID;\r
                }\r
        }\r
 \r
-       // --------------------------------------------------------------------------------------------- //\r
-       // Get the SAM-style header text, if available (not available in the example file shown here)\r
-       // --------------------------------------------------------------------------------------------- //     \r
-       \r
-       cerr << "------------------------------------" << endl;\r
-       cerr << "SAM header text: " << endl;\r
-       \r
-       // get (SAM) header text\r
-       string header = bReader.GetHeaderText();\r
-       \r
-       cerr << ( (header.empty()) ? "no header data" : header ) << endl;\r
-       cerr << "------------------------------------" << endl;\r
-\r
-       // --------------------------------------------------------------------------------------------- //\r
-       // Here we start accessing alignments\r
-       // The first method shows how to iterate over all reads in a BamFile\r
-       // This method will work on any BAM file (sorted/non-sorted, with/without an index)\r
-       // --------------------------------------------------------------------------------------------- //\r
-\r
-       // Call Rewind() to make sure you're at the 1st alignment in the file\r
-       // Please note, however, it's not necessary in this case, since file pointer initially set to 1st alignment\r
-       // but this is probably a good habit to develop to ensure correctness and make your intent obvious in the code\r
-\r
-       cerr << "Getting (up to) first 1000 alignments" << endl;\r
-\r
-       // start at 1st alignment\r
+       cerr << "Rewinding BAM file" << endl;\r
        if ( bReader.Rewind() ) {\r
+               cerr << "Rewind seems to be ok - getting first 10 alignments" << endl;\r
                \r
-               alignmentCount = 0;\r
-               while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) {\r
-               \r
-                       // disregard unmapped alignments\r
-                       if ( bAlignment.IsMapped() ) {\r
-\r
-                               ++alignmentCount;\r
-                               \r
-                               cout << "----------------------------" << endl;\r
-                               cout << "Alignment " << alignmentCount << endl;\r
-                               cout << bAlignment.Name << endl;\r
-                               cout << bAlignment.AlignedBases << endl;\r
-                               cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;\r
-                               \r
-                               cout << "Cigar Data: " << endl;\r
-\r
-                               vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
-                               vector<CigarOp>::const_iterator cigarEnd  = bAlignment.CigarData.end();\r
-                               for ( ; cigarIter != cigarEnd; ++cigarIter ) {\r
-                                       cout << "Type: " << (*cigarIter).Type << "\tLength: " << (*cigarIter).Length << endl;\r
-                               }\r
-\r
-                               if(!bAlignment.TagData.empty()) {\r
-                                 cout << "Tag data is present." << endl;\r
-                                 string readGroup;\r
-                                 if(bAlignment.GetReadGroup(readGroup)) {\r
-                                   cout << "- read group: " << readGroup << endl;\r
-                                 }\r
-                               }\r
-                       }\r
+               alignmentsRead = 0;\r
+               while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) {\r
+                       cerr << "Alignment " << alignmentsRead << endl;\r
+                       cerr << bAlignment.Name << endl;\r
+                       cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl;\r
+                       ++alignmentsRead;\r
                }\r
-\r
-               cerr << "Found " << alignmentCount << " alignments." << endl;\r
-       } else { cerr << "Could not rewind" << endl; }\r
-\r
-       // ---------------------------------------------------------------------------------------------------------- //\r
-       // You can iterate over each individual alignment that overlaps a specified region\r
-       // Set the refID & left boundary parameters using Jump(refID, left)\r
-       // Jump() actually positions the file pointer actually before the left boundary\r
-       // This ensures that reads beginning before, but overlapping 'left' are included as well \r
-       // Client is responsible for setting/checking right boundary - see below\r
-       // ---------------------------------------------------------------------------------------------------------- //\r
-/*\r
-       cerr << "Jumping to region" << endl;\r
-\r
-       // get refID using a known refName\r
-       refName = "seq1";\r
-       refID = bReader.GetRefID(refName);\r
-       if (refID == (int)references.size()) { \r
-               cerr << "Reference " << refName << " not found." << endl;\r
-               exit(1);\r
        }\r
 \r
-       // set left boundary\r
-       unsigned int leftBound = 500;\r
-\r
-       // set right boundary - either user-specified number to set a discrete region\r
-       //                      OR you can query the BamReader for the end of the reference\r
-       unsigned int rightBound = references.at(refID).RefLength;\r
-\r
-       cerr << endl;\r
-       cerr << "Iterating over alignments on reference: " << refName << " from " << leftBound << " to ref end (" << rightBound << ")" << endl;\r
-\r
-       // set region - specific region on reference\r
-       if ( bReader.Jump(refID, leftBound) ) { \r
-\r
-               alignmentCount = 0;\r
-               while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= rightBound) ) {\r
-               \r
-                       if ( bAlignment.IsMapped() ) {\r
-\r
-                               ++alignmentCount;\r
-                       \r
-                               if ( (alignmentCount % 100000) == 0) { cerr << "Retrieved " << alignmentCount << " so far..." << endl; }\r
-                       \r
-                               cout << "----------------------------" << endl;\r
-                               cout << "Alignment " << alignmentCount << endl;\r
-                               cout << bAlignment.Name << endl;\r
-                               cout << bAlignment.AlignedBases << endl;\r
-                               cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;\r
-                       }\r
-               }\r
-\r
-               cerr << "Found " << alignmentCount << " alignments." << endl; \r
-       } else { cerr << "Could not jump to region specified" << endl; }\r
-\r
-       // ----------------------------------------------------------------------------------------------------- //\r
-       // You can 'rewind' back to beginning of BAM file at any time \r
-       // ----------------------------------------------------------------------------------------------------- //\r
-\r
-       cerr << endl;\r
-       cerr << "Rewinding BAM file... then getting first 1000 alignments" << endl;\r
-\r
-       alignmentCount = 0;\r
-       if (bReader.Rewind() ) {\r
-               while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) {\r
-               \r
-                       // disregard unmapped alignments\r
-                       if ( bAlignment.IsMapped() ) {\r
-\r
-                               ++alignmentCount;\r
-                       \r
-                               cout << "----------------------------" << endl;\r
-                               cout << "Alignment " << alignmentCount << endl;\r
-                               cout << bAlignment.Name << endl;\r
-                               cout << bAlignment.AlignedBases << endl;\r
-                               cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;\r
-                       }\r
-               }\r
-\r
-               cerr << "Found " << alignmentCount << " alignments." << endl;\r
-       } else { cerr << "Could not rewind" << endl; }\r
-*/\r
-       // ---------------------------------------------------------------------- //\r
-       // Close BamReader object (releases internal header/index data) and exit\r
-       // ---------------------------------------------------------------------- //\r
-       \r
-       cerr << endl;\r
-       cerr << "Closing BAM file: " << bamFilename << endl;\r
-       \r
+       cerr << "Closing BAM file..." << endl << endl;\r
        bReader.Close();\r
 \r
        cerr << "Exiting..." << endl << endl;\r
index d375612958309c2396ebf1dcc47fc4dfa7da4890..18f8e0a08e2ab7e9f2c5cb70fff2263e3ba50c87 100644 (file)
@@ -57,6 +57,8 @@ using namespace std;
 #define SIZEOF_INT 4
 
 // define our BZGF structure
+#ifndef BGZF_DATA
+#define BGZF_DATA
 struct BgzfData {
        unsigned int UncompressedBlockSize;
        unsigned int CompressedBlockSize;
@@ -95,6 +97,7 @@ struct BgzfData {
                if(UncompressedBlock) delete [] UncompressedBlock;
        }
 };
+#endif // BGZF_DATA
 
 class BamWriter {
 public:
index 8862226d50cf01812083549ca1d71f41dc29cde0..40e3ecdfe9afcd48a0adad9964a334c3c3245276 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,39 +1,18 @@
-CC=            gcc\r
 CXX=           g++\r
-CFLAGS=                -Wall -O3\r
-CXXFLAGS=      $(CFLAGS)\r
-DFLAGS=                -D_IOLIB=2 #-D_NDEBUG\r
-OBJS=          BamReader.o bgzf.o\r
-PROG=          BamReaderTest\r
-INCLUDES=       \r
-ARFLAGS=       -crs\r
+CXXFLAGS=      -Wall -O3\r
+PROG=          BamReaderTest BamConversion\r
 LIBS=          -lz\r
-SUBDIRS=       .\r
 \r
-.SUFFIXES:.c .cpp .o\r
+all: $(PROG)\r
 \r
-.c.o:\r
-               $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@\r
+BamReaderTest: BamReader.o BamReaderMain.o\r
+       $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamReaderMain.o $(LIBS)\r
 \r
-.cpp.o:\r
-               $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@\r
-\r
-all: $(PROG) BamConversion\r
-\r
-lib:libbambc.a\r
-\r
-libbambc.a:$(OBJS)\r
-               $(AR) $(ARFLAGS) $@ $(OBJS)\r
-\r
-BamReaderTest:lib BamReaderMain.o\r
-               $(CXX) $(CXXFLAGS) -o $@ BamReaderMain.o $(LIBS) -L. -lbambc\r
-\r
-BamConversion: lib BamWriter.o BamConversionMain.o\r
-       $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamConversionMain.o $(LIBS) -L. -lbambc\r
-\r
-BamMerge: lib BamMerge.o\r
-               $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamMerge.o $(LIBS) -L. -lbambc\r
+BamConversion: BamReader.o BamWriter.o BamConversionMain.o\r
+       $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamConversionMain.o $(LIBS)\r
 \r
+BamMerge: BamReader.o BamWriter.o BamMerge.o\r
+       $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamMerge.o $(LIBS)\r
 \r
 clean:\r
-               rm -fr gmon.out *.o *.a a.out $(PROG) BamConversion *~\r
+               rm -fr gmon.out *.o *.a a.out *~\r
diff --git a/README b/README
index 639bfd107d103185d08973bd77cf79a736271233..9553d1ac96ecee5acef3bce6235edf4cc1c6e04e 100644 (file)
--- a/README
+++ b/README
@@ -2,10 +2,27 @@
 README
 ----------------
 
-Copy the 4 header files (*.h) and the library file (libbambc.a) to your directory.
 
-When you compile your program, add this to the g++ command:
+Significant re-write of BamReader API as of 5/18/2009:
+-------------------------------------------------------
+No longer using bgzf.h/.c
+No longer using libbambc.a library
+No longer using STL Utilities
+Minor changes to API syntax in opening BAM files with BamReader - see BamReader.h for details
 
-           -lz -L. -lbambc
 
-and you should be good to go.  (Those are lower case l's, as in 'lion'.)
+Compile instructions
+----------------------
+Short version:
+See Makefile for compilation example.
+
+Longer version:
+Copy BamAlignment.h, BamReader.* and/or BamWriter.* to working directory.
+#include "BamReader.h" and/or "BamWriter.h"
+Use standard compile commands, including the "-lz" parameter to access zlib functionality.
+(Note that "-L. -lbambc" are no longer necessary and WILL CAUSE PROBLEMS for compiler/linker )
+
+
+Feel free to email me or pop by my desk with any questions, comments, suggestions, etc.
+  - Derek 
+
diff --git a/STLUtilities.h b/STLUtilities.h
deleted file mode 100644 (file)
index 7a9fbc5..0000000
+++ /dev/null
@@ -1,146 +0,0 @@
-// STLUtilities.h\r
-\r
-// Derek Barnett\r
-// Marth Lab, Boston College\r
-// Last modified: 19 March 2009\r
-\r
-#ifndef STL_UTILITIES_H\r
-#define STL_UTILITIES_H\r
-\r
-#include <algorithm>\r
-using std::distance;\r
-using std::find;\r
-using std::transform;\r
-\r
-#include <functional>\r
-using std::unary_function;\r
-\r
-#include <iterator>\r
-using std::iterator_traits;\r
-\r
-#include <string>\r
-using std::string;\r
-\r
-#include <utility>\r
-using std::pair;\r
-\r
-#include <vector>\r
-using std::vector;\r
-\r
-// -------------------------------------------------------------------------------- //\r
-// This file contains a sample of STL tricks I've gathered along the way.\r
-//\r
-// Many thanks throughout to 'Effective' series by Scott Meyers.\r
-// Some code is lifted (almost) verbatim from his texts where applicable.\r
-// ------------------------------------------------------------------------------- //\r
-\r
-// --------------------------------------------------------------------------------------------------------------------------------- //\r
-// STL containers will delete the values they hold when the container is deleted or goes out of scope\r
-// *** If the container contains pointers, however, they WILL NOT delete the actual pointed-to objects ***\r
-// This struct allows for easy algorithm processing (instead of pesky loops and iterator-boundary issues) to delete pointed-to objects (for any C++ type) that have been allocated with 'new')\r
-// Usage example:    for_each( container.begin(), container.end(), DeleteObject() ) \r
-//\r
-// Unless of course, you like quirky, hard-to-spot memory leaks... then feel free to disregard this little STL lesson   =)\r
-// --------------------------------------------------------------------------------------------------------------------------------- //\r
-struct DeleteObject {\r
-       template<typename T>\r
-       void operator() (const T* p) const { \r
-               delete p; \r
-       }\r
-};\r
-\r
-template<typename T>\r
-void ClearPointerVector(vector<T>& v) {\r
-       if ( !v.empty() ) {\r
-               for_each(v.begin(), v.end(), DeleteObject());\r
-               v.clear();\r
-       }\r
-}\r
-\r
-// --------------------------------------------------------------------------------------------------------------------------------- //\r
-// Query a vector (other containers? havent tried) for an element\r
-// Returns the index of that element\r
-// Returns vector::size() if not found\r
-// Works with reverse iterators as well... index is counted backward from last element though\r
-// --------------------------------------------------------------------------------------------------------------------------------- //\r
-template < typename InputIterator, typename EqualityComparable >\r
-typename iterator_traits<InputIterator>::difference_type\r
-Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {\r
-       return distance(begin, find(begin, end, item));\r
-}\r
-\r
-// ----------------------------------------------------------------------------------------------------------------------//\r
-// This next section is a sort of work-around for the bulky associative maps\r
-// The idea is to use a *SORTED* vector of pair objects as a lookup vector\r
-// LookupVector = vector< pair<Key, Value> > \r
-// ----------------------------------------------------------------------------------------------------------------------//\r
-\r
-// The following template classes allow a templatized comparison function for sorting & searching lookup vector\r
-\r
-// allows sorting by Key ( less<Key> )\r
-template <typename Key, typename Value>\r
-class LookupKeyCompare {\r
-\r
-       typedef pair< Key, Value > LookupData;\r
-       typedef typename LookupData::first_type Key_t;\r
-       \r
-       public:\r
-               bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }\r
-               bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }\r
-               bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }\r
-       private:\r
-               bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }\r
-};\r
-\r
-// allows sorting by Value ( less<Value> )\r
-template <typename Key, typename Value>\r
-class LookupValueCompare {\r
-\r
-       typedef pair< Key, Value > LookupData;\r
-       typedef typename LookupData::second_type Value_t;\r
-       \r
-       public:\r
-               bool operator() (const LookupData& lhs, const LookupData& rhs) const { return valueLess(lhs.second, rhs.second); }\r
-               bool operator() (const LookupData& lhs, const Value_t& k) const { return valueLess(lhs.second, k); }\r
-               bool operator() (const Value_t& k, const LookupData& rhs) const { return valueLess(k, rhs.second); }\r
-       private:\r
-               bool valueLess(const Value_t& k1, const Value_t& k2) const { return k1 < k2; }\r
-};\r
-\r
-// The following template functions/structs allow you to pull all keys or all values from this lookup structure\r
-\r
-// pull Key from a data pair\r
-template<typename Key, typename Value>\r
-struct RipKey:\r
-       public unary_function< pair<Key,Value>, Key> {\r
-               Key operator() (pair<Key,Value> dataPair) { \r
-                       return dataPair.first; \r
-               }\r
-};\r
-\r
-// pull Value from a data pair\r
-template<typename Key, typename Value>\r
-struct RipValue:\r
-       public unary_function< pair<Key,Value>, Value> {\r
-               Value operator() (pair<Key,Value> dataPair) { \r
-                       return dataPair.second; \r
-               }\r
-};\r
-\r
-// pull all Keys from lookup, store in dest (overwrite contents of dest)\r
-template <typename Key, typename Value>\r
-void RipKeys( vector< pair<Key,Value> >& lookup, vector<Key>& dest) {\r
-       dest.clear();\r
-       dest.reserve(lookup.size());\r
-       transform(lookup.begin(), lookup.end(), back_inserter(dest), RipKey<Key,Value>());\r
-}\r
-\r
-// pull all Values from lookup, store in dest (overwrite contents of dest)\r
-template <typename Key, typename Value>\r
-void RipValues( vector< pair<Key,Value> >& lookup, vector<Value>& dest) {\r
-       dest.clear();\r
-       dest.reserve(lookup.size());\r
-       transform(lookup.begin(), lookup.end(), back_inserter(dest), RipValue<Key,Value>());\r
-}\r
-\r
-#endif /* STL_UTILITIES_H */\r
diff --git a/bgzf.c b/bgzf.c
deleted file mode 100644 (file)
index 6481ee3..0000000
--- a/bgzf.c
+++ /dev/null
@@ -1,493 +0,0 @@
-/*
- * The Broad Institute
- * SOFTWARE COPYRIGHT NOTICE AGREEMENT
- * This software and its documentation are copyright 2008 by the
- * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
- *
- * This software is supplied without any warranty or guaranteed support whatsoever.
- * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
- * or functionality.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "bgzf.h"
-
-#ifdef WIN32
-
-#else
-#include <unistd.h>
-extern off_t ftello(FILE *stream);
-extern int fseeko(FILE *stream, off_t offset, int whence);
-#endif
-
-typedef int8_t byte;
-
-static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
-static const int MAX_BLOCK_SIZE = 64 * 1024;
-
-static const int BLOCK_HEADER_LENGTH = 18;
-static const int BLOCK_FOOTER_LENGTH = 8;
-
-static const int GZIP_ID1 = 31;
-static const int GZIP_ID2 = 139;
-static const int CM_DEFLATE = 8;
-static const int FLG_FEXTRA = 4;
-static const int OS_UNKNOWN = 255;
-static const int BGZF_ID1 = 66; // 'B'
-static const int BGZF_ID2 = 67; // 'C'
-static const int BGZF_LEN = 2;
-static const int BGZF_XLEN = 6; // BGZF_LEN+4
-
-static const int GZIP_WINDOW_BITS = -15; // no zlib header
-static const int Z_DEFAULT_MEM_LEVEL = 8;
-
-
-inline
-void
-packInt16(uint8_t* buffer, uint16_t value)
-{
-    buffer[0] = value;
-    buffer[1] = value >> 8;
-}
-
-inline
-int
-unpackInt16(const uint8_t* buffer)
-{
-    return (buffer[0] | (buffer[1] << 8));
-}
-
-inline
-void
-packInt32(uint8_t* buffer, uint32_t value)
-{
-    buffer[0] = value;
-    buffer[1] = value >> 8;
-    buffer[2] = value >> 16;
-    buffer[3] = value >> 24;
-}
-
-inline
-int
-min(int x, int y)
-{
-    return (x < y) ? x : y;
-}
-
-static
-void
-report_error(BGZF* fp, const char* message) {
-    fp->error = message;
-}
-
-static
-BGZF*
-open_read(int fd)
-{
-    FILE* file = fdopen(fd, "r");
-    BGZF* fp = (BGZF*)malloc(sizeof(BGZF));
-    fp->file_descriptor = fd;
-    fp->open_mode = 'r';
-    fp->owned_file = 0;
-    fp->file = file;
-    fp->uncompressed_block_size = MAX_BLOCK_SIZE;
-    fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
-    fp->compressed_block_size = MAX_BLOCK_SIZE;
-    fp->compressed_block = malloc(MAX_BLOCK_SIZE);
-    fp->block_address = 0;
-    fp->block_offset = 0;
-    fp->block_length = 0;
-    fp->error = NULL;
-    return fp;
-}
-
-static
-BGZF*
-open_write(int fd)
-{
-    FILE* file = fdopen(fd, "w");
-    BGZF* fp = (BGZF*)malloc(sizeof(BGZF));
-    fp->file_descriptor = fd;
-    fp->open_mode = 'w';
-    fp->owned_file = 0;
-    fp->file = file;
-    fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
-    fp->uncompressed_block = NULL;
-    fp->compressed_block_size = MAX_BLOCK_SIZE;
-    fp->compressed_block = malloc(MAX_BLOCK_SIZE);
-    fp->block_address = 0;
-    fp->block_offset = 0;
-    fp->block_length = 0;
-    fp->error = NULL;
-    return fp;
-}
-
-BGZF*
-bgzf_open(const char* __restrict path, const char* __restrict mode)
-{
-    BGZF* fp = NULL;
-    if (strcasecmp(mode, "r") == 0) {
-       int oflag = O_RDONLY;
-       int fd = open(path, oflag);
-        fp = open_read(fd);
-    } else if (strcasecmp(mode, "w") == 0) {
-       int oflag = O_WRONLY | O_CREAT | O_TRUNC;
-       int fd = open(path, oflag, 0644);
-        fp = open_write(fd);
-    }
-    if (fp != NULL) {
-        fp->owned_file = 1;
-    }
-    return fp;
-}
-
-BGZF*
-bgzf_fdopen(int fd, const char * __restrict mode)
-{
-    if (strcasecmp(mode, "r") == 0) {
-        return open_read(fd);
-    } else if (strcasecmp(mode, "w") == 0) {
-        return open_write(fd);
-    } else {
-        return NULL;
-    }
-}
-
-static
-int
-deflate_block(BGZF* fp, int block_length)
-{
-    // Deflate the block in fp->uncompressed_block into fp->compressed_block.
-    // Also adds an extra field that stores the compressed block length.
-
-    byte* buffer = (byte*)fp->compressed_block;
-    int buffer_size = fp->compressed_block_size;
-
-    // Init gzip header
-    buffer[0] = GZIP_ID1;
-    buffer[1] = GZIP_ID2;
-    buffer[2] = CM_DEFLATE;
-    buffer[3] = FLG_FEXTRA;
-    buffer[4] = 0; // mtime
-    buffer[5] = 0;
-    buffer[6] = 0;
-    buffer[7] = 0;
-    buffer[8] = 0;
-    buffer[9] = OS_UNKNOWN;
-    buffer[10] = BGZF_XLEN;
-    buffer[11] = 0;
-    buffer[12] = BGZF_ID1;
-    buffer[13] = BGZF_ID2;
-    buffer[14] = BGZF_LEN;
-    buffer[15] = 0;
-    buffer[16] = 0; // placeholder for block length
-    buffer[17] = 0;
-
-    // loop to retry for blocks that do not compress enough
-    int input_length = block_length;
-    int compressed_length = 0;
-    while (1) {
-
-        z_stream zs;
-        zs.zalloc = NULL;
-        zs.zfree = NULL;
-        zs.next_in = (Bytef*)fp->uncompressed_block;
-        zs.avail_in = input_length;
-        zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
-        zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
-
-        int status = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
-                                  GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
-        if (status != Z_OK) {
-            report_error(fp, "deflate init failed");
-            return -1;
-        }
-        status = deflate(&zs, Z_FINISH);
-        if (status != Z_STREAM_END) {
-            deflateEnd(&zs);
-            if (status == Z_OK) {
-                // Not enough space in buffer.
-                // Can happen in the rare case the input doesn't compress enough.
-                // Reduce the amount of input until it fits.
-                input_length -= 1024;
-                if (input_length <= 0) {
-                    // should never happen
-                    report_error(fp, "input reduction failed");
-                    return -1;
-                }
-                continue;
-            }
-            report_error(fp, "deflate failed");
-            return -1;
-        }
-        status = deflateEnd(&zs);
-        if (status != Z_OK) {
-            report_error(fp, "deflate end failed");
-            return -1;
-        }
-        compressed_length = zs.total_out;
-        compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
-        if (compressed_length > MAX_BLOCK_SIZE) {
-            // should never happen
-            report_error(fp, "deflate overflow");
-            return -1;
-        }
-        break;
-    }
-
-    packInt16((uint8_t*)&buffer[16], compressed_length-1);
-    uint32_t crc = crc32(0L, NULL, 0L);
-    crc = crc32(crc, (Bytef*)fp->uncompressed_block, input_length);
-    packInt32((uint8_t*)&buffer[compressed_length-8], crc);
-    packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
-
-    int remaining = block_length - input_length;
-    if (remaining > 0) {
-        if (remaining > input_length) {
-            // should never happen (check so we can use memcpy)
-            report_error(fp, "remainder too large");
-            return -1;
-        }
-        memcpy(fp->uncompressed_block,
-               (char*)fp->uncompressed_block + input_length,
-               remaining);
-    }
-    fp->block_offset = remaining;
-    return compressed_length;
-}
-
-static
-int
-inflate_block(BGZF* fp, int block_length)
-{
-    // Inflate the block in fp->compressed_block into fp->uncompressed_block
-
-    z_stream zs;
-    zs.zalloc = NULL;
-    zs.zfree = NULL;
-    zs.next_in = (Bytef*)fp->compressed_block + 18;
-    zs.avail_in = block_length - 16;
-    zs.next_out = (Bytef*)fp->uncompressed_block;
-    zs.avail_out = fp->uncompressed_block_size;
-
-    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
-    if (status != Z_OK) {
-        report_error(fp, "inflate init failed");
-        return -1;
-    }
-    status = inflate(&zs, Z_FINISH);
-    if (status != Z_STREAM_END) {
-        inflateEnd(&zs);
-        report_error(fp, "inflate failed");
-        return -1;
-    }
-    status = inflateEnd(&zs);
-    if (status != Z_OK) {
-        report_error(fp, "inflate failed");
-        return -1;
-    }
-    return zs.total_out;
-}
-
-static
-int
-check_header(const byte* header)
-{
-    return (header[0] == GZIP_ID1 &&
-            header[1] == (byte) GZIP_ID2 &&
-            header[2] == Z_DEFLATED &&
-            (header[3] & FLG_FEXTRA) != 0 &&
-            unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
-            header[12] == BGZF_ID1 &&
-            header[13] == BGZF_ID2 &&
-            unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
-}
-
-static
-int
-read_block(BGZF* fp)
-{
-    byte header[BLOCK_HEADER_LENGTH];
-    int64_t block_address = ftello(fp->file);
-    int count = fread(header, 1, sizeof(header), fp->file);
-    if (count == 0) {
-        fp->block_length = 0;
-        return 0;
-    }
-    if (count != sizeof(header)) {
-        report_error(fp, "read failed");
-        return -1;
-    }
-    if (!check_header(header)) {
-        report_error(fp, "invalid block header");
-        return -1;
-    }
-    int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
-    byte* compressed_block = (byte*) fp->compressed_block;
-    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
-    int remaining = block_length - BLOCK_HEADER_LENGTH;
-    count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
-    if (count != remaining) {
-        report_error(fp, "read failed");
-        return -1;
-    }
-    count = inflate_block(fp, block_length);
-    if (count < 0) {
-        return -1;
-    }
-    if (fp->block_length != 0) {
-        // Do not reset offset if this read follows a seek.
-        fp->block_offset = 0;
-    }
-    fp->block_address = block_address;
-    fp->block_length = count;
-    return 0;
-}
-
-int
-bgzf_read(BGZF* fp, void* data, int length)
-{
-    if (length <= 0) {
-        return 0;
-    }
-    if (fp->open_mode != 'r') {
-        report_error(fp, "file not open for reading");
-        return -1;
-    }
-
-    int bytes_read = 0;
-    byte* output = (byte*)data;
-    while (bytes_read < length) {
-        int available = fp->block_length - fp->block_offset;
-        if (available <= 0) {
-            if (read_block(fp) != 0) {
-                return -1;
-            }
-            available = fp->block_length - fp->block_offset;
-            if (available <= 0) {
-                break;
-            }
-        }
-        int copy_length = min(length-bytes_read, available);
-        byte* buffer = (byte*)fp->uncompressed_block;
-        memcpy(output, buffer + fp->block_offset, copy_length);
-        fp->block_offset += copy_length;
-        output += copy_length;
-        bytes_read += copy_length;
-    }
-    if (fp->block_offset == fp->block_length) {
-        fp->block_address = ftello(fp->file);
-        fp->block_offset = 0;
-        fp->block_length = 0;
-    }
-    return bytes_read;
-}
-
-static
-int
-flush_block(BGZF* fp)
-{
-    while (fp->block_offset > 0) {
-        int block_length = deflate_block(fp, fp->block_offset);
-        if (block_length < 0) {
-            return -1;
-        }
-        int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
-        if (count != block_length) {
-            report_error(fp, "write failed");
-            return -1;
-        }
-        fp->block_address += block_length;
-    }
-    return 0;
-}
-
-int
-bgzf_write(BGZF* fp, const void* data, int length)
-{
-    if (fp->open_mode != 'w') {
-        report_error(fp, "file not open for writing");
-        return -1;
-    }
-
-    if (fp->uncompressed_block == NULL) {
-        fp->uncompressed_block = malloc(fp->uncompressed_block_size);
-    }
-
-    const byte* input = (byte*)data;
-    int block_length = fp->uncompressed_block_size;
-    int bytes_written = 0;
-    while (bytes_written < length) {
-        int copy_length = min(block_length - fp->block_offset, length - bytes_written);
-        byte* buffer = (byte*)fp->uncompressed_block;
-        memcpy(buffer + fp->block_offset, input, copy_length);
-        fp->block_offset += copy_length;
-        input += copy_length;
-        bytes_written += copy_length;
-        if (fp->block_offset == block_length) {
-            if (flush_block(fp) != 0) {
-                break;
-            }
-        }
-    }
-    return bytes_written;
-}
-
-int
-bgzf_close(BGZF* fp)
-{
-    if (fp->open_mode == 'w') {
-        if (flush_block(fp) != 0) {
-            return -1;
-        }
-        if (fflush(fp->file) != 0) {
-            report_error(fp, "flush failed");
-            return -1;
-        }
-    }
-    if (fp->owned_file) {
-        if (fclose(fp->file) != 0) {
-            return -1;
-        }
-    }
-    free(fp->uncompressed_block);
-    free(fp->compressed_block);
-    free(fp);
-    return 0;
-}
-
-int64_t
-bgzf_tell(BGZF* fp)
-{
-    return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF));
-}
-
-int64_t
-bgzf_seek(BGZF* fp, int64_t pos, int where)
-{
-    if (fp->open_mode != 'r') {
-        report_error(fp, "file not open for read");
-        return -1;
-    }
-    if (where != SEEK_SET) {
-        report_error(fp, "unimplemented seek option");
-        return -1;
-    }
-    int block_offset = pos & 0xFFFF;
-    int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
-    if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
-        report_error(fp, "seek failed");
-        return -1;
-    }
-    fp->block_length = 0;  // indicates current block is not loaded
-    fp->block_address = block_address;
-    fp->block_offset = block_offset;
-    return 0;
-}
-
diff --git a/bgzf.h b/bgzf.h
deleted file mode 100644 (file)
index 080db1d..0000000
--- a/bgzf.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * The Broad Institute
- * SOFTWARE COPYRIGHT NOTICE AGREEMENT
- * This software and its documentation are copyright 2008 by the
- * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
- *
- * This software is supplied without any warranty or guaranteed support whatsoever.
- * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
- * or functionality.
- */
-
-#ifndef __BGZF_H
-#define __BGZF_H
-
-#include <stdio.h>
-#include "zlib.h"
-
-#ifdef WIN32
-#include <io.h>
-
-typedef char                 int8_t;
-typedef unsigned char       uint8_t;
-typedef short               int16_t;
-typedef unsigned short     uint16_t;
-typedef int                 int32_t;
-typedef unsigned int       uint32_t;
-typedef long long           int64_t;
-typedef unsigned long long uint64_t;
-
-#define ftello(a)     _ftelli64(a)
-#define fseeko(a,b,c) _fseeki64(a,b,c)
-#define strcasecmp    _stricmp
-#define open          _open
-#define fdopen        _fdopen
-#else
-#include <stdint.h>
-#include <stdbool.h>
-#endif
-
-typedef struct {
-    int file_descriptor;
-    char open_mode;  // 'r' or 'w'
-    bool owned_file;
-    FILE* file;
-    int uncompressed_block_size;
-    int compressed_block_size;
-    void* uncompressed_block;
-    void* compressed_block;
-    int64_t block_address;
-    int block_length;
-    int block_offset;
-    const char* error;
-} BGZF;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Open an existing file descriptor for reading or writing.
- * Mode must be either "r" or "w".
- * A subsequent bgzf_close will not close the file descriptor.
- * Returns null on error.
- */
-BGZF* bgzf_fdopen(int fd, const char* __restrict mode);
-
-/*
- * Open the specified file for reading or writing.
- * Mode must be either "r" or "w".
- * Returns null on error.
- */
-BGZF* bgzf_open(const char* path, const char* __restrict mode);
-
-/*
- * Close the BGZ file and free all associated resources.
- * Does not close the underlying file descriptor if created with bgzf_fdopen.
- * Returns zero on success, -1 on error.
- */
-int bgzf_close(BGZF* fp);
-
-/*
- * Read up to length bytes from the file storing into data.
- * Returns the number of bytes actually read.
- * Returns zero on end of file.
- * Returns -1 on error.
- */
-int bgzf_read(BGZF* fp, void* data, int length);
-
-/*
- * Write length bytes from data to the file.
- * Returns the number of bytes written.
- * Returns -1 on error.
- */
-int bgzf_write(BGZF* fp, const void* data, int length);
-
-/*
- * Return a virtual file pointer to the current location in the file.
- * No interpetation of the value should be made, other than a subsequent
- * call to bgzf_seek can be used to position the file at the same point.
- * Return value is non-negative on success.
- * Returns -1 on error.
- */
-int64_t bgzf_tell(BGZF* fp);
-
-/*
- * Set the file to read from the location specified by pos, which must
- * be a value previously returned by bgzf_tell for this file (but not
- * necessarily one returned by this file handle).
- * The where argument must be SEEK_SET.
- * Seeking on a file opened for write is not supported.
- * Returns zero on success, -1 on error.
- */
-int64_t bgzf_seek(BGZF* fp, int64_t pos, int where);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif