From: mikaels <mikaels@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Date: Fri, 10 Apr 2009 15:07:49 +0000 (+0000)
Subject: Initial import.
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=3ba4b5eb3f88fd8f24af8bf83ba218b84faa87ba;p=bamtools.git

Initial import.

git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@2 9efb377e-2e27-44b9-b91a-ec4abb80ed8b
---

diff --git a/BamAlignment.h b/BamAlignment.h
new file mode 100644
index 0000000..b299ab3
--- /dev/null
+++ b/BamAlignment.h
@@ -0,0 +1,92 @@
+// BamAlignment.h
+
+// Derek Barnett
+// Marth Lab, Boston College
+// Last modified: 20 March 2009
+
+#ifndef BAMALIGNMENT_H
+#define BAMALIGNMENT_H
+
+#include <stdint.h>
+
+// C++ includes
+#include <string>
+using std::string;
+
+#include <vector>
+using std::vector;
+
+struct CigarOp {
+	uint32_t Length;
+	char     Type;
+};
+
+struct RefData {
+  string       RefName;
+  unsigned int RefLength;
+  bool         RefHasAlignments;
+
+  // constructor
+  RefData(void)
+  : RefLength(0)
+  , RefHasAlignments(false)
+  { }
+};
+
+typedef vector<RefData> RefVector;
+
+struct BamAlignment {
+
+    // queries against alignment flag - see below for further detail
+    public:
+        bool IsPaired(void) const            { return ( (AlignmentFlag & PAIRED)        != 0 ); }
+        bool IsProperPair(void) const        { return ( (AlignmentFlag & PROPER_PAIR)   != 0 ); }
+        bool IsMapped(void) const            { return ( (AlignmentFlag & UNMAPPED)      == 0 ); }
+        bool IsMateMapped(void) const        { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
+        bool IsReverseStrand(void) const     { return ( (AlignmentFlag & REVERSE)       != 0 ); }
+        bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE)  != 0 ); }
+        bool IsFirstMate(void) const         { return ( (AlignmentFlag & READ_1)        != 0 ); }
+        bool IsSecondMate(void) const        { return ( (AlignmentFlag & READ_2)        != 0 ); }
+        bool IsPrimaryAlignment(void) const  { return ( (AlignmentFlag & SECONDARY)     == 0 ); }
+        bool IsFailedQC(void) const          { return ( (AlignmentFlag & QC_FAILED)     != 0 ); }
+        bool IsDuplicate(void) const         { return ( (AlignmentFlag & DUPLICATE)     != 0 ); }
+
+    // data members
+    public:
+        string       Name;           // read name
+        unsigned int Length;         // query length
+        string       QueryBases;     // original sequence ( produced from machine )
+        string       AlignedBases;   // aligned sequence ( with indels ) 
+        string       Qualities;      // FASTQ qualities ( still in ASCII characters )
+		vector<string> Tags;
+        unsigned int RefID;          // ID for reference sequence
+        unsigned int Position;       // position on reference sequence where alignment starts
+        unsigned int Bin;            // bin in BAM file where this alignment resides
+        unsigned int MapQuality;     // mapping quality 
+        unsigned int AlignmentFlag;  // see above for available queries
+		vector<CigarOp> CigarData;   // vector of CIGAR operations (length & type) )
+        unsigned int MateRefID;      // ID for reference sequence that mate was aligned to
+        unsigned int MatePosition;   // position that mate was aligned to
+        unsigned int InsertSize;     // mate pair insert size
+		
+
+    // alignment flag query constants
+    private:
+        enum { PAIRED        = 1,		// Alignment comes from paired-end data
+               PROPER_PAIR   = 2,		// Alignment passed paired-end resolution
+               UNMAPPED      = 4,		// Read is unmapped
+               MATE_UNMAPPED = 8,		// Mate is unmapped
+               REVERSE       = 16,		// Read is on reverse strand
+               MATE_REVERSE  = 32,		// Mate is on reverse strand
+               READ_1        = 64, 		// This alignment is mate 1 of pair
+               READ_2        = 128,		// This alignment is mate 2 of pair
+               SECONDARY     = 256,		// This alignment is not the primary (best) alignment for read
+               QC_FAILED     = 512,		// Read did not pass prior quality control steps
+               DUPLICATE     = 1024		// Read is PCR duplicate
+        };
+};
+
+// commonly used vector in this library
+typedef vector< BamAlignment > BamAlignmentVector;
+
+#endif /* BAMALIGNMENT_H */
diff --git a/BamConversionMain.cpp b/BamConversionMain.cpp
new file mode 100644
index 0000000..457e58e
--- /dev/null
+++ b/BamConversionMain.cpp
@@ -0,0 +1,46 @@
+#include <iostream>
+#include "BamReader.h"
+#include "BamWriter.h"
+
+using namespace std;
+
+int main(int argc, char* argv[]) {
+
+	if(argc != 3) {
+		cout << "USAGE: " << argv[0] << " <input BAM file> <output BAM file>" << endl;
+		exit(1);
+	}
+
+	// localize our arguments
+	const char* inputFilename  = argv[1];
+	const char* outputFilename = argv[2];
+
+	// open our BAM reader
+	BamReader reader;
+	reader.SetFilename(inputFilename);
+
+	if(!reader.Open()) {
+		cout << "ERROR: Unable to open the BAM file (" << inputFilename << ")." << endl;
+		exit(1);
+	}
+
+	// retrieve the SAM header text
+	string samHeader = reader.GetHeaderText();
+
+	// retrieve the reference sequence vector
+	RefVector referenceSequences = reader.GetReferenceData();
+
+	// open the BAM writer
+	BamWriter writer;
+	writer.Open(outputFilename, samHeader, referenceSequences);
+
+	// copy all of the reads from the input file to the output file
+	BamAlignment al;
+	while(reader.GetNextAlignment(al)) writer.SaveAlignment(al);
+
+	// close our files
+	reader.Close();
+	writer.Close();
+
+	return 0;
+}
\ No newline at end of file
diff --git a/BamReader.cpp b/BamReader.cpp
new file mode 100644
index 0000000..8fba1a9
--- /dev/null
+++ b/BamReader.cpp
@@ -0,0 +1,653 @@
+// BamReader.cpp
+
+// Derek Barnett
+// Marth Lab, Boston College
+// Last modified: 6 April 2009
+
+#include "BamReader.h"
+#include <iostream>
+using std::cerr;
+using std::endl;
+
+// static character constants
+const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";
+const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
+
+BamReader::BamReader(const char* filename, const char* indexFilename) 
+	: m_filename( (char*)filename )
+	, m_indexFilename( (char*)indexFilename )
+	, m_file(0)
+	, m_index(NULL)
+	, m_headerText("")
+	, m_isOpen(false)
+	, m_isIndexLoaded(false)
+	, m_isRegionSpecified(false)
+	, m_isCalculateAlignedBases(true)
+	, m_currentRefID(0)
+	, m_currentLeft(0)
+	, m_alignmentsBeginOffset(0)
+{
+	Open();
+}
+
+BamReader::~BamReader(void) {
+
+	// close file
+	if ( m_isOpen ) { Close(); }
+}
+
+// open BAM file
+bool BamReader::Open(void) {
+
+	if (!m_isOpen && m_filename != NULL ) {
+
+		// open file
+		m_file = bam_open(m_filename, "r"); 
+		
+		// get header info && index info
+		if ( (m_file != NULL) && LoadHeader() ) {
+
+			// save file offset where alignments start
+			m_alignmentsBeginOffset = bam_tell(m_file);
+			
+			// set open flag
+			m_isOpen = true;
+		}
+
+		// try to open (and load) index data, if index file given
+		if ( m_indexFilename != NULL ) {
+			OpenIndex();
+		}
+	}
+
+	return m_isOpen;
+}
+
+bool BamReader::OpenIndex(void) {
+
+	if ( m_indexFilename && !m_isIndexLoaded ) {
+		m_isIndexLoaded = LoadIndex();
+	}
+	return m_isIndexLoaded;
+}
+
+// close BAM file
+bool BamReader::Close(void) {
+	
+	if (m_isOpen) {
+
+		// close file
+		int ret = bam_close(m_file);
+		
+		// delete index info
+		if ( m_index != NULL) { delete m_index; }
+
+		// clear open flag
+		m_isOpen = false;
+
+		// clear index flag
+		m_isIndexLoaded = false;
+
+		// clear region flag
+		m_isRegionSpecified = false;
+
+		// return success/fail of bam_close
+		return (ret == 0);
+	} 
+
+	return true;
+}
+
+// get BAM filename
+const char* BamReader::Filename(void) const { 
+	return (const char*)m_filename; 
+}
+
+// set BAM filename
+void BamReader::SetFilename(const char* filename) {
+	m_filename = (char*)filename;
+}
+
+// get BAM Index filename
+const char* BamReader::IndexFilename(void) const { 
+	return (const char*)m_indexFilename; 
+}
+
+// set BAM Index filename
+void BamReader::SetIndexFilename(const char* indexFilename) {
+	m_indexFilename = (char*)indexFilename;
+}
+
+// return full header text
+const string BamReader::GetHeaderText(void) const { 
+	return m_headerText; 
+}
+
+// return number of reference sequences in BAM file
+const int BamReader::GetReferenceCount(void) const { 
+	return m_references.size();
+}
+
+// get RefID from reference name
+const int BamReader::GetRefID(string refName) const { 
+	
+	vector<string> refNames;
+	RefVector::const_iterator refIter = m_references.begin();
+    RefVector::const_iterator refEnd  = m_references.end();
+    for ( ; refIter != refEnd; ++refIter) {
+		refNames.push_back( (*refIter).RefName );
+    }
+
+	// return 'index-of' refName (if not found, returns refNames.size())
+	return Index( refNames.begin(), refNames.end(), refName );
+}
+
+const RefVector BamReader::GetReferenceData(void) const {
+	return m_references;
+}
+
+bool BamReader::Jump(int refID, unsigned int left) {
+
+	// if index available, and region is valid
+	if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { 
+		m_currentRefID = refID;
+		m_currentLeft  = left;
+		m_isRegionSpecified = true;
+		return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 );
+	}
+	return false;
+}
+
+bool BamReader::Rewind(void) {
+
+	int refID = 0;
+	int refCount = m_references.size();
+	for ( ; refID < refCount; ++refID ) {
+		if ( m_references.at(refID).RefHasAlignments ) { break; } 
+	}
+
+	m_currentRefID = refID;
+	m_currentLeft = 0;
+	m_isRegionSpecified = false;
+
+	return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );
+}	
+
+// get next alignment from specified region
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
+
+	// try to load 'next' read
+	if ( LoadNextAlignment(bAlignment) ) {
+
+		// if specified region, check for region overlap
+		if ( m_isRegionSpecified ) {
+
+			// if overlap, return true
+			if ( IsOverlap(bAlignment) ) { return true; }
+			// if not, try the next alignment
+			else { return GetNextAlignment(bAlignment); }
+		} 
+
+		// not using region, valid read detected, return success
+		else { return true; }
+	}
+
+	// no valid alignment to load
+	return false;
+}
+
+void BamReader::SetCalculateAlignedBases(bool flag) {
+	m_isCalculateAlignedBases = flag;
+}
+
+int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
+
+	// get region boundaries
+	uint32_t begin = left;
+	uint32_t end   = m_references.at(refID).RefLength - 1;
+
+	// initialize list, bin '0' always a valid bin
+	int i = 0;
+	list[i++] = 0;
+
+	// get rest of bins that contain this region
+	unsigned int k;
+	for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { list[i++] = k; }
+	for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { list[i++] = k; }
+	for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { list[i++] = k; }
+	for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { list[i++] = k; }
+	for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
+	
+	// return number of bins stored
+	return i;
+}
+
+uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
+
+	// initialize alignment end to starting position
+	uint32_t alignEnd = position;
+
+	// iterate over cigar operations
+	vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
+	vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
+	for ( ; cigarIter != cigarEnd; ++cigarIter) {
+		if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {
+			alignEnd += (*cigarIter).Length;
+		}
+	}
+	return alignEnd;
+}
+
+uint64_t BamReader::GetOffset(int refID, unsigned int left) {
+
+	//  make space for bins
+	uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); 		
+	
+	// returns number of bins overlapping (left, right)
+	// stores indices of those bins in 'bins'
+	int numBins = BinsFromRegion(refID, left, bins);				
+
+	// access index data for refID
+	RefIndex* refIndex = m_index->at(refID);
+
+	// get list of BAM bins for this reference sequence
+	BinVector* refBins = refIndex->first;
+
+	sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
+
+	// declare ChunkVector
+	ChunkVector regionChunks;
+
+	// declaure LinearOffsetVector
+	LinearOffsetVector* linearOffsets = refIndex->second;
+
+	// some sort of linear offset vs bin index voodoo... not completely sure what's going here
+	uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
+
+	BinVector::iterator binBegin = refBins->begin();
+	BinVector::iterator binEnd   = refBins->end();
+
+	// iterate over bins overlapping region, count chunks
+	for (int i = 0; i < numBins; ++i) {
+		
+		// look for bin with ID=bin[i]
+		BinVector::iterator binIter = binBegin;
+
+		for ( ; binIter != binEnd; ++binIter ) {
+		
+			// if found, increment n_off by number of chunks for each bin
+			if ( (*binIter).first == (uint32_t)bins[i] ) { 
+				
+				// iterate over chunks in that bin
+				ChunkVector* binChunks = (*binIter).second;
+				
+				ChunkVector::iterator chunkIter = binChunks->begin();
+				ChunkVector::iterator chunkEnd  = binChunks->end();
+				for ( ; chunkIter != chunkEnd; ++chunkIter) {
+				
+					// if right bound of pair is greater than min_off (linear offset value), store pair
+					if ( (*chunkIter).second > minOffset) { 
+						regionChunks.push_back( (*chunkIter) ); 
+					}
+				}
+			}
+		}
+	}
+
+	// clean up temp array
+	free(bins);
+
+	// there should be at least 1
+	assert(regionChunks.size() > 0);
+
+	// sort chunks by start position
+	sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );
+
+	// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+	int numOffsets = regionChunks.size();	
+	for (int i = 1; i < numOffsets; ++i) {
+		if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {
+			regionChunks.at(i-1).second = regionChunks.at(i).first;
+		}
+	}
+	
+	// merge adjacent chunks
+	int l = 0;
+	for (int i = 1; i < numOffsets; ++i) {
+		// if adjacent, expand boundaries of (merged) chunk
+		if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {
+			regionChunks.at(l).second = regionChunks.at(i).second;
+		}
+		// else, move on to next (merged) chunk index
+		else { regionChunks.at(++l) = regionChunks.at(i); }
+	}
+
+	// return beginning file offset of first chunk for region
+	return regionChunks.at(0).first;
+}
+
+bool BamReader::IsOverlap(BamAlignment& bAlignment) {
+
+	// if on different reference sequence, quit
+	if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }
+
+	// read starts after left boundary
+	if ( bAlignment.Position >= m_currentLeft) { return true; }
+
+	// get alignment end
+	uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);
+
+	// return whether alignment end overlaps left boundary
+	return ( alignEnd >= m_currentLeft );
+}
+
+bool BamReader::LoadHeader(void) {
+
+	// check to see if proper BAM header
+	char buf[4];
+	if (bam_read(m_file, buf, 4) != 4) { return false; }
+	if (strncmp(buf, "BAM\001", 4)) {
+		fprintf(stderr, "wrong header type!\n");
+		return false;
+	}
+	
+	// get BAM header text length
+	int32_t headerTextLength;
+	bam_read(m_file, &headerTextLength, 4);
+
+	// get BAM header text
+	char* headerText = (char*)calloc(headerTextLength + 1, 1);
+	bam_read(m_file, headerText, headerTextLength);
+	m_headerText = (string)((const char*)headerText);
+	
+	// clean up calloc-ed temp variable
+	free(headerText);
+
+	// get number of reference sequences
+	int32_t numberRefSeqs;
+	bam_read(m_file, &numberRefSeqs, 4);
+	if (numberRefSeqs == 0) { return false; }
+
+	m_references.reserve((int)numberRefSeqs);
+	
+	// reference variables
+	int32_t  refNameLength;
+	char*    refName;
+	uint32_t refLength;
+
+	// iterate over all references in header
+	for (int i = 0; i != numberRefSeqs; ++i) {
+
+		// get length of reference name
+		bam_read(m_file, &refNameLength, 4);
+		refName = (char*)calloc(refNameLength, 1);
+
+		// get reference name and reference sequence length
+		bam_read(m_file, refName, refNameLength);
+		bam_read(m_file, &refLength, 4);
+
+		// store data for reference
+		RefData aReference;
+		aReference.RefName   = (string)((const char*)refName);
+		aReference.RefLength = refLength;
+		m_references.push_back(aReference);
+
+		// clean up calloc-ed temp variable
+		free(refName);
+	}
+	
+	return true;
+}
+
+bool BamReader::LoadIndex(void) {
+
+	// check to see if index file exists
+	FILE* indexFile;
+	if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {
+		fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");
+		return false;
+	}
+
+	// see if index is valid BAM index
+	char magic[4];
+	fread(magic, 1, 4, indexFile);
+	if (strncmp(magic, "BAI\1", 4)) {
+		fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");
+		fclose(indexFile);
+		return false;
+	}
+
+	// get number of reference sequences
+	uint32_t numRefSeqs;
+	fread(&numRefSeqs, 4, 1, indexFile);
+	
+	// intialize BamIndex data structure
+	m_index = new BamIndex;
+	m_index->reserve(numRefSeqs);
+
+	// iterate over reference sequences
+	for (unsigned int i = 0; i < numRefSeqs; ++i) {
+		
+		// get number of bins for this reference sequence
+		int32_t numBins;
+		fread(&numBins, 4, 1, indexFile);
+		
+		if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
+
+		// intialize BinVector
+		BinVector* bins = new BinVector;
+		bins->reserve(numBins);
+		
+		// iterate over bins for that reference sequence
+		for (int j = 0; j < numBins; ++j) {
+			
+			// get binID 
+			uint32_t binID;
+			fread(&binID, 4, 1, indexFile);
+			
+			// get number of regionChunks in this bin
+			uint32_t numChunks;
+			fread(&numChunks, 4, 1, indexFile);
+			
+			// intialize ChunkVector
+			ChunkVector* regionChunks = new ChunkVector;
+			regionChunks->reserve(numChunks);
+			
+			// iterate over regionChunks in this bin
+			for (unsigned int k = 0; k < numChunks; ++k) {
+				
+				// get chunk boundaries (left, right) 
+				uint64_t left;
+				uint64_t right;
+				fread(&left, 8, 1, indexFile);
+				fread(&right, 8, 1, indexFile);
+				
+				// save ChunkPair
+				regionChunks->push_back( ChunkPair(left, right) );
+			}
+			
+			// save binID, chunkVector for this bin
+			bins->push_back( BamBin(binID, regionChunks) );
+		}
+		
+		// load linear index for this reference sequence
+		
+		// get number of linear offsets
+		int32_t numLinearOffsets;
+		fread(&numLinearOffsets, 4, 1, indexFile);
+		
+		// intialize LinearOffsetVector
+		LinearOffsetVector* linearOffsets = new LinearOffsetVector;
+		linearOffsets->reserve(numLinearOffsets);
+		
+		// iterate over linear offsets for this reference sequeence
+		for (int j = 0; j < numLinearOffsets; ++j) {
+			// get a linear offset
+			uint64_t linearOffset;
+			fread(&linearOffset, 8, 1, indexFile);
+			// store linear offset
+			linearOffsets->push_back(linearOffset);
+		}
+		
+		// store index data for that reference sequence
+		m_index->push_back( new RefIndex(bins, linearOffsets) );
+	}
+	
+	// close index file (.bai) and return
+	fclose(indexFile);
+	return true;
+}
+
+bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
+
+	// check valid alignment block header data
+	int32_t block_len;
+	int32_t ret;
+	uint32_t x[8];
+
+	int32_t bytesRead = 0;
+
+	// read in the 'block length' value, make sure it's not zero
+	if ( (ret = bam_read(m_file, &block_len, 4)) == 0 )        { return false; }
+	bytesRead += 4;
+
+	// read in core alignment data, make the right size of data was read 
+	if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
+	bytesRead += BAM_CORE_SIZE;
+
+	// set BamAlignment 'core' data
+	bAlignment.RefID         = x[0]; 
+	bAlignment.Position      = x[1];
+	bAlignment.Bin           = x[2]>>16; 
+	bAlignment.MapQuality    = x[2]>>8&0xff; 
+	bAlignment.AlignmentFlag = x[3]>>16; 
+	bAlignment.MateRefID     = x[5]; 
+	bAlignment.MatePosition  = x[6]; 
+	bAlignment.InsertSize    = x[7];
+
+	// fetch & store often-used lengths for character data parsing
+	unsigned int queryNameLength     = x[2]&0xff;
+	unsigned int numCigarOperations  = x[3]&0xffff;
+	unsigned int querySequenceLength = x[4];
+	
+	// get length of character data
+	int dataLength = block_len - BAM_CORE_SIZE;
+
+	// set up destination buffers for character data
+	uint8_t*  allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);
+	uint32_t* cigarData   = (uint32_t*)(allCharData+queryNameLength);
+	
+	// get character data - make sure proper data size was read
+	if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }
+	else {
+
+		bytesRead += dataLength;
+
+		// clear out bases, qualities, aligned bases, and CIGAR
+		bAlignment.QueryBases.clear();
+		bAlignment.Qualities.clear();
+		bAlignment.AlignedBases.clear();
+		bAlignment.CigarData.clear();
+
+		// save name
+		bAlignment.Name = (string)((const char*)(allCharData));
+		
+		// save bases
+		char singleBase[2];
+		uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);
+		for (unsigned int i = 0; i < querySequenceLength; ++i) { 
+			// numeric to char conversion
+			sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );
+			// append character data to Bases
+			bAlignment.QueryBases.append( (const char*)singleBase );
+		}
+		
+		// save sequence length
+		bAlignment.Length = bAlignment.QueryBases.length();
+		
+		// save qualities
+		char singleQuality[4];
+		uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);
+		for (unsigned int i = 0; i < querySequenceLength; ++i) { 
+			// numeric to char conversion
+			sprintf( singleQuality, "%c", ( t[i]+33 ) ); 
+			// append character data to Qualities
+			bAlignment.Qualities.append( (const char*)singleQuality );
+		}
+		
+		// save CIGAR-related data;
+		int k = 0;
+		for (unsigned int i = 0; i < numCigarOperations; ++i) {
+
+			// build CigarOp struct
+			CigarOp op;
+			op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
+			op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
+
+			// save CigarOp
+			bAlignment.CigarData.push_back(op);
+
+			// can skip this step if user wants to ignore
+			if (m_isCalculateAlignedBases) {
+
+				// build AlignedBases string
+				switch (op.Type) {
+					
+					case ('M') : 
+					case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); 	// for 'M', 'I' - write bases
+					case ('S') : k += op.Length;                                                     			// for 'S' - skip over query bases
+								 break;
+					
+					case ('D') : 
+					case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );	// for 'D', 'P' - write padding;
+								 break;
+					
+					case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence
+								 k += op.Length;
+								 break;
+
+					case ('H') : break; 											// for 'H' - do nothing, move to next op
+					
+					default    : assert(false);	// shouldn't get here
+								 break;
+				}
+			}
+		}       
+	}
+	free(allCharData);
+
+/*
+	// (optional) read tag parsing data
+	string tag;
+	char data;
+	int i = 0;
+
+	// still data to read (auxiliary tags)
+	while ( bytesRead < block_len ) {
+
+		if ( bam_read(m_file, &data, 1) == 1 ) { 
+			
+			++bytesRead;
+
+			if (bytesRead == block_len && data != '\0') {
+				fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");
+				exit(1);
+			}
+
+			tag.append(1, data);
+			if ( data == '\0' ) {
+				bAlignment.Tags.push_back(tag);
+				tag = "";
+				i = 0;
+			} else {
+				if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }
+				++i;
+			}
+		} else {
+			fprintf(stderr, "ERROR: Could not read tag info.\n");
+			exit(1);
+		}
+	}
+*/
+	return true;
+}
diff --git a/BamReader.h b/BamReader.h
new file mode 100644
index 0000000..786f4c2
--- /dev/null
+++ b/BamReader.h
@@ -0,0 +1,230 @@
+// BamReader.h
+
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+	Implementation of BAM-parsing was translated to C++ directly from Heng Li's SAMtools package 
+        (thus the carryover of above MIT license)
+	Contact: Derek Barnett <barnetde@bc.edu>
+*/
+
+// Derek Barnett
+// Marth Lab, Boston College
+// Last modified: 6 April 2009
+
+#ifndef BAMREADER_H
+#define BAMREADER_H
+
+// custom includes
+#include "BamAlignment.h"
+#include "STLUtilities.h"
+
+// C library includes
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// BGZF library includes/defines
+#include "bgzf.h"
+typedef BGZF* BamFile;
+#define bam_open(f_name, mode)      bgzf_open(f_name, mode)
+#define bam_close(f_ptr)            bgzf_close(f_ptr)
+#define bam_read(f_ptr, buf, size)  bgzf_read(f_ptr, buf, size)
+#define bam_write(f_ptr, buf, size) bgzf_write(f_ptr, buf, size)
+#define bam_tell(f_ptr)             bgzf_tell(f_ptr)
+#define bam_seek(f_ptr, pos, dir)   bgzf_seek(f_ptr, pos, dir)
+
+// size of alignment data block in BAM file (bytes)
+#define BAM_CORE_SIZE 32
+
+// BAM indexing constants
+#define MAX_BIN           37450 // =(8^6-1)/7+1
+#define BAM_MIN_CHUNK_GAP 32768 
+#define BAM_LIDX_SHIFT    14
+
+// CIGAR-retrieval mask/shift constants
+#define BAM_CIGAR_SHIFT 4
+#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
+
+// CIGAR-operation types
+#define BAM_CMATCH      0
+#define BAM_CINS        1
+#define BAM_CDEL        2
+#define BAM_CREF_SKIP   3
+#define BAM_CSOFT_CLIP  4
+#define BAM_CHARD_CLIP  5
+#define BAM_CPAD        6
+
+// --------------------------- //
+// Bam header info
+// --------------------------- //
+
+// --------------------------- //
+// BamIndex-related typedefs
+// --------------------------- //
+
+// offset for linear indexing
+typedef vector<uint64_t> LinearOffsetVector;
+
+// chunk boundaries
+typedef pair<uint64_t, uint64_t> ChunkPair;
+// list of chunks in a BAM bin
+typedef vector<ChunkPair> ChunkVector;
+
+// BAM bins for a reference sequence
+// replaces khash - uint32_t is key, ChunkVector is value
+typedef pair<uint32_t, ChunkVector*> BamBin;
+typedef vector<BamBin> BinVector;
+
+// each reference sequence has a BinVector and LinearOffsetVector
+typedef pair<BinVector*, LinearOffsetVector*> RefIndex;
+
+// full BamIndex defined as: 
+typedef vector<RefIndex*> BamIndex;
+
+// ---------------------------------------------------------------------------//
+
+class BamReader {
+	
+	public:
+		// constructors
+		BamReader(const char* fileName = NULL, const char* indexFilename = NULL);
+
+	public:
+		// destructor
+		~BamReader(void);
+	
+	// BAM interface methods
+	public:
+
+		// ----------------------- //
+		// File manipulation
+		// ----------------------- //
+		
+		// open BAM file (automatically opens index if provided)
+		bool Open(void);
+		
+		// open BAM index (allows index to be opened separately - i.e. sometime after the BAM file is opened)
+		bool OpenIndex(void);
+		
+		// close BAM file
+		bool Close(void);
+		
+		// get BAM filename
+		const char* Filename(void) const;
+	
+		// set BAM filename
+		void SetFilename(const char*);
+
+		// get BAM Index filename
+		const char* IndexFilename(void) const;
+		
+		// set BAM Index filename
+		void SetIndexFilename(const char*);
+
+		// ----------------------- //
+		// Access BAM header
+		// ----------------------- //
+		
+		// return full header text
+		const string GetHeaderText(void) const;
+		
+		// --------------------------------- //
+		// Access reference sequence info
+		// --------------------------------- //
+		
+		// return number of reference sequences in BAM file
+		const int GetReferenceCount(void) const;
+
+		// return vector of RefData entries
+		const RefVector GetReferenceData(void) const;
+
+		// get refID from reference name
+		const int GetRefID(string refName) const;		
+		
+		// ----------------------------------------- //
+		// File position moving
+		// ----------------------------------------- //
+
+		// jumps to 'left' position on refID
+		// actually jumps before position, so reads that overlap 'left' are included as well
+		// 'left' defaults to reference begin if not specified
+		bool Jump(int refID, unsigned int left = 0);
+
+		// Jump to beginning of BAM file, clears any region previously set by Jump()
+		bool Rewind(void);
+		
+		// ------------------------------ //
+		// Access alignments
+		// ------------------------------ //
+		
+		// get next alignment
+		bool GetNextAlignment(BamAlignment& read);
+
+		// allow user to specifiy whether 'AlignedBases' string is calculated when alignment is loaded
+		void SetCalculateAlignedBases(bool);
+
+	// internal utility methods
+	private:
+		int      BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
+		uint32_t CalculateAlignmentEnd(const unsigned int&, const vector<CigarOp>&);
+		uint64_t GetOffset(int, unsigned int);
+		bool     IsOverlap(BamAlignment&);
+		bool     LoadHeader(void);
+		bool     LoadIndex(void);
+		bool     LoadNextAlignment(BamAlignment&);
+
+	private:	
+		// main BAM reader components
+		char*      m_filename;
+		char*      m_indexFilename;
+		BamFile    m_file;
+		BamIndex*  m_index;
+		RefVector  m_references;
+		string     m_headerText;
+
+		// state flags
+		bool m_isOpen;			         // BAM file is open for processing
+		bool m_isIndexLoaded;	         // BAM Index data is loaded and available for processing
+		bool m_isRegionSpecified;        // a region has been specified - specifically, a user has called Jump()
+		bool m_isCalculateAlignedBases;  // build 'AlignedBases' string when getting an alignment, otherwise skip (default = true)
+
+		// region values
+		int m_currentRefID;
+		unsigned int m_currentLeft;
+
+		// file offset of 1st read in BAM file
+		int64_t m_alignmentsBeginOffset;
+
+	private:
+		// BAM character constants
+		static const char* DNA_LOOKUP;
+		static const char* CIGAR_LOOKUP;
+};
+
+#endif /* BAMREADER_H */
diff --git a/BamReaderMain.cpp b/BamReaderMain.cpp
new file mode 100644
index 0000000..bdf4e60
--- /dev/null
+++ b/BamReaderMain.cpp
@@ -0,0 +1,255 @@
+// BamReaderMain.cpp
+
+// Derek Barnett
+// Marth Lab, Boston College
+// Last modified: 6 April 2009
+
+#include "BamReader.h"
+
+// C++ includes
+#include <iostream>
+using std::cerr;
+using std::cout;
+using std::endl;
+
+#include <vector>
+using std::vector;
+
+#include <string>
+using std::string;
+
+int main(int argc, char* argv[]) {
+
+	// check command line parameters
+	if (argc != 3) {
+		cerr << endl;
+		cerr << "Usage: BamReaderTest <bamFile> <bamIndexFile>" << endl;
+		cerr << endl;
+		exit(1);
+	}
+
+	// get filenames from command line
+	const char* bamFilename = argv[1];
+	const char* bamIndexFilename = argv[2];
+
+	string refName;
+	int refID;
+	unsigned int leftBound;
+	unsigned int rightBound;
+
+	int alignmentCount;
+
+	vector<string> refNames;
+
+	BamAlignment bAlignment;
+	BamAlignmentVector alignments;
+
+	RefVector references;
+
+	// ------------------------------------------------------------------------------------------------------ //
+	// Declare BamReader & open BAM file - automatically loads up header and index file (.bai) information
+	// ------------------------------------------------------------------------------------------------------ //
+	
+	BamReader bReader(bamFilename, bamIndexFilename);
+
+	cerr << endl;
+	cerr << "Opening BamReader for BAM file: " << bamFilename << " ..... ";
+	
+	if ( bReader.Open() ) { 
+		cerr << "ok" << endl; 
+	}
+	else {
+		cerr << "error" << endl;
+		exit(1);
+	}
+	
+	// ------------------------------------------------------------ //
+	// Find out how many reference sequences are in BAM file
+	// ------------------------------------------------------------ //
+	
+	references = bReader.GetReferenceData();
+
+	cerr << endl;
+	cerr << "Total number of ref seqs: " << references.size() << endl;
+	
+	// ---------------------------------------------------------------------------- //
+	// Get the names/lengths of all the reference sequences that have alignments
+	// ---------------------------------------------------------------------------- //
+	
+	cerr << endl;
+	cerr << "All ref sequences with alignments:" << endl;
+	cerr << endl;
+
+
+	if ( !references.empty() ) {
+		
+		RefVector::iterator refIter = references.begin();
+		RefVector::iterator refEnd  = references.end();
+		
+		refID = 0;
+		// iterate over reference names, print to STDERR
+		for( ; refIter != refEnd; ++refIter) {
+
+			if ( (*refIter).RefHasAlignments ) {
+				cerr << "ID:     " << refID << endl;
+				cerr << "Name:   " << (*refIter).RefName << endl;
+				cerr << "Length: " << (*refIter).RefLength << endl;
+				cerr << endl;
+			}
+
+			++refID;
+		}
+	}
+
+	// --------------------------------------------------------------------------------------------- //
+	// Get the SAM-style header text, if available (not available in the example file shown here)
+	// --------------------------------------------------------------------------------------------- //	
+	
+	cerr << "------------------------------------" << endl;
+	cerr << "SAM header text: " << endl;
+	
+	// get (SAM) header text
+	string header = bReader.GetHeaderText();
+	
+	cerr << ( (header.empty()) ? "no header data" : header ) << endl;
+	cerr << "------------------------------------" << endl;
+
+	// --------------------------------------------------------------------------------------------- //
+	// Here we start accessing alignments
+	// The first method shows how to iterate over all reads in a BamFile
+	// This method will work on any BAM file (sorted/non-sorted, with/without an index)
+	// --------------------------------------------------------------------------------------------- //
+
+	// Call Rewind() to make sure you're at the 1st alignment in the file
+	// Please note, however, it's not necessary in this case, since file pointer initially set to 1st alignment
+	// but this is probably a good habit to develop to ensure correctness and make your intent obvious in the code
+
+	cerr << "Getting (up to) first 1000 alignments" << endl;
+
+	// start at 1st alignment
+	if ( bReader.Rewind() ) {
+		
+		alignmentCount = 0;
+		while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) {
+		
+			// disregard unmapped alignments
+			if ( bAlignment.IsMapped() ) {
+
+				++alignmentCount;
+				
+				cout << "----------------------------" << endl;
+				cout << "Alignment " << alignmentCount << endl;
+				cout << bAlignment.Name << endl;
+				cout << bAlignment.AlignedBases << endl;
+				cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;
+				
+				cout << "Cigar Data: " << endl;
+
+				vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();
+				vector<CigarOp>::const_iterator cigarEnd  = bAlignment.CigarData.end();
+				for ( ; cigarIter != cigarEnd; ++cigarIter ) {
+					cout << "Type: " << (*cigarIter).Type << "\tLength: " << (*cigarIter).Length << endl;
+				}
+
+				cout << "Tags: " << endl;
+				vector<string>::const_iterator tagIter = bAlignment.Tags.begin();
+				vector<string>::const_iterator tagEnd  = bAlignment.Tags.end();
+				for ( ; tagIter != tagEnd; ++tagIter ) {
+					cout << (*tagIter) << endl;
+				}
+			}
+		}
+
+		cerr << "Found " << alignmentCount << " alignments." << endl;
+	} else { cerr << "Could not rewind" << endl; }
+
+	// ---------------------------------------------------------------------------------------------------------- //
+	// You can iterate over each individual alignment that overlaps a specified region
+	// Set the refID & left boundary parameters using Jump(refID, left)
+	// Jump() actually positions the file pointer actually before the left boundary
+	// This ensures that reads beginning before, but overlapping 'left' are included as well 
+	// Client is responsible for setting/checking right boundary - see below
+	// ---------------------------------------------------------------------------------------------------------- //
+/*
+	cerr << "Jumping to region" << endl;
+
+	// get refID using a known refName
+	refName = "seq1";
+	refID = bReader.GetRefID(refName);
+	if (refID == (int)references.size()) { 
+		cerr << "Reference " << refName << " not found." << endl;
+		exit(1);
+	}
+
+	// set left boundary
+	leftBound = 500;
+
+	// set right boundary - either user-specified number to set a discrete region
+	//                      OR you can query the BamReader for the end of the reference
+	rightBound = references.at(refID).RefLength;
+
+	cerr << endl;
+	cerr << "Iterating over alignments on reference: " << refName << " from " << leftBound << " to ref end (" << rightBound << ")" << endl;
+
+	// set region - specific region on reference
+	if ( bReader.Jump(refID, leftBound) ) { 
+
+		alignmentCount = 0;
+		while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= rightBound) ) {
+		
+			if ( bAlignment.IsMapped() ) {
+
+				++alignmentCount;
+			
+				if ( (alignmentCount % 100000) == 0) { cerr << "Retrieved " << alignmentCount << " so far..." << endl; }
+			
+				cout << "----------------------------" << endl;
+				cout << "Alignment " << alignmentCount << endl;
+				cout << bAlignment.Name << endl;
+				cout << bAlignment.AlignedBases << endl;
+				cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;
+			}
+		}
+
+		cerr << "Found " << alignmentCount << " alignments." << endl; 
+	} else { cerr << "Could not jump to region specified" << endl; }
+
+	// ----------------------------------------------------------------------------------------------------- //
+	// You can 'rewind' back to beginning of BAM file at any time 
+	// ----------------------------------------------------------------------------------------------------- //
+
+	cerr << endl;
+	cerr << "Rewinding BAM file... then getting first 1000 alignments" << endl;
+
+	alignmentCount = 0;
+	if (bReader.Rewind() ) {
+		while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) {
+		
+			// disregard unmapped alignments
+			if ( bAlignment.IsMapped() ) {
+
+				++alignmentCount;
+			
+				cout << "----------------------------" << endl;
+				cout << "Alignment " << alignmentCount << endl;
+				cout << bAlignment.Name << endl;
+				cout << bAlignment.AlignedBases << endl;
+				cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;
+			}
+		}
+
+		cerr << "Found " << alignmentCount << " alignments." << endl;
+	} else { cerr << "Could not rewind" << endl; }
+*/
+	// ---------------------------------------------------------------------- //
+	// Close BamReader object (releases internal header/index data) and exit
+	// ---------------------------------------------------------------------- //
+	
+	cerr << endl;
+	cerr << "Closing BAM file: " << bamFilename << endl;
+	
+	bReader.Close();
+
+	cerr << "Exiting..." << endl << endl;
+	return 0;
+}
diff --git a/BamWriter.cpp b/BamWriter.cpp
new file mode 100644
index 0000000..24544b3
--- /dev/null
+++ b/BamWriter.cpp
@@ -0,0 +1,403 @@
+// ***************************************************************************
+// BamWriter (c) 2009 Michael Strömberg
+// Marth Lab, Deptartment of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include "BamWriter.h"
+
+// constructor
+BamWriter::BamWriter(void)
+{}
+
+// destructor
+BamWriter::~BamWriter(void) {
+	if(mBGZF.IsOpen) BgzfClose();
+}
+
+// closes the BAM file
+void BamWriter::BgzfClose(void) {
+
+	mBGZF.IsOpen = false;
+
+	// flush the BGZF block
+	BgzfFlushBlock();
+
+	// flush and close
+	fflush(mBGZF.Stream);
+	fclose(mBGZF.Stream);
+}
+
+// compresses the current block
+int BamWriter::BgzfDeflateBlock(void) {
+
+	// initialize the gzip header
+	char* buffer = mBGZF.CompressedBlock;
+	unsigned int bufferSize = mBGZF.CompressedBlockSize;
+
+	memset(buffer, 0, 18);
+	buffer[0]  = GZIP_ID1;
+	buffer[1]  = (char)GZIP_ID2;
+	buffer[2]  = CM_DEFLATE;
+	buffer[3]  = FLG_FEXTRA;
+	buffer[9]  = (char)OS_UNKNOWN;
+	buffer[10] = BGZF_XLEN;
+	buffer[12] = BGZF_ID1;
+	buffer[13] = BGZF_ID2;
+	buffer[14] = BGZF_LEN;
+
+	// loop to retry for blocks that do not compress enough
+	int inputLength = mBGZF.BlockOffset;
+	int compressedLength = 0;
+
+	while(true) {
+
+		z_stream zs;
+		zs.zalloc    = NULL;
+		zs.zfree     = NULL;
+		zs.next_in   = (Bytef*)mBGZF.UncompressedBlock;
+		zs.avail_in  = inputLength;
+		zs.next_out  = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
+		zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+		// initialize the zlib compression algorithm
+		if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
+			printf("ERROR: zlib deflate initialization failed.\n");
+			exit(1);
+		}
+
+		// compress the data
+		int status = deflate(&zs, Z_FINISH);
+		if(status != Z_STREAM_END) {
+			deflateEnd(&zs);
+
+			// reduce the input length and try again
+			if(status == Z_OK) {
+				inputLength -= 1024;
+				if(inputLength < 0) {
+					printf("ERROR: input reduction failed.\n");
+					exit(1);
+				}
+				continue;
+			}
+
+			printf("ERROR: zlib deflate failed.\n");
+			exit(1);
+		}
+
+		// finalize the compression routine
+		if(deflateEnd(&zs) != Z_OK) {
+			printf("ERROR: deflate end failed.\n");
+			exit(1);
+		}
+
+		compressedLength = zs.total_out;
+		compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+
+		if(compressedLength > MAX_BLOCK_SIZE) {
+			printf("ERROR: deflate overflow.\n");
+			exit(1);
+		}
+
+		break;
+	}
+
+	// store the compressed length
+	BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
+
+	// store the CRC32 checksum
+	unsigned int crc = crc32(0, NULL, 0);
+	crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength);
+	BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc);
+	BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength);
+
+	// ensure that we have less than a block of data left
+	int remaining = mBGZF.BlockOffset - inputLength;
+	if(remaining > 0) {
+		if(remaining > inputLength) {
+			printf("ERROR: remainder too large.\n");
+			exit(1);
+		}
+
+		memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining);
+	}
+
+	mBGZF.BlockOffset = remaining;
+	return compressedLength;
+}
+
+// flushes the data in the BGZF block
+void BamWriter::BgzfFlushBlock(void) {
+
+	// flush all of the remaining blocks
+	while(mBGZF.BlockOffset > 0) {
+
+		// compress the data block
+		int blockLength = BgzfDeflateBlock();
+
+		// flush the data to our output stream
+		int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream);
+
+		if(numBytesWritten != blockLength) {
+			printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
+			exit(1);
+		}
+
+		mBGZF.BlockAddress += blockLength;
+	}
+}
+
+// opens the BAM file for writing
+void BamWriter::BgzfOpen(const string& filename) {
+
+	mBGZF.Stream = fopen(filename.c_str(), "wb");
+
+	if(!mBGZF.Stream) {
+		printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str());
+		exit(1);
+	}
+
+	mBGZF.IsOpen = true;
+}
+
+// writes the supplied data into the BGZF buffer
+unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) {
+
+	// initialize
+	unsigned int numBytesWritten = 0;
+	const char* input = data;
+	unsigned int blockLength = mBGZF.UncompressedBlockSize;
+
+	// copy the data to the buffer
+	while(numBytesWritten < dataLen) {
+		unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten);
+		char* buffer = mBGZF.UncompressedBlock;
+		memcpy(buffer + mBGZF.BlockOffset, input, copyLength);
+
+		mBGZF.BlockOffset += copyLength;
+		input             += copyLength;
+		numBytesWritten   += copyLength;
+
+		if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock();
+	}
+
+	return numBytesWritten;
+}
+
+// closes the alignment archive
+void BamWriter::Close(void) {
+	if(mBGZF.IsOpen) BgzfClose();
+}
+
+// creates a cigar string from the supplied alignment
+void BamWriter::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
+
+	// initialize
+	const unsigned int numCigarOperations = cigarOperations.size();
+	packedCigar.resize(numCigarOperations * SIZEOF_INT);
+
+	// pack the cigar data into the string
+	unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
+
+	unsigned int cigarOp;
+	vector<CigarOp>::const_iterator coIter;
+	for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
+
+		switch(coIter->Type) {
+	  case 'M':
+		  cigarOp = BAM_CMATCH;
+		  break;
+	  case 'I':
+		  cigarOp = BAM_CINS;
+		  break;
+	  case 'D':
+		  cigarOp = BAM_CDEL;
+		  break;
+	  case 'N':
+		  cigarOp = BAM_CREF_SKIP;
+		  break;
+	  case 'S':
+		  cigarOp = BAM_CSOFT_CLIP;
+		  break;
+	  case 'H':
+		  cigarOp = BAM_CHARD_CLIP;
+		  break;
+	  case 'P':
+		  cigarOp = BAM_CPAD;
+		  break;
+	  default:
+		  printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
+		  exit(1);
+		}
+
+		*pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
+		pPackedCigar++;
+	}
+}
+
+// encodes the supplied query sequence into 4-bit notation
+void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) {
+
+	// prepare the encoded query string
+	const unsigned int queryLen = query.size();
+	const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
+	encodedQuery.resize(encodedQueryLen);
+	char* pEncodedQuery = (char*)encodedQuery.data();
+	const char* pQuery = (const char*)query.data();
+
+	unsigned char nucleotideCode;
+	bool useHighWord = true;
+
+	while(*pQuery) {
+
+		switch(*pQuery) {
+			case '=':
+				nucleotideCode = 0;
+				break;
+			case 'A':
+				nucleotideCode = 1;
+				break;
+			case 'C':
+				nucleotideCode = 2;
+				break;
+			case 'G':
+				nucleotideCode = 4;
+				break;
+			case 'T':
+				nucleotideCode = 8;
+				break;
+			case 'N':
+				nucleotideCode = 15;
+				break;
+			default:
+				printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
+				exit(1);
+		}
+
+		// pack the nucleotide code
+		if(useHighWord) {
+			*pEncodedQuery = nucleotideCode << 4;
+			useHighWord = false;
+		} else {
+			*pEncodedQuery |= nucleotideCode;
+			pEncodedQuery++;
+			useHighWord = true;
+		}
+
+		// increment the query position
+		pQuery++;
+	}
+}
+
+// opens the alignment archive
+void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
+
+	// open the BGZF file for writing
+	BgzfOpen(filename);
+
+	// ================
+	// write the header
+	// ================
+
+	// write the BAM signature
+	const unsigned char SIGNATURE_LENGTH = 4;
+	const char* BAM_SIGNATURE = "BAM\1";
+	BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH);
+
+	// write the SAM header text length
+	const unsigned int samHeaderLen = samHeader.size();
+	BgzfWrite((char*)&samHeaderLen, SIZEOF_INT);
+
+	// write the SAM header text
+	if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen);
+
+	// write the number of reference sequences
+	const unsigned int numReferenceSequences = referenceSequences.size();
+	BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT);
+
+	// =============================
+	// write the sequence dictionary
+	// =============================
+
+	RefVector::const_iterator rsIter;
+	for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
+
+		// write the reference sequence name length
+		const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;
+		BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT);
+
+		// write the reference sequence name
+		BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen);
+
+		// write the reference sequence length
+		BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT);
+	}
+}
+
+// saves the alignment to the alignment archive
+void BamWriter::SaveAlignment(const BamAlignment& al) {
+
+	// initialize
+	const unsigned int nameLen            = al.Name.size() + 1;
+	const unsigned int queryLen           = al.QueryBases.size();
+	const unsigned int numCigarOperations = al.CigarData.size();
+
+	// create our packed cigar string
+	string packedCigar;
+	CreatePackedCigar(al.CigarData, packedCigar);
+	const unsigned int packedCigarLen = packedCigar.size();
+
+	// encode the query
+	string encodedQuery;
+	EncodeQuerySequence(al.QueryBases, encodedQuery);
+	const unsigned int encodedQueryLen = encodedQuery.size();
+
+	// create our read group tag
+	// TODO: add read group tag support when it shows up in the BamAlignment struct
+	//string readGroupTag;
+	//const unsigned int readGroupTagLen = 3 + mReadGroupLUT[al.ReadGroupCode].NameLen + 1;
+	//readGroupTag.resize(readGroupTagLen);
+	//char* pReadGroupTag = (char*)readGroupTag.data();
+	//sprintf(pReadGroupTag, "RGZ%s", mReadGroupLUT[al.ReadGroupCode].Name);
+
+	// assign the BAM core data
+	unsigned int buffer[8];
+	buffer[0] = al.RefID;
+	buffer[1] = al.Position;
+	buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
+	buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
+	buffer[4] = queryLen;
+	buffer[5] = al.MateRefID;
+	buffer[6] = al.MatePosition;
+	buffer[7] = al.InsertSize;
+
+	// write the block size
+	const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen /* + readGroupTagLen*/;
+	const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
+
+	BgzfWrite((char*)&blockSize, SIZEOF_INT);
+
+	// write the BAM core
+	BgzfWrite((char*)&buffer, BAM_CORE_SIZE);
+
+	// write the query name
+	BgzfWrite(al.Name.c_str(), nameLen);
+
+	// write the packed cigar
+	BgzfWrite(packedCigar.data(), packedCigarLen);
+
+	// write the encoded query sequence
+	BgzfWrite(encodedQuery.data(), encodedQueryLen);
+
+	// write the base qualities
+	string baseQualities = al.Qualities;
+	char* pBaseQualities = (char*)al.Qualities.data();
+	for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33;
+	BgzfWrite(pBaseQualities, queryLen);
+
+	// write the read group tag
+	// TODO: add read group tag support when it shows up in the BamAlignment struct
+	//BgzfWrite(readGroupTag.c_str(), readGroupTagLen);
+}
diff --git a/BamWriter.h b/BamWriter.h
new file mode 100644
index 0000000..422d1f9
--- /dev/null
+++ b/BamWriter.h
@@ -0,0 +1,145 @@
+// ***************************************************************************
+// BamWriter (c) 2009 Michael Strömberg
+// Marth Lab, Deptartment of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include "BamAlignment.h"
+
+using namespace std;
+
+// our zlib constants
+#define GZIP_ID1             31
+#define GZIP_ID2            139
+#define CM_DEFLATE            8
+#define FLG_FEXTRA            4
+#define OS_UNKNOWN          255
+#define BGZF_XLEN             6
+#define BGZF_ID1             66
+#define BGZF_ID2             67
+#define BGZF_LEN              2
+#define GZIP_WINDOW_BITS    -15
+#define Z_DEFAULT_MEM_LEVEL   8
+
+// our BZGF constants
+#define BLOCK_HEADER_LENGTH    18
+#define BLOCK_FOOTER_LENGTH     8
+#define MAX_BLOCK_SIZE      65536
+#define DEFAULT_BLOCK_SIZE  65536
+
+// our BAM constants
+#define BAM_CORE_SIZE  32
+#define BAM_CMATCH      0
+#define BAM_CINS        1
+#define BAM_CDEL        2
+#define BAM_CREF_SKIP   3
+#define BAM_CSOFT_CLIP  4
+#define BAM_CHARD_CLIP  5
+#define BAM_CPAD        6
+#define BAM_CIGAR_SHIFT 4
+
+#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
+
+// our variable sizes
+#define SIZEOF_INT 4
+
+// define our BZGF structure
+struct BgzfData {
+	unsigned int UncompressedBlockSize;
+	unsigned int CompressedBlockSize;
+	unsigned int BlockLength;
+	unsigned int BlockOffset;
+	uint64_t BlockAddress;
+	bool IsOpen;
+	FILE* Stream;
+	char* UncompressedBlock;
+	char* CompressedBlock;
+
+	// constructor
+	BgzfData(void)
+		: UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
+		, CompressedBlockSize(MAX_BLOCK_SIZE)
+		, BlockLength(0)
+		, BlockOffset(0)
+		, BlockAddress(0)
+		, IsOpen(false)
+		, Stream(NULL)
+		, UncompressedBlock(NULL)
+		, CompressedBlock(NULL)
+	{
+		try {
+			CompressedBlock   = new char[CompressedBlockSize];
+			UncompressedBlock = new char[UncompressedBlockSize];
+		} catch(bad_alloc&) {
+			printf("ERROR: Unable to allocate memory for our BGZF object.\n");
+			exit(1);
+		}
+	}
+
+	// destructor
+	~BgzfData(void) {
+		if(CompressedBlock)   delete [] CompressedBlock;
+		if(UncompressedBlock) delete [] UncompressedBlock;
+	}
+};
+
+class BamWriter {
+public:
+	// constructor
+	BamWriter(void);
+	// destructor
+	~BamWriter(void);
+	// closes the alignment archive
+	void Close(void);
+	// opens the alignment archive
+	void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences);
+	// saves the alignment to the alignment archive
+	void SaveAlignment(const BamAlignment& al);
+private:
+	// closes the BAM file
+	void BgzfClose(void);
+	// compresses the current block
+	int BgzfDeflateBlock(void);
+	// flushes the data in the BGZF block
+	void BgzfFlushBlock(void);
+	// opens the BAM file for writing
+	void BgzfOpen(const string& filename);
+	// packs an unsigned integer into the specified buffer
+	static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value);
+	// packs an unsigned short into the specified buffer
+	static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value);
+	// writes the supplied data into the BGZF buffer
+	unsigned int BgzfWrite(const char* data, const unsigned int dataLen);
+	// calculates the minimum bin that contains a region [begin, end)
+	static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end);
+	// creates a packed cigar string from the supplied alignment
+	static void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);
+	// encodes the supplied query sequence into 4-bit notation
+	static void EncodeQuerySequence(const string& query, string& encodedQuery);
+	// our BGZF output object
+	BgzfData mBGZF;
+};
+
+// packs an unsigned integer into the specified buffer
+inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) {
+	buffer[0] = (char)value;
+	buffer[1] = (char)(value >> 8);
+	buffer[2] = (char)(value >> 16);
+	buffer[3] = (char)(value >> 24);
+}
+
+// packs an unsigned short into the specified buffer
+inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) {
+	buffer[0] = (char)value;
+	buffer[1] = (char)(value >> 8);
+}
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..41ba820
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,35 @@
+CC=		gcc
+CXX=		g++
+CFLAGS=		-Wall -pg -O3 -m64
+CXXFLAGS=	$(CFLAGS)
+DFLAGS=		-D_IOLIB=2 #-D_NDEBUG
+OBJS=		BamReader.o bgzf.o
+PROG=		BamReaderTest
+INCLUDES=       
+ARFLAGS=	-crs
+LIBS=		-lz
+SUBDIRS=	.
+
+.SUFFIXES:.c .cpp .o
+
+.c.o:
+		$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+.cpp.o:
+		$(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all: $(PROG) BamConversion
+
+lib:libbambc.a
+
+libbambc.a:$(OBJS)
+		$(AR) $(ARFLAGS) $@ $(OBJS)
+
+BamReaderTest:lib BamReaderMain.o
+		$(CXX) $(CXXFLAGS) -o $@ BamReaderMain.o $(LIBS) -L. -lbambc
+
+BamConversion: lib BamWriter.o BamConversionMain.o
+	$(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamConversionMain.o $(LIBS) -L. -lbambc
+
+clean:
+		rm -fr gmon.out *.o *.a a.out $(PROG) *~
diff --git a/README b/README
new file mode 100644
index 0000000..639bfd1
--- /dev/null
+++ b/README
@@ -0,0 +1,11 @@
+---------------
+README
+----------------
+
+Copy the 4 header files (*.h) and the library file (libbambc.a) to your directory.
+
+When you compile your program, add this to the g++ command:
+
+           -lz -L. -lbambc
+
+and you should be good to go.  (Those are lower case l's, as in 'lion'.)
diff --git a/STLUtilities.h b/STLUtilities.h
new file mode 100644
index 0000000..7a9fbc5
--- /dev/null
+++ b/STLUtilities.h
@@ -0,0 +1,146 @@
+// STLUtilities.h
+
+// Derek Barnett
+// Marth Lab, Boston College
+// Last modified: 19 March 2009
+
+#ifndef STL_UTILITIES_H
+#define STL_UTILITIES_H
+
+#include <algorithm>
+using std::distance;
+using std::find;
+using std::transform;
+
+#include <functional>
+using std::unary_function;
+
+#include <iterator>
+using std::iterator_traits;
+
+#include <string>
+using std::string;
+
+#include <utility>
+using std::pair;
+
+#include <vector>
+using std::vector;
+
+// -------------------------------------------------------------------------------- //
+// This file contains a sample of STL tricks I've gathered along the way.
+//
+// Many thanks throughout to 'Effective' series by Scott Meyers.
+// Some code is lifted (almost) verbatim from his texts where applicable.
+// ------------------------------------------------------------------------------- //
+
+// --------------------------------------------------------------------------------------------------------------------------------- //
+// STL containers will delete the values they hold when the container is deleted or goes out of scope
+// *** If the container contains pointers, however, they WILL NOT delete the actual pointed-to objects ***
+// This struct allows for easy algorithm processing (instead of pesky loops and iterator-boundary issues) to delete pointed-to objects (for any C++ type) that have been allocated with 'new')
+// Usage example:    for_each( container.begin(), container.end(), DeleteObject() ) 
+//
+// Unless of course, you like quirky, hard-to-spot memory leaks... then feel free to disregard this little STL lesson   =)
+// --------------------------------------------------------------------------------------------------------------------------------- //
+struct DeleteObject {
+	template<typename T>
+	void operator() (const T* p) const { 
+		delete p; 
+	}
+};
+
+template<typename T>
+void ClearPointerVector(vector<T>& v) {
+	if ( !v.empty() ) {
+		for_each(v.begin(), v.end(), DeleteObject());
+		v.clear();
+	}
+}
+
+// --------------------------------------------------------------------------------------------------------------------------------- //
+// Query a vector (other containers? havent tried) for an element
+// Returns the index of that element
+// Returns vector::size() if not found
+// Works with reverse iterators as well... index is counted backward from last element though
+// --------------------------------------------------------------------------------------------------------------------------------- //
+template < typename InputIterator, typename EqualityComparable >
+typename iterator_traits<InputIterator>::difference_type
+Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
+	return distance(begin, find(begin, end, item));
+}
+
+// ----------------------------------------------------------------------------------------------------------------------//
+// This next section is a sort of work-around for the bulky associative maps
+// The idea is to use a *SORTED* vector of pair objects as a lookup vector
+// LookupVector = vector< pair<Key, Value> > 
+// ----------------------------------------------------------------------------------------------------------------------//
+
+// The following template classes allow a templatized comparison function for sorting & searching lookup vector
+
+// allows sorting by Key ( less<Key> )
+template <typename Key, typename Value>
+class LookupKeyCompare {
+
+	typedef pair< Key, Value > LookupData;
+	typedef typename LookupData::first_type Key_t;
+	
+	public:
+		bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
+		bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }
+		bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }
+	private:
+		bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
+};
+
+// allows sorting by Value ( less<Value> )
+template <typename Key, typename Value>
+class LookupValueCompare {
+
+	typedef pair< Key, Value > LookupData;
+	typedef typename LookupData::second_type Value_t;
+	
+	public:
+		bool operator() (const LookupData& lhs, const LookupData& rhs) const { return valueLess(lhs.second, rhs.second); }
+		bool operator() (const LookupData& lhs, const Value_t& k) const { return valueLess(lhs.second, k); }
+		bool operator() (const Value_t& k, const LookupData& rhs) const { return valueLess(k, rhs.second); }
+	private:
+		bool valueLess(const Value_t& k1, const Value_t& k2) const { return k1 < k2; }
+};
+
+// The following template functions/structs allow you to pull all keys or all values from this lookup structure
+
+// pull Key from a data pair
+template<typename Key, typename Value>
+struct RipKey:
+	public unary_function< pair<Key,Value>, Key> {
+		Key operator() (pair<Key,Value> dataPair) { 
+			return dataPair.first; 
+		}
+};
+
+// pull Value from a data pair
+template<typename Key, typename Value>
+struct RipValue:
+	public unary_function< pair<Key,Value>, Value> {
+		Value operator() (pair<Key,Value> dataPair) { 
+			return dataPair.second; 
+		}
+};
+
+// pull all Keys from lookup, store in dest (overwrite contents of dest)
+template <typename Key, typename Value>
+void RipKeys( vector< pair<Key,Value> >& lookup, vector<Key>& dest) {
+	dest.clear();
+	dest.reserve(lookup.size());
+	transform(lookup.begin(), lookup.end(), back_inserter(dest), RipKey<Key,Value>());
+}
+
+// pull all Values from lookup, store in dest (overwrite contents of dest)
+template <typename Key, typename Value>
+void RipValues( vector< pair<Key,Value> >& lookup, vector<Value>& dest) {
+	dest.clear();
+	dest.reserve(lookup.size());
+	transform(lookup.begin(), lookup.end(), back_inserter(dest), RipValue<Key,Value>());
+}
+
+#endif /* STL_UTILITIES_H */
diff --git a/bgzf.c b/bgzf.c
new file mode 100644
index 0000000..4314c70
--- /dev/null
+++ b/bgzf.c
@@ -0,0 +1,488 @@
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "bgzf.h"
+
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+
+typedef int8_t byte;
+
+static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
+static const int MAX_BLOCK_SIZE = 64 * 1024;
+
+static const int BLOCK_HEADER_LENGTH = 18;
+static const int BLOCK_FOOTER_LENGTH = 8;
+
+static const int GZIP_ID1 = 31;
+static const int GZIP_ID2 = 139;
+static const int CM_DEFLATE = 8;
+static const int FLG_FEXTRA = 4;
+static const int OS_UNKNOWN = 255;
+static const int BGZF_ID1 = 66; // 'B'
+static const int BGZF_ID2 = 67; // 'C'
+static const int BGZF_LEN = 2;
+static const int BGZF_XLEN = 6; // BGZF_LEN+4
+
+static const int GZIP_WINDOW_BITS = -15; // no zlib header
+static const int Z_DEFAULT_MEM_LEVEL = 8;
+
+
+inline
+void
+packInt16(uint8_t* buffer, uint16_t value)
+{
+    buffer[0] = value;
+    buffer[1] = value >> 8;
+}
+
+inline
+int
+unpackInt16(const uint8_t* buffer)
+{
+    return (buffer[0] | (buffer[1] << 8));
+}
+
+inline
+void
+packInt32(uint8_t* buffer, uint32_t value)
+{
+    buffer[0] = value;
+    buffer[1] = value >> 8;
+    buffer[2] = value >> 16;
+    buffer[3] = value >> 24;
+}
+
+inline
+int
+min(int x, int y)
+{
+    return (x < y) ? x : y;
+}
+
+static
+void
+report_error(BGZF* fp, const char* message) {
+    fp->error = message;
+}
+
+static
+BGZF*
+open_read(int fd)
+{
+    FILE* file = fdopen(fd, "r");
+    BGZF* fp = malloc(sizeof(BGZF));
+    fp->file_descriptor = fd;
+    fp->open_mode = 'r';
+    fp->owned_file = 0;
+    fp->file = file;
+    fp->uncompressed_block_size = MAX_BLOCK_SIZE;
+    fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
+    fp->compressed_block_size = MAX_BLOCK_SIZE;
+    fp->compressed_block = malloc(MAX_BLOCK_SIZE);
+    fp->block_address = 0;
+    fp->block_offset = 0;
+    fp->block_length = 0;
+    fp->error = NULL;
+    return fp;
+}
+
+static
+BGZF*
+open_write(int fd)
+{
+    FILE* file = fdopen(fd, "w");
+    BGZF* fp = malloc(sizeof(BGZF));
+    fp->file_descriptor = fd;
+    fp->open_mode = 'w';
+    fp->owned_file = 0;
+    fp->file = file;
+    fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
+    fp->uncompressed_block = NULL;
+    fp->compressed_block_size = MAX_BLOCK_SIZE;
+    fp->compressed_block = malloc(MAX_BLOCK_SIZE);
+    fp->block_address = 0;
+    fp->block_offset = 0;
+    fp->block_length = 0;
+    fp->error = NULL;
+    return fp;
+}
+
+BGZF*
+bgzf_open(const char* __restrict path, const char* __restrict mode)
+{
+    BGZF* fp = NULL;
+    if (strcasecmp(mode, "r") == 0) {
+	int oflag = O_RDONLY;
+	int fd = open(path, oflag);
+        fp = open_read(fd);
+    } else if (strcasecmp(mode, "w") == 0) {
+	int oflag = O_WRONLY | O_CREAT | O_TRUNC;
+	int fd = open(path, oflag, 0644);
+        fp = open_write(fd);
+    }
+    if (fp != NULL) {
+        fp->owned_file = 1;
+    }
+    return fp;
+}
+
+BGZF*
+bgzf_fdopen(int fd, const char * __restrict mode)
+{
+    if (strcasecmp(mode, "r") == 0) {
+        return open_read(fd);
+    } else if (strcasecmp(mode, "w") == 0) {
+        return open_write(fd);
+    } else {
+        return NULL;
+    }
+}
+
+static
+int
+deflate_block(BGZF* fp, int block_length)
+{
+    // Deflate the block in fp->uncompressed_block into fp->compressed_block.
+    // Also adds an extra field that stores the compressed block length.
+
+    byte* buffer = fp->compressed_block;
+    int buffer_size = fp->compressed_block_size;
+
+    // Init gzip header
+    buffer[0] = GZIP_ID1;
+    buffer[1] = GZIP_ID2;
+    buffer[2] = CM_DEFLATE;
+    buffer[3] = FLG_FEXTRA;
+    buffer[4] = 0; // mtime
+    buffer[5] = 0;
+    buffer[6] = 0;
+    buffer[7] = 0;
+    buffer[8] = 0;
+    buffer[9] = OS_UNKNOWN;
+    buffer[10] = BGZF_XLEN;
+    buffer[11] = 0;
+    buffer[12] = BGZF_ID1;
+    buffer[13] = BGZF_ID2;
+    buffer[14] = BGZF_LEN;
+    buffer[15] = 0;
+    buffer[16] = 0; // placeholder for block length
+    buffer[17] = 0;
+
+    // loop to retry for blocks that do not compress enough
+    int input_length = block_length;
+    int compressed_length = 0;
+    while (1) {
+
+        z_stream zs;
+        zs.zalloc = NULL;
+        zs.zfree = NULL;
+        zs.next_in = fp->uncompressed_block;
+        zs.avail_in = input_length;
+        zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
+        zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+        int status = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
+                                  GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+        if (status != Z_OK) {
+            report_error(fp, "deflate init failed");
+            return -1;
+        }
+        status = deflate(&zs, Z_FINISH);
+        if (status != Z_STREAM_END) {
+            deflateEnd(&zs);
+            if (status == Z_OK) {
+                // Not enough space in buffer.
+                // Can happen in the rare case the input doesn't compress enough.
+                // Reduce the amount of input until it fits.
+                input_length -= 1024;
+                if (input_length <= 0) {
+                    // should never happen
+                    report_error(fp, "input reduction failed");
+                    return -1;
+                }
+                continue;
+            }
+            report_error(fp, "deflate failed");
+            return -1;
+        }
+        status = deflateEnd(&zs);
+        if (status != Z_OK) {
+            report_error(fp, "deflate end failed");
+            return -1;
+        }
+        compressed_length = zs.total_out;
+        compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+        if (compressed_length > MAX_BLOCK_SIZE) {
+            // should never happen
+            report_error(fp, "deflate overflow");
+            return -1;
+        }
+        break;
+    }
+
+    packInt16((uint8_t*)&buffer[16], compressed_length-1);
+    uint32_t crc = crc32(0L, NULL, 0L);
+    crc = crc32(crc, fp->uncompressed_block, input_length);
+    packInt32((uint8_t*)&buffer[compressed_length-8], crc);
+    packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
+
+    int remaining = block_length - input_length;
+    if (remaining > 0) {
+        if (remaining > input_length) {
+            // should never happen (check so we can use memcpy)
+            report_error(fp, "remainder too large");
+            return -1;
+        }
+        memcpy(fp->uncompressed_block,
+               fp->uncompressed_block + input_length,
+               remaining);
+    }
+    fp->block_offset = remaining;
+    return compressed_length;
+}
+
+static
+int
+inflate_block(BGZF* fp, int block_length)
+{
+    // Inflate the block in fp->compressed_block into fp->uncompressed_block
+
+    z_stream zs;
+    zs.zalloc = NULL;
+    zs.zfree = NULL;
+    zs.next_in = fp->compressed_block + 18;
+    zs.avail_in = block_length - 16;
+    zs.next_out = fp->uncompressed_block;
+    zs.avail_out = fp->uncompressed_block_size;
+
+    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+    if (status != Z_OK) {
+        report_error(fp, "inflate init failed");
+        return -1;
+    }
+    status = inflate(&zs, Z_FINISH);
+    if (status != Z_STREAM_END) {
+        inflateEnd(&zs);
+        report_error(fp, "inflate failed");
+        return -1;
+    }
+    status = inflateEnd(&zs);
+    if (status != Z_OK) {
+        report_error(fp, "inflate failed");
+        return -1;
+    }
+    return zs.total_out;
+}
+
+static
+int
+check_header(const byte* header)
+{
+    return (header[0] == GZIP_ID1 &&
+            header[1] == (byte) GZIP_ID2 &&
+            header[2] == Z_DEFLATED &&
+            (header[3] & FLG_FEXTRA) != 0 &&
+            unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
+            header[12] == BGZF_ID1 &&
+            header[13] == BGZF_ID2 &&
+            unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
+}
+
+static
+int
+read_block(BGZF* fp)
+{
+    byte header[BLOCK_HEADER_LENGTH];
+    int64_t block_address = ftello(fp->file);
+    int count = fread(header, 1, sizeof(header), fp->file);
+    if (count == 0) {
+        fp->block_length = 0;
+        return 0;
+    }
+    if (count != sizeof(header)) {
+        report_error(fp, "read failed");
+        return -1;
+    }
+    if (!check_header(header)) {
+        report_error(fp, "invalid block header");
+        return -1;
+    }
+    int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
+    byte* compressed_block = (byte*) fp->compressed_block;
+    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+    int remaining = block_length - BLOCK_HEADER_LENGTH;
+    count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
+    if (count != remaining) {
+        report_error(fp, "read failed");
+        return -1;
+    }
+    count = inflate_block(fp, block_length);
+    if (count < 0) {
+        return -1;
+    }
+    if (fp->block_length != 0) {
+        // Do not reset offset if this read follows a seek.
+        fp->block_offset = 0;
+    }
+    fp->block_address = block_address;
+    fp->block_length = count;
+    return 0;
+}
+
+int
+bgzf_read(BGZF* fp, void* data, int length)
+{
+    if (length <= 0) {
+        return 0;
+    }
+    if (fp->open_mode != 'r') {
+        report_error(fp, "file not open for reading");
+        return -1;
+    }
+
+    int bytes_read = 0;
+    byte* output = data;
+    while (bytes_read < length) {
+        int available = fp->block_length - fp->block_offset;
+        if (available <= 0) {
+            if (read_block(fp) != 0) {
+                return -1;
+            }
+            available = fp->block_length - fp->block_offset;
+            if (available <= 0) {
+                break;
+            }
+        }
+        int copy_length = min(length-bytes_read, available);
+        byte* buffer = fp->uncompressed_block;
+        memcpy(output, buffer + fp->block_offset, copy_length);
+        fp->block_offset += copy_length;
+        output += copy_length;
+        bytes_read += copy_length;
+    }
+    if (fp->block_offset == fp->block_length) {
+        fp->block_address = ftello(fp->file);
+        fp->block_offset = 0;
+        fp->block_length = 0;
+    }
+    return bytes_read;
+}
+
+static
+int
+flush_block(BGZF* fp)
+{
+    while (fp->block_offset > 0) {
+        int block_length = deflate_block(fp, fp->block_offset);
+        if (block_length < 0) {
+            return -1;
+        }
+        int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
+        if (count != block_length) {
+            report_error(fp, "write failed");
+            return -1;
+        }
+        fp->block_address += block_length;
+    }
+    return 0;
+}
+
+int
+bgzf_write(BGZF* fp, const void* data, int length)
+{
+    if (fp->open_mode != 'w') {
+        report_error(fp, "file not open for writing");
+        return -1;
+    }
+
+    if (fp->uncompressed_block == NULL) {
+        fp->uncompressed_block = malloc(fp->uncompressed_block_size);
+    }
+
+    const byte* input = data;
+    int block_length = fp->uncompressed_block_size;
+    int bytes_written = 0;
+    while (bytes_written < length) {
+        int copy_length = min(block_length - fp->block_offset, length - bytes_written);
+        byte* buffer = fp->uncompressed_block;
+        memcpy(buffer + fp->block_offset, input, copy_length);
+        fp->block_offset += copy_length;
+        input += copy_length;
+        bytes_written += copy_length;
+        if (fp->block_offset == block_length) {
+            if (flush_block(fp) != 0) {
+                break;
+            }
+        }
+    }
+    return bytes_written;
+}
+
+int
+bgzf_close(BGZF* fp)
+{
+    if (fp->open_mode == 'w') {
+        if (flush_block(fp) != 0) {
+            return -1;
+        }
+        if (fflush(fp->file) != 0) {
+            report_error(fp, "flush failed");
+            return -1;
+        }
+    }
+    if (fp->owned_file) {
+        if (fclose(fp->file) != 0) {
+            return -1;
+        }
+    }
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+    free(fp);
+    return 0;
+}
+
+int64_t
+bgzf_tell(BGZF* fp)
+{
+    return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF));
+}
+
+int64_t
+bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+    if (fp->open_mode != 'r') {
+        report_error(fp, "file not open for read");
+        return -1;
+    }
+    if (where != SEEK_SET) {
+        report_error(fp, "unimplemented seek option");
+        return -1;
+    }
+    int block_offset = pos & 0xFFFF;
+    int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
+    if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
+        report_error(fp, "seek failed");
+        return -1;
+    }
+    fp->block_length = 0;  // indicates current block is not loaded
+    fp->block_address = block_address;
+    fp->block_offset = block_offset;
+    return 0;
+}
+
diff --git a/bgzf.h b/bgzf.h
new file mode 100644
index 0000000..7e76bcb
--- /dev/null
+++ b/bgzf.h
@@ -0,0 +1,102 @@
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+
+#ifndef __BGZF_H
+#define __BGZF_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include "zlib.h"
+#include <stdbool.h>
+//#include "zutil.h"
+
+//typedef int8_t bool;
+
+typedef struct {
+    int file_descriptor;
+    char open_mode;  // 'r' or 'w'
+    bool owned_file;
+    FILE* file;
+    int uncompressed_block_size;
+    int compressed_block_size;
+    void* uncompressed_block;
+    void* compressed_block;
+    int64_t block_address;
+    int block_length;
+    int block_offset;
+    const char* error;
+} BGZF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Open an existing file descriptor for reading or writing.
+ * Mode must be either "r" or "w".
+ * A subsequent bgzf_close will not close the file descriptor.
+ * Returns null on error.
+ */
+BGZF* bgzf_fdopen(int fd, const char* __restrict mode);
+
+/*
+ * Open the specified file for reading or writing.
+ * Mode must be either "r" or "w".
+ * Returns null on error.
+ */
+BGZF* bgzf_open(const char* path, const char* __restrict mode);
+
+/*
+ * Close the BGZ file and free all associated resources.
+ * Does not close the underlying file descriptor if created with bgzf_fdopen.
+ * Returns zero on success, -1 on error.
+ */
+int bgzf_close(BGZF* fp);
+
+/*
+ * Read up to length bytes from the file storing into data.
+ * Returns the number of bytes actually read.
+ * Returns zero on end of file.
+ * Returns -1 on error.
+ */
+int bgzf_read(BGZF* fp, void* data, int length);
+
+/*
+ * Write length bytes from data to the file.
+ * Returns the number of bytes written.
+ * Returns -1 on error.
+ */
+int bgzf_write(BGZF* fp, const void* data, int length);
+
+/*
+ * Return a virtual file pointer to the current location in the file.
+ * No interpetation of the value should be made, other than a subsequent
+ * call to bgzf_seek can be used to position the file at the same point.
+ * Return value is non-negative on success.
+ * Returns -1 on error.
+ */
+int64_t bgzf_tell(BGZF* fp);
+
+/*
+ * Set the file to read from the location specified by pos, which must
+ * be a value previously returned by bgzf_tell for this file (but not
+ * necessarily one returned by this file handle).
+ * The where argument must be SEEK_SET.
+ * Seeking on a file opened for write is not supported.
+ * Returns zero on success, -1 on error.
+ */
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif