]> git.donarmstrong.com Git - bamtools.git/commitdiff
Initial import.
authormikaels <mikaels@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Fri, 10 Apr 2009 15:07:49 +0000 (15:07 +0000)
committermikaels <mikaels@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Fri, 10 Apr 2009 15:07:49 +0000 (15:07 +0000)
git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@2 9efb377e-2e27-44b9-b91a-ec4abb80ed8b

12 files changed:
BamAlignment.h [new file with mode: 0644]
BamConversionMain.cpp [new file with mode: 0644]
BamReader.cpp [new file with mode: 0644]
BamReader.h [new file with mode: 0644]
BamReaderMain.cpp [new file with mode: 0644]
BamWriter.cpp [new file with mode: 0644]
BamWriter.h [new file with mode: 0644]
Makefile [new file with mode: 0644]
README [new file with mode: 0644]
STLUtilities.h [new file with mode: 0644]
bgzf.c [new file with mode: 0644]
bgzf.h [new file with mode: 0644]

diff --git a/BamAlignment.h b/BamAlignment.h
new file mode 100644 (file)
index 0000000..b299ab3
--- /dev/null
@@ -0,0 +1,92 @@
+// BamAlignment.h\r
+\r
+// Derek Barnett\r
+// Marth Lab, Boston College\r
+// Last modified: 20 March 2009\r
+\r
+#ifndef BAMALIGNMENT_H\r
+#define BAMALIGNMENT_H\r
+\r
+#include <stdint.h>\r
+\r
+// C++ includes\r
+#include <string>\r
+using std::string;\r
+\r
+#include <vector>\r
+using std::vector;\r
+\r
+struct CigarOp {\r
+       uint32_t Length;\r
+       char     Type;\r
+};\r
+\r
+struct RefData {\r
+  string       RefName;\r
+  unsigned int RefLength;\r
+  bool         RefHasAlignments;\r
+\r
+  // constructor\r
+  RefData(void)\r
+  : RefLength(0)\r
+  , RefHasAlignments(false)\r
+  { }\r
+};\r
+\r
+typedef vector<RefData> RefVector;\r
+\r
+struct BamAlignment {\r
+\r
+    // queries against alignment flag - see below for further detail\r
+    public:\r
+        bool IsPaired(void) const            { return ( (AlignmentFlag & PAIRED)        != 0 ); }\r
+        bool IsProperPair(void) const        { return ( (AlignmentFlag & PROPER_PAIR)   != 0 ); }\r
+        bool IsMapped(void) const            { return ( (AlignmentFlag & UNMAPPED)      == 0 ); }\r
+        bool IsMateMapped(void) const        { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }\r
+        bool IsReverseStrand(void) const     { return ( (AlignmentFlag & REVERSE)       != 0 ); }\r
+        bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE)  != 0 ); }\r
+        bool IsFirstMate(void) const         { return ( (AlignmentFlag & READ_1)        != 0 ); }\r
+        bool IsSecondMate(void) const        { return ( (AlignmentFlag & READ_2)        != 0 ); }\r
+        bool IsPrimaryAlignment(void) const  { return ( (AlignmentFlag & SECONDARY)     == 0 ); }\r
+        bool IsFailedQC(void) const          { return ( (AlignmentFlag & QC_FAILED)     != 0 ); }\r
+        bool IsDuplicate(void) const         { return ( (AlignmentFlag & DUPLICATE)     != 0 ); }\r
+\r
+    // data members\r
+    public:\r
+        string       Name;           // read name\r
+        unsigned int Length;         // query length\r
+        string       QueryBases;     // original sequence ( produced from machine )\r
+        string       AlignedBases;   // aligned sequence ( with indels ) \r
+        string       Qualities;      // FASTQ qualities ( still in ASCII characters )\r
+               vector<string> Tags;\r
+        unsigned int RefID;          // ID for reference sequence\r
+        unsigned int Position;       // position on reference sequence where alignment starts\r
+        unsigned int Bin;            // bin in BAM file where this alignment resides\r
+        unsigned int MapQuality;     // mapping quality \r
+        unsigned int AlignmentFlag;  // see above for available queries\r
+               vector<CigarOp> CigarData;   // vector of CIGAR operations (length & type) )\r
+        unsigned int MateRefID;      // ID for reference sequence that mate was aligned to\r
+        unsigned int MatePosition;   // position that mate was aligned to\r
+        unsigned int InsertSize;     // mate pair insert size\r
+               \r
+\r
+    // alignment flag query constants\r
+    private:\r
+        enum { PAIRED        = 1,              // Alignment comes from paired-end data\r
+               PROPER_PAIR   = 2,              // Alignment passed paired-end resolution\r
+               UNMAPPED      = 4,              // Read is unmapped\r
+               MATE_UNMAPPED = 8,              // Mate is unmapped\r
+               REVERSE       = 16,             // Read is on reverse strand\r
+               MATE_REVERSE  = 32,             // Mate is on reverse strand\r
+               READ_1        = 64,             // This alignment is mate 1 of pair\r
+               READ_2        = 128,            // This alignment is mate 2 of pair\r
+               SECONDARY     = 256,            // This alignment is not the primary (best) alignment for read\r
+               QC_FAILED     = 512,            // Read did not pass prior quality control steps\r
+               DUPLICATE     = 1024            // Read is PCR duplicate\r
+        };\r
+};\r
+\r
+// commonly used vector in this library\r
+typedef vector< BamAlignment > BamAlignmentVector;\r
+\r
+#endif /* BAMALIGNMENT_H */\r
diff --git a/BamConversionMain.cpp b/BamConversionMain.cpp
new file mode 100644 (file)
index 0000000..457e58e
--- /dev/null
@@ -0,0 +1,46 @@
+#include <iostream>
+#include "BamReader.h"
+#include "BamWriter.h"
+
+using namespace std;
+
+int main(int argc, char* argv[]) {
+
+       if(argc != 3) {
+               cout << "USAGE: " << argv[0] << " <input BAM file> <output BAM file>" << endl;
+               exit(1);
+       }
+
+       // localize our arguments
+       const char* inputFilename  = argv[1];
+       const char* outputFilename = argv[2];
+
+       // open our BAM reader
+       BamReader reader;
+       reader.SetFilename(inputFilename);
+
+       if(!reader.Open()) {
+               cout << "ERROR: Unable to open the BAM file (" << inputFilename << ")." << endl;
+               exit(1);
+       }
+
+       // retrieve the SAM header text
+       string samHeader = reader.GetHeaderText();
+
+       // retrieve the reference sequence vector
+       RefVector referenceSequences = reader.GetReferenceData();
+
+       // open the BAM writer
+       BamWriter writer;
+       writer.Open(outputFilename, samHeader, referenceSequences);
+
+       // copy all of the reads from the input file to the output file
+       BamAlignment al;
+       while(reader.GetNextAlignment(al)) writer.SaveAlignment(al);
+
+       // close our files
+       reader.Close();
+       writer.Close();
+
+       return 0;
+}
\ No newline at end of file
diff --git a/BamReader.cpp b/BamReader.cpp
new file mode 100644 (file)
index 0000000..8fba1a9
--- /dev/null
@@ -0,0 +1,653 @@
+// BamReader.cpp\r
+\r
+// Derek Barnett\r
+// Marth Lab, Boston College\r
+// Last modified: 6 April 2009\r
+\r
+#include "BamReader.h"\r
+#include <iostream>\r
+using std::cerr;\r
+using std::endl;\r
+\r
+// static character constants\r
+const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";\r
+const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";\r
+\r
+BamReader::BamReader(const char* filename, const char* indexFilename) \r
+       : m_filename( (char*)filename )\r
+       , m_indexFilename( (char*)indexFilename )\r
+       , m_file(0)\r
+       , m_index(NULL)\r
+       , m_headerText("")\r
+       , m_isOpen(false)\r
+       , m_isIndexLoaded(false)\r
+       , m_isRegionSpecified(false)\r
+       , m_isCalculateAlignedBases(true)\r
+       , m_currentRefID(0)\r
+       , m_currentLeft(0)\r
+       , m_alignmentsBeginOffset(0)\r
+{\r
+       Open();\r
+}\r
+\r
+BamReader::~BamReader(void) {\r
+\r
+       // close file\r
+       if ( m_isOpen ) { Close(); }\r
+}\r
+\r
+// open BAM file\r
+bool BamReader::Open(void) {\r
+\r
+       if (!m_isOpen && m_filename != NULL ) {\r
+\r
+               // open file\r
+               m_file = bam_open(m_filename, "r"); \r
+               \r
+               // get header info && index info\r
+               if ( (m_file != NULL) && LoadHeader() ) {\r
+\r
+                       // save file offset where alignments start\r
+                       m_alignmentsBeginOffset = bam_tell(m_file);\r
+                       \r
+                       // set open flag\r
+                       m_isOpen = true;\r
+               }\r
+\r
+               // try to open (and load) index data, if index file given\r
+               if ( m_indexFilename != NULL ) {\r
+                       OpenIndex();\r
+               }\r
+       }\r
+\r
+       return m_isOpen;\r
+}\r
+\r
+bool BamReader::OpenIndex(void) {\r
+\r
+       if ( m_indexFilename && !m_isIndexLoaded ) {\r
+               m_isIndexLoaded = LoadIndex();\r
+       }\r
+       return m_isIndexLoaded;\r
+}\r
+\r
+// close BAM file\r
+bool BamReader::Close(void) {\r
+       \r
+       if (m_isOpen) {\r
+\r
+               // close file\r
+               int ret = bam_close(m_file);\r
+               \r
+               // delete index info\r
+               if ( m_index != NULL) { delete m_index; }\r
+\r
+               // clear open flag\r
+               m_isOpen = false;\r
+\r
+               // clear index flag\r
+               m_isIndexLoaded = false;\r
+\r
+               // clear region flag\r
+               m_isRegionSpecified = false;\r
+\r
+               // return success/fail of bam_close\r
+               return (ret == 0);\r
+       } \r
+\r
+       return true;\r
+}\r
+\r
+// get BAM filename\r
+const char* BamReader::Filename(void) const { \r
+       return (const char*)m_filename; \r
+}\r
+\r
+// set BAM filename\r
+void BamReader::SetFilename(const char* filename) {\r
+       m_filename = (char*)filename;\r
+}\r
+\r
+// get BAM Index filename\r
+const char* BamReader::IndexFilename(void) const { \r
+       return (const char*)m_indexFilename; \r
+}\r
+\r
+// set BAM Index filename\r
+void BamReader::SetIndexFilename(const char* indexFilename) {\r
+       m_indexFilename = (char*)indexFilename;\r
+}\r
+\r
+// return full header text\r
+const string BamReader::GetHeaderText(void) const { \r
+       return m_headerText; \r
+}\r
+\r
+// return number of reference sequences in BAM file\r
+const int BamReader::GetReferenceCount(void) const { \r
+       return m_references.size();\r
+}\r
+\r
+// get RefID from reference name\r
+const int BamReader::GetRefID(string refName) const { \r
+       \r
+       vector<string> refNames;\r
+       RefVector::const_iterator refIter = m_references.begin();\r
+    RefVector::const_iterator refEnd  = m_references.end();\r
+    for ( ; refIter != refEnd; ++refIter) {\r
+               refNames.push_back( (*refIter).RefName );\r
+    }\r
+\r
+       // return 'index-of' refName (if not found, returns refNames.size())\r
+       return Index( refNames.begin(), refNames.end(), refName );\r
+}\r
+\r
+const RefVector BamReader::GetReferenceData(void) const {\r
+       return m_references;\r
+}\r
+\r
+bool BamReader::Jump(int refID, unsigned int left) {\r
+\r
+       // if index available, and region is valid\r
+       if ( m_isIndexLoaded && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { \r
+               m_currentRefID = refID;\r
+               m_currentLeft  = left;\r
+               m_isRegionSpecified = true;\r
+               return ( bam_seek(m_file, GetOffset(m_currentRefID, m_currentLeft), SEEK_SET) == 0 );\r
+       }\r
+       return false;\r
+}\r
+\r
+bool BamReader::Rewind(void) {\r
+\r
+       int refID = 0;\r
+       int refCount = m_references.size();\r
+       for ( ; refID < refCount; ++refID ) {\r
+               if ( m_references.at(refID).RefHasAlignments ) { break; } \r
+       }\r
+\r
+       m_currentRefID = refID;\r
+       m_currentLeft = 0;\r
+       m_isRegionSpecified = false;\r
+\r
+       return ( bam_seek(m_file, m_alignmentsBeginOffset, SEEK_SET) == 0 );\r
+}      \r
+\r
+// get next alignment from specified region\r
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {\r
+\r
+       // try to load 'next' read\r
+       if ( LoadNextAlignment(bAlignment) ) {\r
+\r
+               // if specified region, check for region overlap\r
+               if ( m_isRegionSpecified ) {\r
+\r
+                       // if overlap, return true\r
+                       if ( IsOverlap(bAlignment) ) { return true; }\r
+                       // if not, try the next alignment\r
+                       else { return GetNextAlignment(bAlignment); }\r
+               } \r
+\r
+               // not using region, valid read detected, return success\r
+               else { return true; }\r
+       }\r
+\r
+       // no valid alignment to load\r
+       return false;\r
+}\r
+\r
+void BamReader::SetCalculateAlignedBases(bool flag) {\r
+       m_isCalculateAlignedBases = flag;\r
+}\r
+\r
+int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {\r
+\r
+       // get region boundaries\r
+       uint32_t begin = left;\r
+       uint32_t end   = m_references.at(refID).RefLength - 1;\r
+\r
+       // initialize list, bin '0' always a valid bin\r
+       int i = 0;\r
+       list[i++] = 0;\r
+\r
+       // get rest of bins that contain this region\r
+       unsigned int k;\r
+       for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { list[i++] = k; }\r
+       for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { list[i++] = k; }\r
+       for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { list[i++] = k; }\r
+       for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { list[i++] = k; }\r
+       for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }\r
+       \r
+       // return number of bins stored\r
+       return i;\r
+}\r
+\r
+uint32_t BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {\r
+\r
+       // initialize alignment end to starting position\r
+       uint32_t alignEnd = position;\r
+\r
+       // iterate over cigar operations\r
+       vector<CigarOp>::const_iterator cigarIter = cigarData.begin();\r
+       vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();\r
+       for ( ; cigarIter != cigarEnd; ++cigarIter) {\r
+               if ( (*cigarIter).Type == 'M' || (*cigarIter).Type == 'D' || (*cigarIter).Type == 'N') {\r
+                       alignEnd += (*cigarIter).Length;\r
+               }\r
+       }\r
+       return alignEnd;\r
+}\r
+\r
+uint64_t BamReader::GetOffset(int refID, unsigned int left) {\r
+\r
+       //  make space for bins\r
+       uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);                 \r
+       \r
+       // returns number of bins overlapping (left, right)\r
+       // stores indices of those bins in 'bins'\r
+       int numBins = BinsFromRegion(refID, left, bins);                                \r
+\r
+       // access index data for refID\r
+       RefIndex* refIndex = m_index->at(refID);\r
+\r
+       // get list of BAM bins for this reference sequence\r
+       BinVector* refBins = refIndex->first;\r
+\r
+       sort( refBins->begin(), refBins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );\r
+\r
+       // declare ChunkVector\r
+       ChunkVector regionChunks;\r
+\r
+       // declaure LinearOffsetVector\r
+       LinearOffsetVector* linearOffsets = refIndex->second;\r
+\r
+       // some sort of linear offset vs bin index voodoo... not completely sure what's going here\r
+       uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);\r
+\r
+       BinVector::iterator binBegin = refBins->begin();\r
+       BinVector::iterator binEnd   = refBins->end();\r
+\r
+       // iterate over bins overlapping region, count chunks\r
+       for (int i = 0; i < numBins; ++i) {\r
+               \r
+               // look for bin with ID=bin[i]\r
+               BinVector::iterator binIter = binBegin;\r
+\r
+               for ( ; binIter != binEnd; ++binIter ) {\r
+               \r
+                       // if found, increment n_off by number of chunks for each bin\r
+                       if ( (*binIter).first == (uint32_t)bins[i] ) { \r
+                               \r
+                               // iterate over chunks in that bin\r
+                               ChunkVector* binChunks = (*binIter).second;\r
+                               \r
+                               ChunkVector::iterator chunkIter = binChunks->begin();\r
+                               ChunkVector::iterator chunkEnd  = binChunks->end();\r
+                               for ( ; chunkIter != chunkEnd; ++chunkIter) {\r
+                               \r
+                                       // if right bound of pair is greater than min_off (linear offset value), store pair\r
+                                       if ( (*chunkIter).second > minOffset) { \r
+                                               regionChunks.push_back( (*chunkIter) ); \r
+                                       }\r
+                               }\r
+                       }\r
+               }\r
+       }\r
+\r
+       // clean up temp array\r
+       free(bins);\r
+\r
+       // there should be at least 1\r
+       assert(regionChunks.size() > 0);\r
+\r
+       // sort chunks by start position\r
+       sort ( regionChunks.begin(), regionChunks.end(), LookupKeyCompare<uint64_t, uint64_t>() );\r
+\r
+       // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing\r
+       int numOffsets = regionChunks.size();   \r
+       for (int i = 1; i < numOffsets; ++i) {\r
+               if ( regionChunks.at(i-1).second >= regionChunks.at(i).first ) {\r
+                       regionChunks.at(i-1).second = regionChunks.at(i).first;\r
+               }\r
+       }\r
+       \r
+       // merge adjacent chunks\r
+       int l = 0;\r
+       for (int i = 1; i < numOffsets; ++i) {\r
+               // if adjacent, expand boundaries of (merged) chunk\r
+               if ( (regionChunks.at(l).second>>16) == (regionChunks.at(i).first>>16) ) {\r
+                       regionChunks.at(l).second = regionChunks.at(i).second;\r
+               }\r
+               // else, move on to next (merged) chunk index\r
+               else { regionChunks.at(++l) = regionChunks.at(i); }\r
+       }\r
+\r
+       // return beginning file offset of first chunk for region\r
+       return regionChunks.at(0).first;\r
+}\r
+\r
+bool BamReader::IsOverlap(BamAlignment& bAlignment) {\r
+\r
+       // if on different reference sequence, quit\r
+       if ( bAlignment.RefID != (unsigned int)m_currentRefID ) { return false; }\r
+\r
+       // read starts after left boundary\r
+       if ( bAlignment.Position >= m_currentLeft) { return true; }\r
+\r
+       // get alignment end\r
+       uint32_t alignEnd = CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData);\r
+\r
+       // return whether alignment end overlaps left boundary\r
+       return ( alignEnd >= m_currentLeft );\r
+}\r
+\r
+bool BamReader::LoadHeader(void) {\r
+\r
+       // check to see if proper BAM header\r
+       char buf[4];\r
+       if (bam_read(m_file, buf, 4) != 4) { return false; }\r
+       if (strncmp(buf, "BAM\001", 4)) {\r
+               fprintf(stderr, "wrong header type!\n");\r
+               return false;\r
+       }\r
+       \r
+       // get BAM header text length\r
+       int32_t headerTextLength;\r
+       bam_read(m_file, &headerTextLength, 4);\r
+\r
+       // get BAM header text\r
+       char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
+       bam_read(m_file, headerText, headerTextLength);\r
+       m_headerText = (string)((const char*)headerText);\r
+       \r
+       // clean up calloc-ed temp variable\r
+       free(headerText);\r
+\r
+       // get number of reference sequences\r
+       int32_t numberRefSeqs;\r
+       bam_read(m_file, &numberRefSeqs, 4);\r
+       if (numberRefSeqs == 0) { return false; }\r
+\r
+       m_references.reserve((int)numberRefSeqs);\r
+       \r
+       // reference variables\r
+       int32_t  refNameLength;\r
+       char*    refName;\r
+       uint32_t refLength;\r
+\r
+       // iterate over all references in header\r
+       for (int i = 0; i != numberRefSeqs; ++i) {\r
+\r
+               // get length of reference name\r
+               bam_read(m_file, &refNameLength, 4);\r
+               refName = (char*)calloc(refNameLength, 1);\r
+\r
+               // get reference name and reference sequence length\r
+               bam_read(m_file, refName, refNameLength);\r
+               bam_read(m_file, &refLength, 4);\r
+\r
+               // store data for reference\r
+               RefData aReference;\r
+               aReference.RefName   = (string)((const char*)refName);\r
+               aReference.RefLength = refLength;\r
+               m_references.push_back(aReference);\r
+\r
+               // clean up calloc-ed temp variable\r
+               free(refName);\r
+       }\r
+       \r
+       return true;\r
+}\r
+\r
+bool BamReader::LoadIndex(void) {\r
+\r
+       // check to see if index file exists\r
+       FILE* indexFile;\r
+       if ( ( indexFile = fopen(m_indexFilename, "r") ) == 0 ) {\r
+               fprintf(stderr, "The alignment is not indexed. Please run SAMtools \'index\' command first.\n");\r
+               return false;\r
+       }\r
+\r
+       // see if index is valid BAM index\r
+       char magic[4];\r
+       fread(magic, 1, 4, indexFile);\r
+       if (strncmp(magic, "BAI\1", 4)) {\r
+               fprintf(stderr, "Problem with index - wrong \'magic\' number.\n");\r
+               fclose(indexFile);\r
+               return false;\r
+       }\r
+\r
+       // get number of reference sequences\r
+       uint32_t numRefSeqs;\r
+       fread(&numRefSeqs, 4, 1, indexFile);\r
+       \r
+       // intialize BamIndex data structure\r
+       m_index = new BamIndex;\r
+       m_index->reserve(numRefSeqs);\r
+\r
+       // iterate over reference sequences\r
+       for (unsigned int i = 0; i < numRefSeqs; ++i) {\r
+               \r
+               // get number of bins for this reference sequence\r
+               int32_t numBins;\r
+               fread(&numBins, 4, 1, indexFile);\r
+               \r
+               if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }\r
+\r
+               // intialize BinVector\r
+               BinVector* bins = new BinVector;\r
+               bins->reserve(numBins);\r
+               \r
+               // iterate over bins for that reference sequence\r
+               for (int j = 0; j < numBins; ++j) {\r
+                       \r
+                       // get binID \r
+                       uint32_t binID;\r
+                       fread(&binID, 4, 1, indexFile);\r
+                       \r
+                       // get number of regionChunks in this bin\r
+                       uint32_t numChunks;\r
+                       fread(&numChunks, 4, 1, indexFile);\r
+                       \r
+                       // intialize ChunkVector\r
+                       ChunkVector* regionChunks = new ChunkVector;\r
+                       regionChunks->reserve(numChunks);\r
+                       \r
+                       // iterate over regionChunks in this bin\r
+                       for (unsigned int k = 0; k < numChunks; ++k) {\r
+                               \r
+                               // get chunk boundaries (left, right) \r
+                               uint64_t left;\r
+                               uint64_t right;\r
+                               fread(&left, 8, 1, indexFile);\r
+                               fread(&right, 8, 1, indexFile);\r
+                               \r
+                               // save ChunkPair\r
+                               regionChunks->push_back( ChunkPair(left, right) );\r
+                       }\r
+                       \r
+                       // save binID, chunkVector for this bin\r
+                       bins->push_back( BamBin(binID, regionChunks) );\r
+               }\r
+               \r
+               // load linear index for this reference sequence\r
+               \r
+               // get number of linear offsets\r
+               int32_t numLinearOffsets;\r
+               fread(&numLinearOffsets, 4, 1, indexFile);\r
+               \r
+               // intialize LinearOffsetVector\r
+               LinearOffsetVector* linearOffsets = new LinearOffsetVector;\r
+               linearOffsets->reserve(numLinearOffsets);\r
+               \r
+               // iterate over linear offsets for this reference sequeence\r
+               for (int j = 0; j < numLinearOffsets; ++j) {\r
+                       // get a linear offset\r
+                       uint64_t linearOffset;\r
+                       fread(&linearOffset, 8, 1, indexFile);\r
+                       // store linear offset\r
+                       linearOffsets->push_back(linearOffset);\r
+               }\r
+               \r
+               // store index data for that reference sequence\r
+               m_index->push_back( new RefIndex(bins, linearOffsets) );\r
+       }\r
+       \r
+       // close index file (.bai) and return\r
+       fclose(indexFile);\r
+       return true;\r
+}\r
+\r
+bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {\r
+\r
+       // check valid alignment block header data\r
+       int32_t block_len;\r
+       int32_t ret;\r
+       uint32_t x[8];\r
+\r
+       int32_t bytesRead = 0;\r
+\r
+       // read in the 'block length' value, make sure it's not zero\r
+       if ( (ret = bam_read(m_file, &block_len, 4)) == 0 )        { return false; }\r
+       bytesRead += 4;\r
+\r
+       // read in core alignment data, make the right size of data was read \r
+       if ( bam_read(m_file, x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
+       bytesRead += BAM_CORE_SIZE;\r
+\r
+       // set BamAlignment 'core' data\r
+       bAlignment.RefID         = x[0]; \r
+       bAlignment.Position      = x[1];\r
+       bAlignment.Bin           = x[2]>>16; \r
+       bAlignment.MapQuality    = x[2]>>8&0xff; \r
+       bAlignment.AlignmentFlag = x[3]>>16; \r
+       bAlignment.MateRefID     = x[5]; \r
+       bAlignment.MatePosition  = x[6]; \r
+       bAlignment.InsertSize    = x[7];\r
+\r
+       // fetch & store often-used lengths for character data parsing\r
+       unsigned int queryNameLength     = x[2]&0xff;\r
+       unsigned int numCigarOperations  = x[3]&0xffff;\r
+       unsigned int querySequenceLength = x[4];\r
+       \r
+       // get length of character data\r
+       int dataLength = block_len - BAM_CORE_SIZE;\r
+\r
+       // set up destination buffers for character data\r
+       uint8_t*  allCharData = (uint8_t*)calloc(sizeof(uint8_t), dataLength);\r
+       uint32_t* cigarData   = (uint32_t*)(allCharData+queryNameLength);\r
+       \r
+       // get character data - make sure proper data size was read\r
+       if (bam_read(m_file, allCharData, dataLength) != dataLength) { return false; }\r
+       else {\r
+\r
+               bytesRead += dataLength;\r
+\r
+               // clear out bases, qualities, aligned bases, and CIGAR\r
+               bAlignment.QueryBases.clear();\r
+               bAlignment.Qualities.clear();\r
+               bAlignment.AlignedBases.clear();\r
+               bAlignment.CigarData.clear();\r
+\r
+               // save name\r
+               bAlignment.Name = (string)((const char*)(allCharData));\r
+               \r
+               // save bases\r
+               char singleBase[2];\r
+               uint8_t* s = ( allCharData + (numCigarOperations*4) + queryNameLength);\r
+               for (unsigned int i = 0; i < querySequenceLength; ++i) { \r
+                       // numeric to char conversion\r
+                       sprintf( singleBase, "%c", DNA_LOOKUP[ ( ( s[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ] );\r
+                       // append character data to Bases\r
+                       bAlignment.QueryBases.append( (const char*)singleBase );\r
+               }\r
+               \r
+               // save sequence length\r
+               bAlignment.Length = bAlignment.QueryBases.length();\r
+               \r
+               // save qualities\r
+               char singleQuality[4];\r
+               uint8_t* t = ( allCharData + (numCigarOperations*4) + queryNameLength + (querySequenceLength + 1)/2);\r
+               for (unsigned int i = 0; i < querySequenceLength; ++i) { \r
+                       // numeric to char conversion\r
+                       sprintf( singleQuality, "%c", ( t[i]+33 ) ); \r
+                       // append character data to Qualities\r
+                       bAlignment.Qualities.append( (const char*)singleQuality );\r
+               }\r
+               \r
+               // save CIGAR-related data;\r
+               int k = 0;\r
+               for (unsigned int i = 0; i < numCigarOperations; ++i) {\r
+\r
+                       // build CigarOp struct\r
+                       CigarOp op;\r
+                       op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
+                       op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
+\r
+                       // save CigarOp\r
+                       bAlignment.CigarData.push_back(op);\r
+\r
+                       // can skip this step if user wants to ignore\r
+                       if (m_isCalculateAlignedBases) {\r
+\r
+                               // build AlignedBases string\r
+                               switch (op.Type) {\r
+                                       \r
+                                       case ('M') : \r
+                                       case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) );      // for 'M', 'I' - write bases\r
+                                       case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases\r
+                                                                break;\r
+                                       \r
+                                       case ('D') : \r
+                                       case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'D', 'P' - write padding;\r
+                                                                break;\r
+                                       \r
+                                       case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence\r
+                                                                k += op.Length;\r
+                                                                break;\r
+\r
+                                       case ('H') : break;                                                                                     // for 'H' - do nothing, move to next op\r
+                                       \r
+                                       default    : assert(false);     // shouldn't get here\r
+                                                                break;\r
+                               }\r
+                       }\r
+               }       \r
+       }\r
+       free(allCharData);\r
+\r
+/*\r
+       // (optional) read tag parsing data\r
+       string tag;\r
+       char data;\r
+       int i = 0;\r
+\r
+       // still data to read (auxiliary tags)\r
+       while ( bytesRead < block_len ) {\r
+\r
+               if ( bam_read(m_file, &data, 1) == 1 ) { \r
+                       \r
+                       ++bytesRead;\r
+\r
+                       if (bytesRead == block_len && data != '\0') {\r
+                               fprintf(stderr, "ERROR: Invalid termination of tag info at end of alignment block.\n");\r
+                               exit(1);\r
+                       }\r
+\r
+                       tag.append(1, data);\r
+                       if ( data == '\0' ) {\r
+                               bAlignment.Tags.push_back(tag);\r
+                               tag = "";\r
+                               i = 0;\r
+                       } else {\r
+                               if ( (i == 1) && (i == 2) ) { tag.append(1, ':'); }\r
+                               ++i;\r
+                       }\r
+               } else {\r
+                       fprintf(stderr, "ERROR: Could not read tag info.\n");\r
+                       exit(1);\r
+               }\r
+       }\r
+*/\r
+       return true;\r
+}\r
diff --git a/BamReader.h b/BamReader.h
new file mode 100644 (file)
index 0000000..786f4c2
--- /dev/null
@@ -0,0 +1,230 @@
+// BamReader.h\r
+\r
+/* The MIT License\r
+\r
+   Copyright (c) 2008 Genome Research Ltd (GRL).\r
+\r
+   Permission is hereby granted, free of charge, to any person obtaining\r
+   a copy of this software and associated documentation files (the\r
+   "Software"), to deal in the Software without restriction, including\r
+   without limitation the rights to use, copy, modify, merge, publish,\r
+   distribute, sublicense, and/or sell copies of the Software, and to\r
+   permit persons to whom the Software is furnished to do so, subject to\r
+   the following conditions:\r
+\r
+   The above copyright notice and this permission notice shall be\r
+   included in all copies or substantial portions of the Software.\r
+\r
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\r
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\r
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\r
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\r
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\r
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\r
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
+   SOFTWARE.\r
+*/\r
+\r
+/*\r
+       Implementation of BAM-parsing was translated to C++ directly from Heng Li's SAMtools package \r
+        (thus the carryover of above MIT license)\r
+       Contact: Derek Barnett <barnetde@bc.edu>\r
+*/\r
+\r
+// Derek Barnett\r
+// Marth Lab, Boston College\r
+// Last modified: 6 April 2009\r
+\r
+#ifndef BAMREADER_H\r
+#define BAMREADER_H\r
+\r
+// custom includes\r
+#include "BamAlignment.h"\r
+#include "STLUtilities.h"\r
+\r
+// C library includes\r
+#include <assert.h>\r
+#include <stdint.h>\r
+#include <stdio.h>\r
+#include <stdlib.h>\r
+#include <string.h>\r
+\r
+// BGZF library includes/defines\r
+#include "bgzf.h"\r
+typedef BGZF* BamFile;\r
+#define bam_open(f_name, mode)      bgzf_open(f_name, mode)\r
+#define bam_close(f_ptr)            bgzf_close(f_ptr)\r
+#define bam_read(f_ptr, buf, size)  bgzf_read(f_ptr, buf, size)\r
+#define bam_write(f_ptr, buf, size) bgzf_write(f_ptr, buf, size)\r
+#define bam_tell(f_ptr)             bgzf_tell(f_ptr)\r
+#define bam_seek(f_ptr, pos, dir)   bgzf_seek(f_ptr, pos, dir)\r
+\r
+// size of alignment data block in BAM file (bytes)\r
+#define BAM_CORE_SIZE 32\r
+\r
+// BAM indexing constants\r
+#define MAX_BIN           37450 // =(8^6-1)/7+1\r
+#define BAM_MIN_CHUNK_GAP 32768 \r
+#define BAM_LIDX_SHIFT    14\r
+\r
+// CIGAR-retrieval mask/shift constants\r
+#define BAM_CIGAR_SHIFT 4\r
+#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)\r
+\r
+// CIGAR-operation types\r
+#define BAM_CMATCH      0\r
+#define BAM_CINS        1\r
+#define BAM_CDEL        2\r
+#define BAM_CREF_SKIP   3\r
+#define BAM_CSOFT_CLIP  4\r
+#define BAM_CHARD_CLIP  5\r
+#define BAM_CPAD        6\r
+\r
+// --------------------------- //\r
+// Bam header info\r
+// --------------------------- //\r
+\r
+// --------------------------- //\r
+// BamIndex-related typedefs\r
+// --------------------------- //\r
+\r
+// offset for linear indexing\r
+typedef vector<uint64_t> LinearOffsetVector;\r
+\r
+// chunk boundaries\r
+typedef pair<uint64_t, uint64_t> ChunkPair;\r
+// list of chunks in a BAM bin\r
+typedef vector<ChunkPair> ChunkVector;\r
+\r
+// BAM bins for a reference sequence\r
+// replaces khash - uint32_t is key, ChunkVector is value\r
+typedef pair<uint32_t, ChunkVector*> BamBin;\r
+typedef vector<BamBin> BinVector;\r
+\r
+// each reference sequence has a BinVector and LinearOffsetVector\r
+typedef pair<BinVector*, LinearOffsetVector*> RefIndex;\r
+\r
+// full BamIndex defined as: \r
+typedef vector<RefIndex*> BamIndex;\r
+\r
+// ---------------------------------------------------------------------------//\r
+\r
+class BamReader {\r
+       \r
+       public:\r
+               // constructors\r
+               BamReader(const char* fileName = NULL, const char* indexFilename = NULL);\r
+\r
+       public:\r
+               // destructor\r
+               ~BamReader(void);\r
+       \r
+       // BAM interface methods\r
+       public:\r
+\r
+               // ----------------------- //\r
+               // File manipulation\r
+               // ----------------------- //\r
+               \r
+               // open BAM file (automatically opens index if provided)\r
+               bool Open(void);\r
+               \r
+               // open BAM index (allows index to be opened separately - i.e. sometime after the BAM file is opened)\r
+               bool OpenIndex(void);\r
+               \r
+               // close BAM file\r
+               bool Close(void);\r
+               \r
+               // get BAM filename\r
+               const char* Filename(void) const;\r
+       \r
+               // set BAM filename\r
+               void SetFilename(const char*);\r
+\r
+               // get BAM Index filename\r
+               const char* IndexFilename(void) const;\r
+               \r
+               // set BAM Index filename\r
+               void SetIndexFilename(const char*);\r
+\r
+               // ----------------------- //\r
+               // Access BAM header\r
+               // ----------------------- //\r
+               \r
+               // return full header text\r
+               const string GetHeaderText(void) const;\r
+               \r
+               // --------------------------------- //\r
+               // Access reference sequence info\r
+               // --------------------------------- //\r
+               \r
+               // return number of reference sequences in BAM file\r
+               const int GetReferenceCount(void) const;\r
+\r
+               // return vector of RefData entries\r
+               const RefVector GetReferenceData(void) const;\r
+\r
+               // get refID from reference name\r
+               const int GetRefID(string refName) const;               \r
+               \r
+               // ----------------------------------------- //\r
+               // File position moving\r
+               // ----------------------------------------- //\r
+\r
+               // jumps to 'left' position on refID\r
+               // actually jumps before position, so reads that overlap 'left' are included as well\r
+               // 'left' defaults to reference begin if not specified\r
+               bool Jump(int refID, unsigned int left = 0);\r
+\r
+               // Jump to beginning of BAM file, clears any region previously set by Jump()\r
+               bool Rewind(void);\r
+               \r
+               // ------------------------------ //\r
+               // Access alignments\r
+               // ------------------------------ //\r
+               \r
+               // get next alignment\r
+               bool GetNextAlignment(BamAlignment& read);\r
+\r
+               // allow user to specifiy whether 'AlignedBases' string is calculated when alignment is loaded\r
+               void SetCalculateAlignedBases(bool);\r
+\r
+       // internal utility methods\r
+       private:\r
+               int      BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);\r
+               uint32_t CalculateAlignmentEnd(const unsigned int&, const vector<CigarOp>&);\r
+               uint64_t GetOffset(int, unsigned int);\r
+               bool     IsOverlap(BamAlignment&);\r
+               bool     LoadHeader(void);\r
+               bool     LoadIndex(void);\r
+               bool     LoadNextAlignment(BamAlignment&);\r
+\r
+       private:        \r
+               // main BAM reader components\r
+               char*      m_filename;\r
+               char*      m_indexFilename;\r
+               BamFile    m_file;\r
+               BamIndex*  m_index;\r
+               RefVector  m_references;\r
+               string     m_headerText;\r
+\r
+               // state flags\r
+               bool m_isOpen;                           // BAM file is open for processing\r
+               bool m_isIndexLoaded;            // BAM Index data is loaded and available for processing\r
+               bool m_isRegionSpecified;        // a region has been specified - specifically, a user has called Jump()\r
+               bool m_isCalculateAlignedBases;  // build 'AlignedBases' string when getting an alignment, otherwise skip (default = true)\r
+\r
+               // region values\r
+               int m_currentRefID;\r
+               unsigned int m_currentLeft;\r
+\r
+               // file offset of 1st read in BAM file\r
+               int64_t m_alignmentsBeginOffset;\r
+\r
+       private:\r
+               // BAM character constants\r
+               static const char* DNA_LOOKUP;\r
+               static const char* CIGAR_LOOKUP;\r
+};\r
+\r
+#endif /* BAMREADER_H */\r
diff --git a/BamReaderMain.cpp b/BamReaderMain.cpp
new file mode 100644 (file)
index 0000000..bdf4e60
--- /dev/null
@@ -0,0 +1,255 @@
+// BamReaderMain.cpp\r
+\r
+// Derek Barnett\r
+// Marth Lab, Boston College\r
+// Last modified: 6 April 2009\r
+\r
+#include "BamReader.h"\r
+\r
+// C++ includes\r
+#include <iostream>\r
+using std::cerr;\r
+using std::cout;\r
+using std::endl;\r
+\r
+#include <vector>\r
+using std::vector;\r
+\r
+#include <string>\r
+using std::string;\r
+\r
+int main(int argc, char* argv[]) {\r
+\r
+       // check command line parameters\r
+       if (argc != 3) {\r
+               cerr << endl;\r
+               cerr << "Usage: BamReaderTest <bamFile> <bamIndexFile>" << endl;\r
+               cerr << endl;\r
+               exit(1);\r
+       }\r
+\r
+       // get filenames from command line\r
+       const char* bamFilename = argv[1];\r
+       const char* bamIndexFilename = argv[2];\r
+\r
+       string refName;\r
+       int refID;\r
+       unsigned int leftBound;\r
+       unsigned int rightBound;\r
+\r
+       int alignmentCount;\r
+\r
+       vector<string> refNames;\r
+\r
+       BamAlignment bAlignment;\r
+       BamAlignmentVector alignments;\r
+\r
+       RefVector references;\r
+\r
+       // ------------------------------------------------------------------------------------------------------ //\r
+       // Declare BamReader & open BAM file - automatically loads up header and index file (.bai) information\r
+       // ------------------------------------------------------------------------------------------------------ //\r
+       \r
+       BamReader bReader(bamFilename, bamIndexFilename);\r
+\r
+       cerr << endl;\r
+       cerr << "Opening BamReader for BAM file: " << bamFilename << " ..... ";\r
+       \r
+       if ( bReader.Open() ) { \r
+               cerr << "ok" << endl; \r
+       }\r
+       else {\r
+               cerr << "error" << endl;\r
+               exit(1);\r
+       }\r
+       \r
+       // ------------------------------------------------------------ //\r
+       // Find out how many reference sequences are in BAM file\r
+       // ------------------------------------------------------------ //\r
+       \r
+       references = bReader.GetReferenceData();\r
+\r
+       cerr << endl;\r
+       cerr << "Total number of ref seqs: " << references.size() << endl;\r
+       \r
+       // ---------------------------------------------------------------------------- //\r
+       // Get the names/lengths of all the reference sequences that have alignments\r
+       // ---------------------------------------------------------------------------- //\r
+       \r
+       cerr << endl;\r
+       cerr << "All ref sequences with alignments:" << endl;\r
+       cerr << endl;\r
+\r
+\r
+       if ( !references.empty() ) {\r
+               \r
+               RefVector::iterator refIter = references.begin();\r
+               RefVector::iterator refEnd  = references.end();\r
+               \r
+               refID = 0;\r
+               // iterate over reference names, print to STDERR\r
+               for( ; refIter != refEnd; ++refIter) {\r
+\r
+                       if ( (*refIter).RefHasAlignments ) {\r
+                               cerr << "ID:     " << refID << endl;\r
+                               cerr << "Name:   " << (*refIter).RefName << endl;\r
+                               cerr << "Length: " << (*refIter).RefLength << endl;\r
+                               cerr << endl;\r
+                       }\r
+\r
+                       ++refID;\r
+               }\r
+       }\r
+\r
+       // --------------------------------------------------------------------------------------------- //\r
+       // Get the SAM-style header text, if available (not available in the example file shown here)\r
+       // --------------------------------------------------------------------------------------------- //     \r
+       \r
+       cerr << "------------------------------------" << endl;\r
+       cerr << "SAM header text: " << endl;\r
+       \r
+       // get (SAM) header text\r
+       string header = bReader.GetHeaderText();\r
+       \r
+       cerr << ( (header.empty()) ? "no header data" : header ) << endl;\r
+       cerr << "------------------------------------" << endl;\r
+\r
+       // --------------------------------------------------------------------------------------------- //\r
+       // Here we start accessing alignments\r
+       // The first method shows how to iterate over all reads in a BamFile\r
+       // This method will work on any BAM file (sorted/non-sorted, with/without an index)\r
+       // --------------------------------------------------------------------------------------------- //\r
+\r
+       // Call Rewind() to make sure you're at the 1st alignment in the file\r
+       // Please note, however, it's not necessary in this case, since file pointer initially set to 1st alignment\r
+       // but this is probably a good habit to develop to ensure correctness and make your intent obvious in the code\r
+\r
+       cerr << "Getting (up to) first 1000 alignments" << endl;\r
+\r
+       // start at 1st alignment\r
+       if ( bReader.Rewind() ) {\r
+               \r
+               alignmentCount = 0;\r
+               while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) {\r
+               \r
+                       // disregard unmapped alignments\r
+                       if ( bAlignment.IsMapped() ) {\r
+\r
+                               ++alignmentCount;\r
+                               \r
+                               cout << "----------------------------" << endl;\r
+                               cout << "Alignment " << alignmentCount << endl;\r
+                               cout << bAlignment.Name << endl;\r
+                               cout << bAlignment.AlignedBases << endl;\r
+                               cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;\r
+                               \r
+                               cout << "Cigar Data: " << endl;\r
+\r
+                               vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
+                               vector<CigarOp>::const_iterator cigarEnd  = bAlignment.CigarData.end();\r
+                               for ( ; cigarIter != cigarEnd; ++cigarIter ) {\r
+                                       cout << "Type: " << (*cigarIter).Type << "\tLength: " << (*cigarIter).Length << endl;\r
+                               }\r
+\r
+                               cout << "Tags: " << endl;\r
+                               vector<string>::const_iterator tagIter = bAlignment.Tags.begin();\r
+                               vector<string>::const_iterator tagEnd  = bAlignment.Tags.end();\r
+                               for ( ; tagIter != tagEnd; ++tagIter ) {\r
+                                       cout << (*tagIter) << endl;\r
+                               }\r
+                       }\r
+               }\r
+\r
+               cerr << "Found " << alignmentCount << " alignments." << endl;\r
+       } else { cerr << "Could not rewind" << endl; }\r
+\r
+       // ---------------------------------------------------------------------------------------------------------- //\r
+       // You can iterate over each individual alignment that overlaps a specified region\r
+       // Set the refID & left boundary parameters using Jump(refID, left)\r
+       // Jump() actually positions the file pointer actually before the left boundary\r
+       // This ensures that reads beginning before, but overlapping 'left' are included as well \r
+       // Client is responsible for setting/checking right boundary - see below\r
+       // ---------------------------------------------------------------------------------------------------------- //\r
+/*\r
+       cerr << "Jumping to region" << endl;\r
+\r
+       // get refID using a known refName\r
+       refName = "seq1";\r
+       refID = bReader.GetRefID(refName);\r
+       if (refID == (int)references.size()) { \r
+               cerr << "Reference " << refName << " not found." << endl;\r
+               exit(1);\r
+       }\r
+\r
+       // set left boundary\r
+       leftBound = 500;\r
+\r
+       // set right boundary - either user-specified number to set a discrete region\r
+       //                      OR you can query the BamReader for the end of the reference\r
+       rightBound = references.at(refID).RefLength;\r
+\r
+       cerr << endl;\r
+       cerr << "Iterating over alignments on reference: " << refName << " from " << leftBound << " to ref end (" << rightBound << ")" << endl;\r
+\r
+       // set region - specific region on reference\r
+       if ( bReader.Jump(refID, leftBound) ) { \r
+\r
+               alignmentCount = 0;\r
+               while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= rightBound) ) {\r
+               \r
+                       if ( bAlignment.IsMapped() ) {\r
+\r
+                               ++alignmentCount;\r
+                       \r
+                               if ( (alignmentCount % 100000) == 0) { cerr << "Retrieved " << alignmentCount << " so far..." << endl; }\r
+                       \r
+                               cout << "----------------------------" << endl;\r
+                               cout << "Alignment " << alignmentCount << endl;\r
+                               cout << bAlignment.Name << endl;\r
+                               cout << bAlignment.AlignedBases << endl;\r
+                               cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;\r
+                       }\r
+               }\r
+\r
+               cerr << "Found " << alignmentCount << " alignments." << endl; \r
+       } else { cerr << "Could not jump to region specified" << endl; }\r
+\r
+       // ----------------------------------------------------------------------------------------------------- //\r
+       // You can 'rewind' back to beginning of BAM file at any time \r
+       // ----------------------------------------------------------------------------------------------------- //\r
+\r
+       cerr << endl;\r
+       cerr << "Rewinding BAM file... then getting first 1000 alignments" << endl;\r
+\r
+       alignmentCount = 0;\r
+       if (bReader.Rewind() ) {\r
+               while ( bReader.GetNextAlignment(bAlignment) && (alignmentCount < 1000) ) {\r
+               \r
+                       // disregard unmapped alignments\r
+                       if ( bAlignment.IsMapped() ) {\r
+\r
+                               ++alignmentCount;\r
+                       \r
+                               cout << "----------------------------" << endl;\r
+                               cout << "Alignment " << alignmentCount << endl;\r
+                               cout << bAlignment.Name << endl;\r
+                               cout << bAlignment.AlignedBases << endl;\r
+                               cout << "Aligned to " << references.at(bAlignment.RefID).RefName << ":" << bAlignment.Position << endl;\r
+                       }\r
+               }\r
+\r
+               cerr << "Found " << alignmentCount << " alignments." << endl;\r
+       } else { cerr << "Could not rewind" << endl; }\r
+*/\r
+       // ---------------------------------------------------------------------- //\r
+       // Close BamReader object (releases internal header/index data) and exit\r
+       // ---------------------------------------------------------------------- //\r
+       \r
+       cerr << endl;\r
+       cerr << "Closing BAM file: " << bamFilename << endl;\r
+       \r
+       bReader.Close();\r
+\r
+       cerr << "Exiting..." << endl << endl;\r
+       return 0;\r
+}\r
diff --git a/BamWriter.cpp b/BamWriter.cpp
new file mode 100644 (file)
index 0000000..24544b3
--- /dev/null
@@ -0,0 +1,403 @@
+// ***************************************************************************
+// BamWriter (c) 2009 Michael Strömberg
+// Marth Lab, Deptartment of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include "BamWriter.h"
+
+// constructor
+BamWriter::BamWriter(void)
+{}
+
+// destructor
+BamWriter::~BamWriter(void) {
+       if(mBGZF.IsOpen) BgzfClose();
+}
+
+// closes the BAM file
+void BamWriter::BgzfClose(void) {
+
+       mBGZF.IsOpen = false;
+
+       // flush the BGZF block
+       BgzfFlushBlock();
+
+       // flush and close
+       fflush(mBGZF.Stream);
+       fclose(mBGZF.Stream);
+}
+
+// compresses the current block
+int BamWriter::BgzfDeflateBlock(void) {
+
+       // initialize the gzip header
+       char* buffer = mBGZF.CompressedBlock;
+       unsigned int bufferSize = mBGZF.CompressedBlockSize;
+
+       memset(buffer, 0, 18);
+       buffer[0]  = GZIP_ID1;
+       buffer[1]  = (char)GZIP_ID2;
+       buffer[2]  = CM_DEFLATE;
+       buffer[3]  = FLG_FEXTRA;
+       buffer[9]  = (char)OS_UNKNOWN;
+       buffer[10] = BGZF_XLEN;
+       buffer[12] = BGZF_ID1;
+       buffer[13] = BGZF_ID2;
+       buffer[14] = BGZF_LEN;
+
+       // loop to retry for blocks that do not compress enough
+       int inputLength = mBGZF.BlockOffset;
+       int compressedLength = 0;
+
+       while(true) {
+
+               z_stream zs;
+               zs.zalloc    = NULL;
+               zs.zfree     = NULL;
+               zs.next_in   = (Bytef*)mBGZF.UncompressedBlock;
+               zs.avail_in  = inputLength;
+               zs.next_out  = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
+               zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+               // initialize the zlib compression algorithm
+               if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
+                       printf("ERROR: zlib deflate initialization failed.\n");
+                       exit(1);
+               }
+
+               // compress the data
+               int status = deflate(&zs, Z_FINISH);
+               if(status != Z_STREAM_END) {
+                       deflateEnd(&zs);
+
+                       // reduce the input length and try again
+                       if(status == Z_OK) {
+                               inputLength -= 1024;
+                               if(inputLength < 0) {
+                                       printf("ERROR: input reduction failed.\n");
+                                       exit(1);
+                               }
+                               continue;
+                       }
+
+                       printf("ERROR: zlib deflate failed.\n");
+                       exit(1);
+               }
+
+               // finalize the compression routine
+               if(deflateEnd(&zs) != Z_OK) {
+                       printf("ERROR: deflate end failed.\n");
+                       exit(1);
+               }
+
+               compressedLength = zs.total_out;
+               compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+
+               if(compressedLength > MAX_BLOCK_SIZE) {
+                       printf("ERROR: deflate overflow.\n");
+                       exit(1);
+               }
+
+               break;
+       }
+
+       // store the compressed length
+       BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
+
+       // store the CRC32 checksum
+       unsigned int crc = crc32(0, NULL, 0);
+       crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength);
+       BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc);
+       BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength);
+
+       // ensure that we have less than a block of data left
+       int remaining = mBGZF.BlockOffset - inputLength;
+       if(remaining > 0) {
+               if(remaining > inputLength) {
+                       printf("ERROR: remainder too large.\n");
+                       exit(1);
+               }
+
+               memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining);
+       }
+
+       mBGZF.BlockOffset = remaining;
+       return compressedLength;
+}
+
+// flushes the data in the BGZF block
+void BamWriter::BgzfFlushBlock(void) {
+
+       // flush all of the remaining blocks
+       while(mBGZF.BlockOffset > 0) {
+
+               // compress the data block
+               int blockLength = BgzfDeflateBlock();
+
+               // flush the data to our output stream
+               int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream);
+
+               if(numBytesWritten != blockLength) {
+                       printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
+                       exit(1);
+               }
+
+               mBGZF.BlockAddress += blockLength;
+       }
+}
+
+// opens the BAM file for writing
+void BamWriter::BgzfOpen(const string& filename) {
+
+       mBGZF.Stream = fopen(filename.c_str(), "wb");
+
+       if(!mBGZF.Stream) {
+               printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str());
+               exit(1);
+       }
+
+       mBGZF.IsOpen = true;
+}
+
+// writes the supplied data into the BGZF buffer
+unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) {
+
+       // initialize
+       unsigned int numBytesWritten = 0;
+       const char* input = data;
+       unsigned int blockLength = mBGZF.UncompressedBlockSize;
+
+       // copy the data to the buffer
+       while(numBytesWritten < dataLen) {
+               unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten);
+               char* buffer = mBGZF.UncompressedBlock;
+               memcpy(buffer + mBGZF.BlockOffset, input, copyLength);
+
+               mBGZF.BlockOffset += copyLength;
+               input             += copyLength;
+               numBytesWritten   += copyLength;
+
+               if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock();
+       }
+
+       return numBytesWritten;
+}
+
+// closes the alignment archive
+void BamWriter::Close(void) {
+       if(mBGZF.IsOpen) BgzfClose();
+}
+
+// creates a cigar string from the supplied alignment
+void BamWriter::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
+
+       // initialize
+       const unsigned int numCigarOperations = cigarOperations.size();
+       packedCigar.resize(numCigarOperations * SIZEOF_INT);
+
+       // pack the cigar data into the string
+       unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
+
+       unsigned int cigarOp;
+       vector<CigarOp>::const_iterator coIter;
+       for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
+
+               switch(coIter->Type) {
+         case 'M':
+                 cigarOp = BAM_CMATCH;
+                 break;
+         case 'I':
+                 cigarOp = BAM_CINS;
+                 break;
+         case 'D':
+                 cigarOp = BAM_CDEL;
+                 break;
+         case 'N':
+                 cigarOp = BAM_CREF_SKIP;
+                 break;
+         case 'S':
+                 cigarOp = BAM_CSOFT_CLIP;
+                 break;
+         case 'H':
+                 cigarOp = BAM_CHARD_CLIP;
+                 break;
+         case 'P':
+                 cigarOp = BAM_CPAD;
+                 break;
+         default:
+                 printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
+                 exit(1);
+               }
+
+               *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
+               pPackedCigar++;
+       }
+}
+
+// encodes the supplied query sequence into 4-bit notation
+void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) {
+
+       // prepare the encoded query string
+       const unsigned int queryLen = query.size();
+       const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
+       encodedQuery.resize(encodedQueryLen);
+       char* pEncodedQuery = (char*)encodedQuery.data();
+       const char* pQuery = (const char*)query.data();
+
+       unsigned char nucleotideCode;
+       bool useHighWord = true;
+
+       while(*pQuery) {
+
+               switch(*pQuery) {
+                       case '=':
+                               nucleotideCode = 0;
+                               break;
+                       case 'A':
+                               nucleotideCode = 1;
+                               break;
+                       case 'C':
+                               nucleotideCode = 2;
+                               break;
+                       case 'G':
+                               nucleotideCode = 4;
+                               break;
+                       case 'T':
+                               nucleotideCode = 8;
+                               break;
+                       case 'N':
+                               nucleotideCode = 15;
+                               break;
+                       default:
+                               printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
+                               exit(1);
+               }
+
+               // pack the nucleotide code
+               if(useHighWord) {
+                       *pEncodedQuery = nucleotideCode << 4;
+                       useHighWord = false;
+               } else {
+                       *pEncodedQuery |= nucleotideCode;
+                       pEncodedQuery++;
+                       useHighWord = true;
+               }
+
+               // increment the query position
+               pQuery++;
+       }
+}
+
+// opens the alignment archive
+void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
+
+       // open the BGZF file for writing
+       BgzfOpen(filename);
+
+       // ================
+       // write the header
+       // ================
+
+       // write the BAM signature
+       const unsigned char SIGNATURE_LENGTH = 4;
+       const char* BAM_SIGNATURE = "BAM\1";
+       BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH);
+
+       // write the SAM header text length
+       const unsigned int samHeaderLen = samHeader.size();
+       BgzfWrite((char*)&samHeaderLen, SIZEOF_INT);
+
+       // write the SAM header text
+       if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen);
+
+       // write the number of reference sequences
+       const unsigned int numReferenceSequences = referenceSequences.size();
+       BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT);
+
+       // =============================
+       // write the sequence dictionary
+       // =============================
+
+       RefVector::const_iterator rsIter;
+       for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
+
+               // write the reference sequence name length
+               const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;
+               BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT);
+
+               // write the reference sequence name
+               BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen);
+
+               // write the reference sequence length
+               BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT);
+       }
+}
+
+// saves the alignment to the alignment archive
+void BamWriter::SaveAlignment(const BamAlignment& al) {
+
+       // initialize
+       const unsigned int nameLen            = al.Name.size() + 1;
+       const unsigned int queryLen           = al.QueryBases.size();
+       const unsigned int numCigarOperations = al.CigarData.size();
+
+       // create our packed cigar string
+       string packedCigar;
+       CreatePackedCigar(al.CigarData, packedCigar);
+       const unsigned int packedCigarLen = packedCigar.size();
+
+       // encode the query
+       string encodedQuery;
+       EncodeQuerySequence(al.QueryBases, encodedQuery);
+       const unsigned int encodedQueryLen = encodedQuery.size();
+
+       // create our read group tag
+       // TODO: add read group tag support when it shows up in the BamAlignment struct
+       //string readGroupTag;
+       //const unsigned int readGroupTagLen = 3 + mReadGroupLUT[al.ReadGroupCode].NameLen + 1;
+       //readGroupTag.resize(readGroupTagLen);
+       //char* pReadGroupTag = (char*)readGroupTag.data();
+       //sprintf(pReadGroupTag, "RGZ%s", mReadGroupLUT[al.ReadGroupCode].Name);
+
+       // assign the BAM core data
+       unsigned int buffer[8];
+       buffer[0] = al.RefID;
+       buffer[1] = al.Position;
+       buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
+       buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
+       buffer[4] = queryLen;
+       buffer[5] = al.MateRefID;
+       buffer[6] = al.MatePosition;
+       buffer[7] = al.InsertSize;
+
+       // write the block size
+       const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen /* + readGroupTagLen*/;
+       const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
+
+       BgzfWrite((char*)&blockSize, SIZEOF_INT);
+
+       // write the BAM core
+       BgzfWrite((char*)&buffer, BAM_CORE_SIZE);
+
+       // write the query name
+       BgzfWrite(al.Name.c_str(), nameLen);
+
+       // write the packed cigar
+       BgzfWrite(packedCigar.data(), packedCigarLen);
+
+       // write the encoded query sequence
+       BgzfWrite(encodedQuery.data(), encodedQueryLen);
+
+       // write the base qualities
+       string baseQualities = al.Qualities;
+       char* pBaseQualities = (char*)al.Qualities.data();
+       for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33;
+       BgzfWrite(pBaseQualities, queryLen);
+
+       // write the read group tag
+       // TODO: add read group tag support when it shows up in the BamAlignment struct
+       //BgzfWrite(readGroupTag.c_str(), readGroupTagLen);
+}
diff --git a/BamWriter.h b/BamWriter.h
new file mode 100644 (file)
index 0000000..422d1f9
--- /dev/null
@@ -0,0 +1,145 @@
+// ***************************************************************************
+// BamWriter (c) 2009 Michael Strömberg
+// Marth Lab, Deptartment of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include "BamAlignment.h"
+
+using namespace std;
+
+// our zlib constants
+#define GZIP_ID1             31
+#define GZIP_ID2            139
+#define CM_DEFLATE            8
+#define FLG_FEXTRA            4
+#define OS_UNKNOWN          255
+#define BGZF_XLEN             6
+#define BGZF_ID1             66
+#define BGZF_ID2             67
+#define BGZF_LEN              2
+#define GZIP_WINDOW_BITS    -15
+#define Z_DEFAULT_MEM_LEVEL   8
+
+// our BZGF constants
+#define BLOCK_HEADER_LENGTH    18
+#define BLOCK_FOOTER_LENGTH     8
+#define MAX_BLOCK_SIZE      65536
+#define DEFAULT_BLOCK_SIZE  65536
+
+// our BAM constants
+#define BAM_CORE_SIZE  32
+#define BAM_CMATCH      0
+#define BAM_CINS        1
+#define BAM_CDEL        2
+#define BAM_CREF_SKIP   3
+#define BAM_CSOFT_CLIP  4
+#define BAM_CHARD_CLIP  5
+#define BAM_CPAD        6
+#define BAM_CIGAR_SHIFT 4
+
+#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
+
+// our variable sizes
+#define SIZEOF_INT 4
+
+// define our BZGF structure
+struct BgzfData {
+       unsigned int UncompressedBlockSize;
+       unsigned int CompressedBlockSize;
+       unsigned int BlockLength;
+       unsigned int BlockOffset;
+       uint64_t BlockAddress;
+       bool IsOpen;
+       FILE* Stream;
+       char* UncompressedBlock;
+       char* CompressedBlock;
+
+       // constructor
+       BgzfData(void)
+               : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
+               , CompressedBlockSize(MAX_BLOCK_SIZE)
+               , BlockLength(0)
+               , BlockOffset(0)
+               , BlockAddress(0)
+               , IsOpen(false)
+               , Stream(NULL)
+               , UncompressedBlock(NULL)
+               , CompressedBlock(NULL)
+       {
+               try {
+                       CompressedBlock   = new char[CompressedBlockSize];
+                       UncompressedBlock = new char[UncompressedBlockSize];
+               } catch(bad_alloc&) {
+                       printf("ERROR: Unable to allocate memory for our BGZF object.\n");
+                       exit(1);
+               }
+       }
+
+       // destructor
+       ~BgzfData(void) {
+               if(CompressedBlock)   delete [] CompressedBlock;
+               if(UncompressedBlock) delete [] UncompressedBlock;
+       }
+};
+
+class BamWriter {
+public:
+       // constructor
+       BamWriter(void);
+       // destructor
+       ~BamWriter(void);
+       // closes the alignment archive
+       void Close(void);
+       // opens the alignment archive
+       void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences);
+       // saves the alignment to the alignment archive
+       void SaveAlignment(const BamAlignment& al);
+private:
+       // closes the BAM file
+       void BgzfClose(void);
+       // compresses the current block
+       int BgzfDeflateBlock(void);
+       // flushes the data in the BGZF block
+       void BgzfFlushBlock(void);
+       // opens the BAM file for writing
+       void BgzfOpen(const string& filename);
+       // packs an unsigned integer into the specified buffer
+       static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value);
+       // packs an unsigned short into the specified buffer
+       static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value);
+       // writes the supplied data into the BGZF buffer
+       unsigned int BgzfWrite(const char* data, const unsigned int dataLen);
+       // calculates the minimum bin that contains a region [begin, end)
+       static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end);
+       // creates a packed cigar string from the supplied alignment
+       static void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);
+       // encodes the supplied query sequence into 4-bit notation
+       static void EncodeQuerySequence(const string& query, string& encodedQuery);
+       // our BGZF output object
+       BgzfData mBGZF;
+};
+
+// packs an unsigned integer into the specified buffer
+inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) {
+       buffer[0] = (char)value;
+       buffer[1] = (char)(value >> 8);
+       buffer[2] = (char)(value >> 16);
+       buffer[3] = (char)(value >> 24);
+}
+
+// packs an unsigned short into the specified buffer
+inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) {
+       buffer[0] = (char)value;
+       buffer[1] = (char)(value >> 8);
+}
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..41ba820
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,35 @@
+CC=            gcc
+CXX=           g++
+CFLAGS=                -Wall -pg -O3 -m64
+CXXFLAGS=      $(CFLAGS)
+DFLAGS=                -D_IOLIB=2 #-D_NDEBUG
+OBJS=          BamReader.o bgzf.o
+PROG=          BamReaderTest
+INCLUDES=       
+ARFLAGS=       -crs
+LIBS=          -lz
+SUBDIRS=       .
+
+.SUFFIXES:.c .cpp .o
+
+.c.o:
+               $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+.cpp.o:
+               $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all: $(PROG) BamConversion
+
+lib:libbambc.a
+
+libbambc.a:$(OBJS)
+               $(AR) $(ARFLAGS) $@ $(OBJS)
+
+BamReaderTest:lib BamReaderMain.o
+               $(CXX) $(CXXFLAGS) -o $@ BamReaderMain.o $(LIBS) -L. -lbambc
+
+BamConversion: lib BamWriter.o BamConversionMain.o
+       $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamConversionMain.o $(LIBS) -L. -lbambc
+
+clean:
+               rm -fr gmon.out *.o *.a a.out $(PROG) *~
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..639bfd1
--- /dev/null
+++ b/README
@@ -0,0 +1,11 @@
+---------------
+README
+----------------
+
+Copy the 4 header files (*.h) and the library file (libbambc.a) to your directory.
+
+When you compile your program, add this to the g++ command:
+
+           -lz -L. -lbambc
+
+and you should be good to go.  (Those are lower case l's, as in 'lion'.)
diff --git a/STLUtilities.h b/STLUtilities.h
new file mode 100644 (file)
index 0000000..7a9fbc5
--- /dev/null
@@ -0,0 +1,146 @@
+// STLUtilities.h\r
+\r
+// Derek Barnett\r
+// Marth Lab, Boston College\r
+// Last modified: 19 March 2009\r
+\r
+#ifndef STL_UTILITIES_H\r
+#define STL_UTILITIES_H\r
+\r
+#include <algorithm>\r
+using std::distance;\r
+using std::find;\r
+using std::transform;\r
+\r
+#include <functional>\r
+using std::unary_function;\r
+\r
+#include <iterator>\r
+using std::iterator_traits;\r
+\r
+#include <string>\r
+using std::string;\r
+\r
+#include <utility>\r
+using std::pair;\r
+\r
+#include <vector>\r
+using std::vector;\r
+\r
+// -------------------------------------------------------------------------------- //\r
+// This file contains a sample of STL tricks I've gathered along the way.\r
+//\r
+// Many thanks throughout to 'Effective' series by Scott Meyers.\r
+// Some code is lifted (almost) verbatim from his texts where applicable.\r
+// ------------------------------------------------------------------------------- //\r
+\r
+// --------------------------------------------------------------------------------------------------------------------------------- //\r
+// STL containers will delete the values they hold when the container is deleted or goes out of scope\r
+// *** If the container contains pointers, however, they WILL NOT delete the actual pointed-to objects ***\r
+// This struct allows for easy algorithm processing (instead of pesky loops and iterator-boundary issues) to delete pointed-to objects (for any C++ type) that have been allocated with 'new')\r
+// Usage example:    for_each( container.begin(), container.end(), DeleteObject() ) \r
+//\r
+// Unless of course, you like quirky, hard-to-spot memory leaks... then feel free to disregard this little STL lesson   =)\r
+// --------------------------------------------------------------------------------------------------------------------------------- //\r
+struct DeleteObject {\r
+       template<typename T>\r
+       void operator() (const T* p) const { \r
+               delete p; \r
+       }\r
+};\r
+\r
+template<typename T>\r
+void ClearPointerVector(vector<T>& v) {\r
+       if ( !v.empty() ) {\r
+               for_each(v.begin(), v.end(), DeleteObject());\r
+               v.clear();\r
+       }\r
+}\r
+\r
+// --------------------------------------------------------------------------------------------------------------------------------- //\r
+// Query a vector (other containers? havent tried) for an element\r
+// Returns the index of that element\r
+// Returns vector::size() if not found\r
+// Works with reverse iterators as well... index is counted backward from last element though\r
+// --------------------------------------------------------------------------------------------------------------------------------- //\r
+template < typename InputIterator, typename EqualityComparable >\r
+typename iterator_traits<InputIterator>::difference_type\r
+Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {\r
+       return distance(begin, find(begin, end, item));\r
+}\r
+\r
+// ----------------------------------------------------------------------------------------------------------------------//\r
+// This next section is a sort of work-around for the bulky associative maps\r
+// The idea is to use a *SORTED* vector of pair objects as a lookup vector\r
+// LookupVector = vector< pair<Key, Value> > \r
+// ----------------------------------------------------------------------------------------------------------------------//\r
+\r
+// The following template classes allow a templatized comparison function for sorting & searching lookup vector\r
+\r
+// allows sorting by Key ( less<Key> )\r
+template <typename Key, typename Value>\r
+class LookupKeyCompare {\r
+\r
+       typedef pair< Key, Value > LookupData;\r
+       typedef typename LookupData::first_type Key_t;\r
+       \r
+       public:\r
+               bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }\r
+               bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }\r
+               bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }\r
+       private:\r
+               bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }\r
+};\r
+\r
+// allows sorting by Value ( less<Value> )\r
+template <typename Key, typename Value>\r
+class LookupValueCompare {\r
+\r
+       typedef pair< Key, Value > LookupData;\r
+       typedef typename LookupData::second_type Value_t;\r
+       \r
+       public:\r
+               bool operator() (const LookupData& lhs, const LookupData& rhs) const { return valueLess(lhs.second, rhs.second); }\r
+               bool operator() (const LookupData& lhs, const Value_t& k) const { return valueLess(lhs.second, k); }\r
+               bool operator() (const Value_t& k, const LookupData& rhs) const { return valueLess(k, rhs.second); }\r
+       private:\r
+               bool valueLess(const Value_t& k1, const Value_t& k2) const { return k1 < k2; }\r
+};\r
+\r
+// The following template functions/structs allow you to pull all keys or all values from this lookup structure\r
+\r
+// pull Key from a data pair\r
+template<typename Key, typename Value>\r
+struct RipKey:\r
+       public unary_function< pair<Key,Value>, Key> {\r
+               Key operator() (pair<Key,Value> dataPair) { \r
+                       return dataPair.first; \r
+               }\r
+};\r
+\r
+// pull Value from a data pair\r
+template<typename Key, typename Value>\r
+struct RipValue:\r
+       public unary_function< pair<Key,Value>, Value> {\r
+               Value operator() (pair<Key,Value> dataPair) { \r
+                       return dataPair.second; \r
+               }\r
+};\r
+\r
+// pull all Keys from lookup, store in dest (overwrite contents of dest)\r
+template <typename Key, typename Value>\r
+void RipKeys( vector< pair<Key,Value> >& lookup, vector<Key>& dest) {\r
+       dest.clear();\r
+       dest.reserve(lookup.size());\r
+       transform(lookup.begin(), lookup.end(), back_inserter(dest), RipKey<Key,Value>());\r
+}\r
+\r
+// pull all Values from lookup, store in dest (overwrite contents of dest)\r
+template <typename Key, typename Value>\r
+void RipValues( vector< pair<Key,Value> >& lookup, vector<Value>& dest) {\r
+       dest.clear();\r
+       dest.reserve(lookup.size());\r
+       transform(lookup.begin(), lookup.end(), back_inserter(dest), RipValue<Key,Value>());\r
+}\r
+\r
+#endif /* STL_UTILITIES_H */\r
diff --git a/bgzf.c b/bgzf.c
new file mode 100644 (file)
index 0000000..4314c70
--- /dev/null
+++ b/bgzf.c
@@ -0,0 +1,488 @@
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "bgzf.h"
+
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+
+typedef int8_t byte;
+
+static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
+static const int MAX_BLOCK_SIZE = 64 * 1024;
+
+static const int BLOCK_HEADER_LENGTH = 18;
+static const int BLOCK_FOOTER_LENGTH = 8;
+
+static const int GZIP_ID1 = 31;
+static const int GZIP_ID2 = 139;
+static const int CM_DEFLATE = 8;
+static const int FLG_FEXTRA = 4;
+static const int OS_UNKNOWN = 255;
+static const int BGZF_ID1 = 66; // 'B'
+static const int BGZF_ID2 = 67; // 'C'
+static const int BGZF_LEN = 2;
+static const int BGZF_XLEN = 6; // BGZF_LEN+4
+
+static const int GZIP_WINDOW_BITS = -15; // no zlib header
+static const int Z_DEFAULT_MEM_LEVEL = 8;
+
+
+inline
+void
+packInt16(uint8_t* buffer, uint16_t value)
+{
+    buffer[0] = value;
+    buffer[1] = value >> 8;
+}
+
+inline
+int
+unpackInt16(const uint8_t* buffer)
+{
+    return (buffer[0] | (buffer[1] << 8));
+}
+
+inline
+void
+packInt32(uint8_t* buffer, uint32_t value)
+{
+    buffer[0] = value;
+    buffer[1] = value >> 8;
+    buffer[2] = value >> 16;
+    buffer[3] = value >> 24;
+}
+
+inline
+int
+min(int x, int y)
+{
+    return (x < y) ? x : y;
+}
+
+static
+void
+report_error(BGZF* fp, const char* message) {
+    fp->error = message;
+}
+
+static
+BGZF*
+open_read(int fd)
+{
+    FILE* file = fdopen(fd, "r");
+    BGZF* fp = malloc(sizeof(BGZF));
+    fp->file_descriptor = fd;
+    fp->open_mode = 'r';
+    fp->owned_file = 0;
+    fp->file = file;
+    fp->uncompressed_block_size = MAX_BLOCK_SIZE;
+    fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
+    fp->compressed_block_size = MAX_BLOCK_SIZE;
+    fp->compressed_block = malloc(MAX_BLOCK_SIZE);
+    fp->block_address = 0;
+    fp->block_offset = 0;
+    fp->block_length = 0;
+    fp->error = NULL;
+    return fp;
+}
+
+static
+BGZF*
+open_write(int fd)
+{
+    FILE* file = fdopen(fd, "w");
+    BGZF* fp = malloc(sizeof(BGZF));
+    fp->file_descriptor = fd;
+    fp->open_mode = 'w';
+    fp->owned_file = 0;
+    fp->file = file;
+    fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
+    fp->uncompressed_block = NULL;
+    fp->compressed_block_size = MAX_BLOCK_SIZE;
+    fp->compressed_block = malloc(MAX_BLOCK_SIZE);
+    fp->block_address = 0;
+    fp->block_offset = 0;
+    fp->block_length = 0;
+    fp->error = NULL;
+    return fp;
+}
+
+BGZF*
+bgzf_open(const char* __restrict path, const char* __restrict mode)
+{
+    BGZF* fp = NULL;
+    if (strcasecmp(mode, "r") == 0) {
+       int oflag = O_RDONLY;
+       int fd = open(path, oflag);
+        fp = open_read(fd);
+    } else if (strcasecmp(mode, "w") == 0) {
+       int oflag = O_WRONLY | O_CREAT | O_TRUNC;
+       int fd = open(path, oflag, 0644);
+        fp = open_write(fd);
+    }
+    if (fp != NULL) {
+        fp->owned_file = 1;
+    }
+    return fp;
+}
+
+BGZF*
+bgzf_fdopen(int fd, const char * __restrict mode)
+{
+    if (strcasecmp(mode, "r") == 0) {
+        return open_read(fd);
+    } else if (strcasecmp(mode, "w") == 0) {
+        return open_write(fd);
+    } else {
+        return NULL;
+    }
+}
+
+static
+int
+deflate_block(BGZF* fp, int block_length)
+{
+    // Deflate the block in fp->uncompressed_block into fp->compressed_block.
+    // Also adds an extra field that stores the compressed block length.
+
+    byte* buffer = fp->compressed_block;
+    int buffer_size = fp->compressed_block_size;
+
+    // Init gzip header
+    buffer[0] = GZIP_ID1;
+    buffer[1] = GZIP_ID2;
+    buffer[2] = CM_DEFLATE;
+    buffer[3] = FLG_FEXTRA;
+    buffer[4] = 0; // mtime
+    buffer[5] = 0;
+    buffer[6] = 0;
+    buffer[7] = 0;
+    buffer[8] = 0;
+    buffer[9] = OS_UNKNOWN;
+    buffer[10] = BGZF_XLEN;
+    buffer[11] = 0;
+    buffer[12] = BGZF_ID1;
+    buffer[13] = BGZF_ID2;
+    buffer[14] = BGZF_LEN;
+    buffer[15] = 0;
+    buffer[16] = 0; // placeholder for block length
+    buffer[17] = 0;
+
+    // loop to retry for blocks that do not compress enough
+    int input_length = block_length;
+    int compressed_length = 0;
+    while (1) {
+
+        z_stream zs;
+        zs.zalloc = NULL;
+        zs.zfree = NULL;
+        zs.next_in = fp->uncompressed_block;
+        zs.avail_in = input_length;
+        zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
+        zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+        int status = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
+                                  GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+        if (status != Z_OK) {
+            report_error(fp, "deflate init failed");
+            return -1;
+        }
+        status = deflate(&zs, Z_FINISH);
+        if (status != Z_STREAM_END) {
+            deflateEnd(&zs);
+            if (status == Z_OK) {
+                // Not enough space in buffer.
+                // Can happen in the rare case the input doesn't compress enough.
+                // Reduce the amount of input until it fits.
+                input_length -= 1024;
+                if (input_length <= 0) {
+                    // should never happen
+                    report_error(fp, "input reduction failed");
+                    return -1;
+                }
+                continue;
+            }
+            report_error(fp, "deflate failed");
+            return -1;
+        }
+        status = deflateEnd(&zs);
+        if (status != Z_OK) {
+            report_error(fp, "deflate end failed");
+            return -1;
+        }
+        compressed_length = zs.total_out;
+        compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+        if (compressed_length > MAX_BLOCK_SIZE) {
+            // should never happen
+            report_error(fp, "deflate overflow");
+            return -1;
+        }
+        break;
+    }
+
+    packInt16((uint8_t*)&buffer[16], compressed_length-1);
+    uint32_t crc = crc32(0L, NULL, 0L);
+    crc = crc32(crc, fp->uncompressed_block, input_length);
+    packInt32((uint8_t*)&buffer[compressed_length-8], crc);
+    packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
+
+    int remaining = block_length - input_length;
+    if (remaining > 0) {
+        if (remaining > input_length) {
+            // should never happen (check so we can use memcpy)
+            report_error(fp, "remainder too large");
+            return -1;
+        }
+        memcpy(fp->uncompressed_block,
+               fp->uncompressed_block + input_length,
+               remaining);
+    }
+    fp->block_offset = remaining;
+    return compressed_length;
+}
+
+static
+int
+inflate_block(BGZF* fp, int block_length)
+{
+    // Inflate the block in fp->compressed_block into fp->uncompressed_block
+
+    z_stream zs;
+    zs.zalloc = NULL;
+    zs.zfree = NULL;
+    zs.next_in = fp->compressed_block + 18;
+    zs.avail_in = block_length - 16;
+    zs.next_out = fp->uncompressed_block;
+    zs.avail_out = fp->uncompressed_block_size;
+
+    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+    if (status != Z_OK) {
+        report_error(fp, "inflate init failed");
+        return -1;
+    }
+    status = inflate(&zs, Z_FINISH);
+    if (status != Z_STREAM_END) {
+        inflateEnd(&zs);
+        report_error(fp, "inflate failed");
+        return -1;
+    }
+    status = inflateEnd(&zs);
+    if (status != Z_OK) {
+        report_error(fp, "inflate failed");
+        return -1;
+    }
+    return zs.total_out;
+}
+
+static
+int
+check_header(const byte* header)
+{
+    return (header[0] == GZIP_ID1 &&
+            header[1] == (byte) GZIP_ID2 &&
+            header[2] == Z_DEFLATED &&
+            (header[3] & FLG_FEXTRA) != 0 &&
+            unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
+            header[12] == BGZF_ID1 &&
+            header[13] == BGZF_ID2 &&
+            unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
+}
+
+static
+int
+read_block(BGZF* fp)
+{
+    byte header[BLOCK_HEADER_LENGTH];
+    int64_t block_address = ftello(fp->file);
+    int count = fread(header, 1, sizeof(header), fp->file);
+    if (count == 0) {
+        fp->block_length = 0;
+        return 0;
+    }
+    if (count != sizeof(header)) {
+        report_error(fp, "read failed");
+        return -1;
+    }
+    if (!check_header(header)) {
+        report_error(fp, "invalid block header");
+        return -1;
+    }
+    int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
+    byte* compressed_block = (byte*) fp->compressed_block;
+    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+    int remaining = block_length - BLOCK_HEADER_LENGTH;
+    count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
+    if (count != remaining) {
+        report_error(fp, "read failed");
+        return -1;
+    }
+    count = inflate_block(fp, block_length);
+    if (count < 0) {
+        return -1;
+    }
+    if (fp->block_length != 0) {
+        // Do not reset offset if this read follows a seek.
+        fp->block_offset = 0;
+    }
+    fp->block_address = block_address;
+    fp->block_length = count;
+    return 0;
+}
+
+int
+bgzf_read(BGZF* fp, void* data, int length)
+{
+    if (length <= 0) {
+        return 0;
+    }
+    if (fp->open_mode != 'r') {
+        report_error(fp, "file not open for reading");
+        return -1;
+    }
+
+    int bytes_read = 0;
+    byte* output = data;
+    while (bytes_read < length) {
+        int available = fp->block_length - fp->block_offset;
+        if (available <= 0) {
+            if (read_block(fp) != 0) {
+                return -1;
+            }
+            available = fp->block_length - fp->block_offset;
+            if (available <= 0) {
+                break;
+            }
+        }
+        int copy_length = min(length-bytes_read, available);
+        byte* buffer = fp->uncompressed_block;
+        memcpy(output, buffer + fp->block_offset, copy_length);
+        fp->block_offset += copy_length;
+        output += copy_length;
+        bytes_read += copy_length;
+    }
+    if (fp->block_offset == fp->block_length) {
+        fp->block_address = ftello(fp->file);
+        fp->block_offset = 0;
+        fp->block_length = 0;
+    }
+    return bytes_read;
+}
+
+static
+int
+flush_block(BGZF* fp)
+{
+    while (fp->block_offset > 0) {
+        int block_length = deflate_block(fp, fp->block_offset);
+        if (block_length < 0) {
+            return -1;
+        }
+        int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
+        if (count != block_length) {
+            report_error(fp, "write failed");
+            return -1;
+        }
+        fp->block_address += block_length;
+    }
+    return 0;
+}
+
+int
+bgzf_write(BGZF* fp, const void* data, int length)
+{
+    if (fp->open_mode != 'w') {
+        report_error(fp, "file not open for writing");
+        return -1;
+    }
+
+    if (fp->uncompressed_block == NULL) {
+        fp->uncompressed_block = malloc(fp->uncompressed_block_size);
+    }
+
+    const byte* input = data;
+    int block_length = fp->uncompressed_block_size;
+    int bytes_written = 0;
+    while (bytes_written < length) {
+        int copy_length = min(block_length - fp->block_offset, length - bytes_written);
+        byte* buffer = fp->uncompressed_block;
+        memcpy(buffer + fp->block_offset, input, copy_length);
+        fp->block_offset += copy_length;
+        input += copy_length;
+        bytes_written += copy_length;
+        if (fp->block_offset == block_length) {
+            if (flush_block(fp) != 0) {
+                break;
+            }
+        }
+    }
+    return bytes_written;
+}
+
+int
+bgzf_close(BGZF* fp)
+{
+    if (fp->open_mode == 'w') {
+        if (flush_block(fp) != 0) {
+            return -1;
+        }
+        if (fflush(fp->file) != 0) {
+            report_error(fp, "flush failed");
+            return -1;
+        }
+    }
+    if (fp->owned_file) {
+        if (fclose(fp->file) != 0) {
+            return -1;
+        }
+    }
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+    free(fp);
+    return 0;
+}
+
+int64_t
+bgzf_tell(BGZF* fp)
+{
+    return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF));
+}
+
+int64_t
+bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+    if (fp->open_mode != 'r') {
+        report_error(fp, "file not open for read");
+        return -1;
+    }
+    if (where != SEEK_SET) {
+        report_error(fp, "unimplemented seek option");
+        return -1;
+    }
+    int block_offset = pos & 0xFFFF;
+    int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
+    if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
+        report_error(fp, "seek failed");
+        return -1;
+    }
+    fp->block_length = 0;  // indicates current block is not loaded
+    fp->block_address = block_address;
+    fp->block_offset = block_offset;
+    return 0;
+}
+
diff --git a/bgzf.h b/bgzf.h
new file mode 100644 (file)
index 0000000..7e76bcb
--- /dev/null
+++ b/bgzf.h
@@ -0,0 +1,102 @@
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+
+#ifndef __BGZF_H
+#define __BGZF_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include "zlib.h"
+#include <stdbool.h>
+//#include "zutil.h"
+
+//typedef int8_t bool;
+
+typedef struct {
+    int file_descriptor;
+    char open_mode;  // 'r' or 'w'
+    bool owned_file;
+    FILE* file;
+    int uncompressed_block_size;
+    int compressed_block_size;
+    void* uncompressed_block;
+    void* compressed_block;
+    int64_t block_address;
+    int block_length;
+    int block_offset;
+    const char* error;
+} BGZF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Open an existing file descriptor for reading or writing.
+ * Mode must be either "r" or "w".
+ * A subsequent bgzf_close will not close the file descriptor.
+ * Returns null on error.
+ */
+BGZF* bgzf_fdopen(int fd, const char* __restrict mode);
+
+/*
+ * Open the specified file for reading or writing.
+ * Mode must be either "r" or "w".
+ * Returns null on error.
+ */
+BGZF* bgzf_open(const char* path, const char* __restrict mode);
+
+/*
+ * Close the BGZ file and free all associated resources.
+ * Does not close the underlying file descriptor if created with bgzf_fdopen.
+ * Returns zero on success, -1 on error.
+ */
+int bgzf_close(BGZF* fp);
+
+/*
+ * Read up to length bytes from the file storing into data.
+ * Returns the number of bytes actually read.
+ * Returns zero on end of file.
+ * Returns -1 on error.
+ */
+int bgzf_read(BGZF* fp, void* data, int length);
+
+/*
+ * Write length bytes from data to the file.
+ * Returns the number of bytes written.
+ * Returns -1 on error.
+ */
+int bgzf_write(BGZF* fp, const void* data, int length);
+
+/*
+ * Return a virtual file pointer to the current location in the file.
+ * No interpetation of the value should be made, other than a subsequent
+ * call to bgzf_seek can be used to position the file at the same point.
+ * Return value is non-negative on success.
+ * Returns -1 on error.
+ */
+int64_t bgzf_tell(BGZF* fp);
+
+/*
+ * Set the file to read from the location specified by pos, which must
+ * be a value previously returned by bgzf_tell for this file (but not
+ * necessarily one returned by this file handle).
+ * The where argument must be SEEK_SET.
+ * Seeking on a file opened for write is not supported.
+ * Returns zero on success, -1 on error.
+ */
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif