// ***************************************************************************
-// BamReader (c) 2009 Derek Barnett
-// Marth Lab, Deptartment of Biology, Boston College
+// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
+// Last modified: 24 June 2009 (DB)
+// ---------------------------------------------------------------------------
+// The BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
// Provides the basic functionality for reading BAM files
// ***************************************************************************
-#pragma once
+/*!
+ \file BamReader.h
+ \brief API for reading BAM files.
+*/
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <zlib.h>
+#pragma once
+// C++ includes
#include <algorithm>
+#include <iterator>
#include <string>
#include <utility>
#include <vector>
-using namespace std;
-
-#include "BamAlignment.h"
-
-// our zlib constants
-#define GZIP_ID1 31
-#define GZIP_ID2 139
-#define CM_DEFLATE 8
-#define FLG_FEXTRA 4
-#define OS_UNKNOWN 255
-#define BGZF_XLEN 6
-#define BGZF_ID1 66
-#define BGZF_ID2 67
-#define BGZF_LEN 2
-#define GZIP_WINDOW_BITS -15
-#define Z_DEFAULT_MEM_LEVEL 8
-
-// our BZGF constants
-#define BLOCK_HEADER_LENGTH 18
-#define BLOCK_FOOTER_LENGTH 8
-#define MAX_BLOCK_SIZE 65536
-#define DEFAULT_BLOCK_SIZE 65536
-// our BAM constants
-#define BAM_CORE_SIZE 32
-#define BAM_CMATCH 0
-#define BAM_CINS 1
-#define BAM_CDEL 2
-#define BAM_CREF_SKIP 3
-#define BAM_CSOFT_CLIP 4
-#define BAM_CHARD_CLIP 5
-#define BAM_CPAD 6
-#define BAM_CIGAR_SHIFT 4
-#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
+// zlib includes
+#include <zlib.h>
-// BAM indexing constants
-#define MAX_BIN 37450 // =(8^6-1)/7+1
-#define BAM_MIN_CHUNK_GAP 32768
-#define BAM_LIDX_SHIFT 14
+// BamTools includes
+#include "BamAux.h"
-// our variable sizes
-#define SIZEOF_INT 4
+namespace BamTools {
-// define our BZGF structure
-#ifndef BGZF_DATA
-#define BGZF_DATA
-struct BgzfData {
- unsigned int UncompressedBlockSize;
- unsigned int CompressedBlockSize;
- unsigned int BlockLength;
- unsigned int BlockOffset;
- uint64_t BlockAddress;
- bool IsOpen;
- FILE* Stream;
- char* UncompressedBlock;
- char* CompressedBlock;
+ //! API for reading BAM files.
+ class BamReader {
+
+ public:
+
+ //! Constructor
+ BamReader(void);
+
+ //! Destructor
+ ~BamReader(void);
+
+ public:
+
+ /*!
+ \brief Closes the BAM file.
+
+ Also closes index file and clears index data, if provided.
+
+ \sa Open()
+ */
+ void Close(void);
+
+ /*!
+ \brief Access SAM format header data.
+
+ See SAM format documentation for detailed header description.
+
+ \return Full header text (no parsing of tags)
+ */
+ const std::string GetHeaderText(void) const;
+
+ /*!
+ \brief Retrieve next alignment.
+
+ Stores result in bAlignment.
+
+ If reference and position are specified by a prior call to Jump(), this method stores the next aligmment that either:
+ a) overlaps, or
+ b) begins on/after that specified position.
+
+ Otherwise this simply stores next alignment, if one exists.
+
+ Note that this method does not specifiy a 'right bound' position.
+ If a range is desired, you should supply some stopping criteria. For example:
+
+ \code
+ BamReader bReader;
+ bReader.Open(bamFile, bamIndexfile);
+ if ( bReader.Jump( someID, somePosition ) {
+ BamAlignment bAlignment;
+ while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= someUpperBound) ) {
+ // do something
+ }
+ }
+ \endcode
+
+ \param bAlignment destination for alignment data
+ \return success/failure
+ \sa Jump(), Rewind()
+ */
+ bool GetNextAlignment(BamAlignment& bAlignment);
+
+ /*!
+ \brief Get number of reference sequences in BAM file.
+ \return Number of references
+ \sa GetReferenceData(), GetReferenceID()
+ */
+ const int GetReferenceCount(void) const;
+
+ /*!
+ \brief Access reference data.
+ \return Vector of RefData entry
+ \sa GetReferenceCount(), GetReferenceID()
+ */
+ const RefVector GetReferenceData(void) const;
+
+ /*!
+ \brief Get reference ID from name.
+ \param refName name of reference sequence
+ \return reference ID number
+ \sa GetReferenceCount(), GetReferenceData()
+ */
+ const int GetReferenceID(const std::string& refName) const;
+
+ /*!
+ \brief Random access in BAM file.
+
+ Jump to a specified position on reference sequence.
+ Position is optional - defaults to beginning of specified reference.
+
+ Reference and position are stored for use by subsequent calls to GetNextAlignment().
+
+ \param refID ID number of desired reference
+ \param position left-bound position
+ \return success/failure
+ \sa GetNextAlignment(), Rewind()
+ */
+ bool Jump(int refID, unsigned int position = 0);
+
+ /*!
+ \brief Opens a BAM file.
+
+ Index file is optional - sequential reading through a BAM file does not require an index.
+
+ However, the index is required to perform random-access alignment retrival (using the Jump() method ).
+
+ See SAMtools documentation for BAM index generation.
+
+ \param filename BAM file
+ \param indexFilename BAM index file
+ \sa Jump(), Close()
+ */
+ void Open(const std::string& filename, const std::string& indexFilename = "");
+
+ /*!
+ \brief Moves file pointer to beginning of alignment data.
+
+ A subsequent call to GetNextAlignment() would retrieve the first alignment in the BAM file.
+ Clears any reference and position set by a prior call to Jump()
+
+ \return success/failure
+ \sa GetNextAlignment(), Jump()
+ */
+ bool Rewind(void);
+
+ // --------------------------------------------------------------------------------------
+ // internal methods
+ private:
+ // checks BGZF block header
+ bool BgzfCheckBlockHeader(char* header);
+ // closes the BAM file
+ void BgzfClose(void);
+ // de-compresses the current block
+ int BgzfInflateBlock(int blockLength);
+ // opens the BAM file for reading
+ void BgzfOpen(const std::string& filename);
+ // reads BGZF data into a byte buffer
+ unsigned int BgzfRead(char* data, const unsigned int dataLen);
+ // reads BGZF block
+ int BgzfReadBlock(void);
+ // seek to position in BAM file
+ bool BgzfSeek(int64_t position);
+ // get file position in BAM file
+ int64_t BgzfTell(void);
+ // unpacks a buffer into an unsigned int
+ static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
+ // unpacks a buffer into an unsigned short
+ static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
+ // calculate bins that overlap region ( left to reference end for now )
+ int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
+ // calculates alignment end position based on starting position and provided CIGAR operations
+ unsigned int CalculateAlignmentEnd(const unsigned int& position, const std::vector<CigarOp>& cigarData);
+ // clear out (delete pointers in) index data structure
+ void ClearIndex(void);
+ // calculate file offset for first alignment chunk overlapping 'left'
+ int64_t GetOffset(int refID, unsigned int left);
+ // checks to see if alignment overlaps current region
+ bool IsOverlap(BamAlignment& bAlignment);
+ // retrieves header text from BAM file
+ void LoadHeaderData(void);
+ // builds BamIndex data structure from BAM index file
+ void LoadIndexData(FILE* indexStream);
+ // retrieves BAM alignment under file pointer
+ bool LoadNextAlignment(BamAlignment& bAlignment);
+ // builds reference data structure from BAM file
+ void LoadReferenceData(void);
+ // open BAM index file (if successful, loads index)
+ void OpenIndex(const std::string& indexFilename);
+
+ // aligment file & index file data
+ private:
+ BgzfData m_BGZF;
+ std::string m_headerText;
+ BamIndex* m_index;
+ RefVector m_references;
+ bool m_isIndexLoaded;
+ int64_t m_alignmentsBeginOffset;
+
+ // user-specified region values
+ private:
+ bool m_isRegionSpecified;
+ int m_currentRefID;
+ unsigned int m_currentLeft;
+
+ // BAM character constants
+ private:
+ static const char* DNA_LOOKUP;
+ static const char* CIGAR_LOOKUP;
+ };
- // constructor
- BgzfData(void)
- : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
- , CompressedBlockSize(MAX_BLOCK_SIZE)
- , BlockLength(0)
- , BlockOffset(0)
- , BlockAddress(0)
- , IsOpen(false)
- , Stream(NULL)
- , UncompressedBlock(NULL)
- , CompressedBlock(NULL)
- {
- try {
- CompressedBlock = new char[CompressedBlockSize];
- UncompressedBlock = new char[UncompressedBlockSize];
- } catch(bad_alloc&) {
- printf("ERROR: Unable to allocate memory for our BGZF object.\n");
- exit(1);
- }
+ //! \cond
+ // --------------------------------------------------------------------------------------
+ // static inline methods (internal - can exclude from main documentation)
+
+ // unpacks a buffer into an unsigned int
+ inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
+ union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
+ un.valueBuffer[0] = buffer[0];
+ un.valueBuffer[1] = buffer[1];
+ un.valueBuffer[2] = buffer[2];
+ un.valueBuffer[3] = buffer[3];
+ return un.value;
}
- // destructor
- ~BgzfData(void) {
- if(CompressedBlock) delete [] CompressedBlock;
- if(UncompressedBlock) delete [] UncompressedBlock;
+ // unpacks a buffer into an unsigned short
+ inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
+ union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
+ un.valueBuffer[0] = buffer[0];
+ un.valueBuffer[1] = buffer[1];
+ return un.value;
}
-};
-#endif // BGZF_DATA
-
-
-// --------------------------- //
-// BamIndex-related typedefs
-// --------------------------- //
-
-// offset for linear indexing
-typedef vector<uint64_t> LinearOffsetVector;
-
-// alignment 'chunk' boundaries
-typedef pair<uint64_t, uint64_t> ChunkPair;
-typedef vector<ChunkPair> ChunkVector;
-
-// BAM bins.. each contains (binID, alignment 'chunks')
-typedef pair<uint32_t, ChunkVector*> BamBin;
-typedef vector<BamBin> BinVector;
-// each reference sequence has a BinVector and LinearOffsetVector
-typedef pair<BinVector*, LinearOffsetVector*> RefIndex;
-
-// full BamIndex defined as:
-typedef vector<RefIndex*> BamIndex;
-
-class BamReader {
-
- // constructor/destructor
- public:
- BamReader(void);
- ~BamReader(void);
-
- // public interface
- public:
- // closes the BAM file
- void Close(void);
- // retrieves header text
- const string GetHeaderText(void) const;
- // saves the alignment to the alignment archive
- bool GetNextAlignment(BamAlignment& bAlignment);
- // return number of reference sequences in BAM file
- const int GetReferenceCount(void) const;
- // return vector of RefData entries
- const RefVector GetReferenceData(void) const;
- // get refID from reference name
- const int GetReferenceID(const string& refName) const;
- // jumps to 'left' position on refID
- bool Jump(int refID, unsigned int left = 0);
- // opens the BAM file
- void Open(const string& filename, const string& indexFilename = "");
- // move file pointer back to first alignment
- bool Rewind(void);
-
- // internal methods
- private:
- // checks BGZF block header
- bool BgzfCheckBlockHeader(char* header);
- // closes the BAM file
- void BgzfClose(void);
- // de-compresses the current block
- int BgzfInflateBlock(int blockLength);
- // opens the BAM file for reading
- void BgzfOpen(const string& filename);
- // reads BGZF data into a byte buffer
- unsigned int BgzfRead(char* data, const unsigned int dataLen);
- // reads BGZF block
- int BgzfReadBlock(void);
- // seek to position in BAM file
- bool BgzfSeek(int64_t position);
- // get file position in BAM file
- int64_t BgzfTell(void);
- // unpacks a buffer into an unsigned int
- static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
- // unpacks a buffer into an unsigned short
- static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
- // calculate bins that overlap region ( left to reference end for now )
- int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
- // calculates alignment end position based on starting position and provided CIGAR operations
- unsigned int CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData);
- // clear out (delete pointers in) index data structure
- void ClearIndex(void);
- // calculate file offset for first alignment chunk overlapping 'left'
- int64_t GetOffset(int refID, unsigned int left);
- // checks to see if alignment overlaps current region
- bool IsOverlap(BamAlignment& bAlignment);
- // retrieves header text from BAM file
- void LoadHeaderData(void);
- // builds BamIndex data structure from BAM index file
- void LoadIndexData(FILE* indexStream);
- // retrieves BAM alignment under file pointer
- bool LoadNextAlignment(BamAlignment& bAlignment);
- // builds reference data structure from BAM file
- void LoadReferenceData(void);
- // open BAM index file (if successful, loads index)
- void OpenIndex(const string& indexFilename);
+ // --------------------------------------------------------------------------------------
+ // template classes/methods (internal - can exclude from main documentation)
- // aligment file & index file data
- private:
- BgzfData m_BGZF;
- string m_headerText;
- BamIndex* m_index;
- RefVector m_references;
- bool m_isIndexLoaded;
- int64_t m_alignmentsBeginOffset;
-
- // user-specified region values
- private:
- bool m_isRegionSpecified;
- int m_currentRefID;
- unsigned int m_currentLeft;
-
- // BAM character constants
- private:
- static const char* DNA_LOOKUP;
- static const char* CIGAR_LOOKUP;
-};
+ // allows sorting/searching of a vector of pairs (instead of using maps)
+ template <typename Key, typename Value>
+ class LookupKeyCompare {
-// unpacks a buffer into an unsigned int
-inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
- union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
- un.valueBuffer[0] = buffer[0];
- un.valueBuffer[1] = buffer[1];
- un.valueBuffer[2] = buffer[2];
- un.valueBuffer[3] = buffer[3];
- return un.value;
-}
-
-// unpacks a buffer into an unsigned short
-inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
- union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
- un.valueBuffer[0] = buffer[0];
- un.valueBuffer[1] = buffer[1];
- return un.value;
-}
-
-// allows sorting/searching of a vector of pairs (instead of using maps)
-template <typename Key, typename Value>
-class LookupKeyCompare {
+ typedef std::pair< Key, Value > LookupData;
+ typedef typename LookupData::first_type Key_t;
+
+ public:
+ bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
+ bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }
+ bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }
+ private:
+ bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
+ };
- typedef pair< Key, Value > LookupData;
- typedef typename LookupData::first_type Key_t;
+ // return index of item if found, else return container.size()
+ template < typename InputIterator, typename EqualityComparable >
+ typename std::iterator_traits<InputIterator>::difference_type
+ Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
+ return std::distance(begin, std::find(begin, end, item));
+ }
+ //! \endcond
- public:
- bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
- bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }
- bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }
- private:
- bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
-};
-
-// return index of item if found, else return container.size()
-template < typename InputIterator, typename EqualityComparable >
-typename iterator_traits<InputIterator>::difference_type
-Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
- return distance(begin, find(begin, end, item));
}
-