]> git.donarmstrong.com Git - bamtools.git/blobdiff - BamReader.h
Full update to SVN after combining BamReader and BamWriter into cohesive BamTools...
[bamtools.git] / BamReader.h
index 17cb86ab5ad09421230cb6cf34741342082562cb..d829891e45dc5a11f8b83d61a2e2403330e86be1 100644 (file)
 // ***************************************************************************
-// BamReader (c) 2009 Derek Barnett
-// Marth Lab, Deptartment of Biology, Boston College
+// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
+// Last modified: 24 June 2009 (DB)
+// ---------------------------------------------------------------------------
+// The BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
 // Provides the basic functionality for reading BAM files
 // ***************************************************************************
 
-#pragma once
+/*! 
+       \file BamReader.h
+       \brief API for reading BAM files.
+*/
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <zlib.h>
+#pragma once
 
+// C++ includes
 #include <algorithm>
+#include <iterator>
 #include <string>
 #include <utility>
 #include <vector>
-using namespace std;
-
-#include "BamAlignment.h"
-
-// our zlib constants
-#define GZIP_ID1             31
-#define GZIP_ID2            139
-#define CM_DEFLATE            8
-#define FLG_FEXTRA            4
-#define OS_UNKNOWN          255
-#define BGZF_XLEN             6
-#define BGZF_ID1             66
-#define BGZF_ID2             67
-#define BGZF_LEN              2
-#define GZIP_WINDOW_BITS    -15
-#define Z_DEFAULT_MEM_LEVEL   8
-
-// our BZGF constants
-#define BLOCK_HEADER_LENGTH    18
-#define BLOCK_FOOTER_LENGTH     8
-#define MAX_BLOCK_SIZE      65536
-#define DEFAULT_BLOCK_SIZE  65536
 
-// our BAM constants
-#define BAM_CORE_SIZE  32
-#define BAM_CMATCH      0
-#define BAM_CINS        1
-#define BAM_CDEL        2
-#define BAM_CREF_SKIP   3
-#define BAM_CSOFT_CLIP  4
-#define BAM_CHARD_CLIP  5
-#define BAM_CPAD        6
-#define BAM_CIGAR_SHIFT 4
-#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
+// zlib includes
+#include <zlib.h>
 
-// BAM indexing constants
-#define MAX_BIN           37450 // =(8^6-1)/7+1
-#define BAM_MIN_CHUNK_GAP 32768 
-#define BAM_LIDX_SHIFT    14
+// BamTools includes
+#include "BamAux.h"
 
-// our variable sizes
-#define SIZEOF_INT 4
+namespace BamTools { 
 
-// define our BZGF structure
-#ifndef BGZF_DATA
-#define BGZF_DATA
-struct BgzfData {
-       unsigned int UncompressedBlockSize;
-       unsigned int CompressedBlockSize;
-       unsigned int BlockLength;
-       unsigned int BlockOffset;
-       uint64_t BlockAddress;
-       bool IsOpen;
-       FILE* Stream;
-       char* UncompressedBlock;
-       char* CompressedBlock;
+       //! API for reading BAM files.
+       class BamReader {
+               
+               public:
+                       
+                       //! Constructor
+                       BamReader(void);
+                       
+                       //! Destructor
+                       ~BamReader(void);
+               
+               public:
+                       
+                       /*! 
+                               \brief Closes the BAM file.
+                               
+                               Also closes index file and clears index data, if provided.
+                               
+                               \sa Open()
+                       */
+                       void Close(void);
+                       
+                       /*! 
+                               \brief Access SAM format header data.
+                               
+                               See SAM format documentation for detailed header description.
+                               
+                               \return Full header text (no parsing of tags)
+                       */
+                       const std::string GetHeaderText(void) const;
+                       
+                       /*! 
+                               \brief Retrieve next alignment.
+                               
+                               Stores result in bAlignment.  
+                               
+                               If reference and position are specified by a prior call to Jump(), this method stores the next aligmment that either: 
+                               a) overlaps, or 
+                               b) begins on/after that specified position.
+                               
+                               Otherwise this simply stores next alignment, if one exists.
+                               
+                               Note that this method does not specifiy a 'right bound' position.  
+                               If a range is desired, you should supply some stopping criteria. For example:
+                               
+                               \code
+                               BamReader bReader;
+                               bReader.Open(bamFile, bamIndexfile);
+                               if ( bReader.Jump( someID, somePosition ) {
+                                       BamAlignment bAlignment;
+                                       while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= someUpperBound) ) {
+                                               // do something
+                                       }
+                               }
+                               \endcode
+                               
+                               \param bAlignment destination for alignment data
+                               \return success/failure
+                               \sa Jump(), Rewind()
+                       */
+                       bool GetNextAlignment(BamAlignment& bAlignment);
+                       
+                       /*! 
+                               \brief Get number of reference sequences in BAM file.
+                               \return Number of references
+                               \sa GetReferenceData(), GetReferenceID()
+                       */
+                       const int GetReferenceCount(void) const;
+                       
+                       /*! 
+                               \brief Access reference data.
+                               \return Vector of RefData entry
+                               \sa GetReferenceCount(), GetReferenceID()
+                       */
+                       const RefVector GetReferenceData(void) const;
+                       
+                       /*! 
+                               \brief Get reference ID from name.
+                               \param refName name of reference sequence
+                               \return reference ID number
+                               \sa GetReferenceCount(), GetReferenceData()
+                       */
+                       const int GetReferenceID(const std::string& refName) const;
+                       
+                       /*! 
+                               \brief Random access in BAM file.
+                               
+                               Jump to a specified position on reference sequence. 
+                               Position is optional - defaults to beginning of specified reference.
+                               
+                               Reference and position are stored for use by subsequent calls to GetNextAlignment().
+                               
+                               \param refID ID number of desired reference
+                               \param position left-bound position  
+                               \return success/failure
+                               \sa GetNextAlignment(), Rewind()
+                       */
+                       bool Jump(int refID, unsigned int position = 0);
+                       
+                       /*! 
+                               \brief Opens a BAM file.
+                               
+                               Index file is optional - sequential reading through a BAM file does not require an index.
+                               
+                               However, the index is required to perform random-access alignment retrival (using the Jump() method ).  
+                               
+                               See SAMtools documentation for BAM index generation.
+                               
+                               \param filename BAM file
+                               \param indexFilename BAM index file
+                               \sa Jump(), Close()
+                       */
+                       void Open(const std::string& filename, const std::string& indexFilename = "");
+                       
+                       /*!
+                               \brief Moves file pointer to beginning of alignment data.
+                               
+                               A subsequent call to GetNextAlignment() would retrieve the first alignment in the BAM file.
+                               Clears any reference and position set by a prior call to Jump()
+                               
+                               \return success/failure
+                               \sa GetNextAlignment(), Jump()
+                       */
+                       bool Rewind(void);
+               
+               // --------------------------------------------------------------------------------------
+               // internal methods
+               private:
+                       // checks BGZF block header
+                       bool BgzfCheckBlockHeader(char* header);
+                       // closes the BAM file
+                       void BgzfClose(void);
+                       // de-compresses the current block
+                       int BgzfInflateBlock(int blockLength);
+                       // opens the BAM file for reading
+                       void BgzfOpen(const std::string& filename);
+                       // reads BGZF data into a byte buffer
+                       unsigned int BgzfRead(char* data, const unsigned int dataLen);
+                       // reads BGZF block
+                       int BgzfReadBlock(void);
+                       // seek to position in BAM file
+                       bool BgzfSeek(int64_t position);
+                       // get file position in BAM file
+                       int64_t BgzfTell(void);
+                       // unpacks a buffer into an unsigned int
+                       static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
+                       // unpacks a buffer into an unsigned short
+                       static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
+                       // calculate bins that overlap region ( left to reference end for now )
+                       int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
+                       // calculates alignment end position based on starting position and provided CIGAR operations
+                       unsigned int CalculateAlignmentEnd(const unsigned int& position, const std::vector<CigarOp>& cigarData);
+                       // clear out (delete pointers in) index data structure
+                       void ClearIndex(void);
+                       // calculate file offset for first alignment chunk overlapping 'left'
+                       int64_t GetOffset(int refID, unsigned int left);
+                       // checks to see if alignment overlaps current region
+                       bool IsOverlap(BamAlignment& bAlignment);
+                       // retrieves header text from BAM file
+                       void LoadHeaderData(void);
+                       // builds BamIndex data structure from BAM index file
+                       void LoadIndexData(FILE* indexStream);
+                       // retrieves BAM alignment under file pointer
+                       bool LoadNextAlignment(BamAlignment& bAlignment);
+                       // builds reference data structure from BAM file
+                       void LoadReferenceData(void);
+                       // open BAM index file (if successful, loads index)
+                       void OpenIndex(const std::string& indexFilename);
+               
+               // aligment file & index file data
+               private:
+                       BgzfData    m_BGZF;
+                       std::string m_headerText;
+                       BamIndex*   m_index;
+                       RefVector   m_references;
+                       bool        m_isIndexLoaded;
+                       int64_t     m_alignmentsBeginOffset;
+               
+               // user-specified region values
+               private:
+                       bool         m_isRegionSpecified;
+                       int          m_currentRefID;
+                       unsigned int m_currentLeft;
+               
+               // BAM character constants
+               private:
+                       static const char* DNA_LOOKUP;
+                       static const char* CIGAR_LOOKUP;
+       };
 
-       // constructor
-       BgzfData(void)
-               : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
-               , CompressedBlockSize(MAX_BLOCK_SIZE)
-               , BlockLength(0)
-               , BlockOffset(0)
-               , BlockAddress(0)
-               , IsOpen(false)
-               , Stream(NULL)
-               , UncompressedBlock(NULL)
-               , CompressedBlock(NULL)
-       {
-               try {
-                       CompressedBlock   = new char[CompressedBlockSize];
-                       UncompressedBlock = new char[UncompressedBlockSize];
-               } catch(bad_alloc&) {
-                       printf("ERROR: Unable to allocate memory for our BGZF object.\n");
-                       exit(1);
-               }
+       //! \cond
+       // --------------------------------------------------------------------------------------
+       // static inline methods (internal - can exclude from main documentation)
+       
+       // unpacks a buffer into an unsigned int
+       inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
+               union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
+               un.valueBuffer[0] = buffer[0];
+               un.valueBuffer[1] = buffer[1];
+               un.valueBuffer[2] = buffer[2];
+               un.valueBuffer[3] = buffer[3];
+               return un.value;
        }
 
-       // destructor
-       ~BgzfData(void) {
-               if(CompressedBlock)   delete [] CompressedBlock;
-               if(UncompressedBlock) delete [] UncompressedBlock;
+       // unpacks a buffer into an unsigned short
+       inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
+               union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
+               un.valueBuffer[0] = buffer[0];
+               un.valueBuffer[1] = buffer[1];
+               return un.value;
        }
-};
-#endif // BGZF_DATA
-
-
-// --------------------------- //
-// BamIndex-related typedefs
-// --------------------------- //
-
-// offset for linear indexing
-typedef vector<uint64_t> LinearOffsetVector;
-
-// alignment 'chunk' boundaries
-typedef pair<uint64_t, uint64_t> ChunkPair;
-typedef vector<ChunkPair> ChunkVector;
-
-// BAM bins.. each contains (binID, alignment 'chunks')
-typedef pair<uint32_t, ChunkVector*> BamBin;
-typedef vector<BamBin> BinVector;
 
-// each reference sequence has a BinVector and LinearOffsetVector
-typedef pair<BinVector*, LinearOffsetVector*> RefIndex;
-
-// full BamIndex defined as: 
-typedef vector<RefIndex*> BamIndex;
-
-class BamReader {
-
-       // constructor/destructor
-       public:
-               BamReader(void);
-               ~BamReader(void);
-
-       // public interface
-       public:
-               // closes the BAM file
-               void Close(void);
-               // retrieves header text
-               const string GetHeaderText(void) const;
-               // saves the alignment to the alignment archive
-               bool GetNextAlignment(BamAlignment& bAlignment);
-               // return number of reference sequences in BAM file
-               const int GetReferenceCount(void) const;
-               // return vector of RefData entries
-               const RefVector GetReferenceData(void) const;
-               // get refID from reference name
-               const int GetReferenceID(const string& refName) const;
-               // jumps to 'left' position on refID
-               bool Jump(int refID, unsigned int left = 0);
-               // opens the BAM file
-               void Open(const string& filename, const string& indexFilename = "");
-               // move file pointer back to first alignment
-               bool Rewind(void);
-
-       // internal methods
-       private:
-               // checks BGZF block header
-               bool BgzfCheckBlockHeader(char* header);
-               // closes the BAM file
-               void BgzfClose(void);
-               // de-compresses the current block
-               int BgzfInflateBlock(int blockLength);
-               // opens the BAM file for reading
-               void BgzfOpen(const string& filename);
-               // reads BGZF data into a byte buffer
-               unsigned int BgzfRead(char* data, const unsigned int dataLen);
-               // reads BGZF block
-               int BgzfReadBlock(void);
-               // seek to position in BAM file
-               bool BgzfSeek(int64_t position);
-               // get file position in BAM file
-               int64_t BgzfTell(void);
-               // unpacks a buffer into an unsigned int
-               static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
-               // unpacks a buffer into an unsigned short
-               static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
-               //      calculate bins that overlap region ( left to reference end for now )
-               int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
-               // calculates alignment end position based on starting position and provided CIGAR operations
-               unsigned int CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData);
-               // clear out (delete pointers in) index data structure
-               void ClearIndex(void);
-               // calculate file offset for first alignment chunk overlapping 'left'
-               int64_t GetOffset(int refID, unsigned int left);
-               // checks to see if alignment overlaps current region
-               bool IsOverlap(BamAlignment& bAlignment);
-               // retrieves header text from BAM file
-               void LoadHeaderData(void);
-               // builds BamIndex data structure from BAM index file
-               void LoadIndexData(FILE* indexStream);
-               // retrieves BAM alignment under file pointer
-               bool LoadNextAlignment(BamAlignment& bAlignment);
-               // builds reference data structure from BAM file
-               void LoadReferenceData(void);
-               // open BAM index file (if successful, loads index)
-               void OpenIndex(const string& indexFilename);
+       // --------------------------------------------------------------------------------------
+       // template classes/methods (internal - can exclude from main documentation)
        
-       // aligment file & index file data
-       private:
-               BgzfData  m_BGZF;
-               string    m_headerText;
-               BamIndex* m_index;
-               RefVector m_references;
-               bool      m_isIndexLoaded;
-               int64_t   m_alignmentsBeginOffset;
-
-       // user-specified region values
-       private:
-               bool         m_isRegionSpecified;
-               int          m_currentRefID;
-               unsigned int m_currentLeft;
-
-       // BAM character constants
-       private:
-               static const char* DNA_LOOKUP;
-               static const char* CIGAR_LOOKUP;
-};
+       // allows sorting/searching of a vector of pairs (instead of using maps)
+       template <typename Key, typename Value>
+       class LookupKeyCompare {
 
-// unpacks a buffer into an unsigned int
-inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
-       union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
-       un.valueBuffer[0] = buffer[0];
-       un.valueBuffer[1] = buffer[1];
-       un.valueBuffer[2] = buffer[2];
-       un.valueBuffer[3] = buffer[3];
-       return un.value;
-}
-
-// unpacks a buffer into an unsigned short
-inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
-       union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
-       un.valueBuffer[0] = buffer[0];
-       un.valueBuffer[1] = buffer[1];
-       return un.value;
-}
-
-// allows sorting/searching of a vector of pairs (instead of using maps)
-template <typename Key, typename Value>
-class LookupKeyCompare {
+               typedef std::pair< Key, Value > LookupData;
+               typedef typename LookupData::first_type Key_t;
+               
+               public:
+                       bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
+                       bool operator() (const LookupData& lhs, const Key_t& k) const        { return keyLess(lhs.first, k); }
+                       bool operator() (const Key_t& k, const LookupData& rhs) const        { return keyLess(k, rhs.first); }
+               private:
+                       bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
+       };
 
-       typedef pair< Key, Value > LookupData;
-       typedef typename LookupData::first_type Key_t;
+       // return index of item if found, else return container.size()
+       template < typename InputIterator, typename EqualityComparable >
+       typename std::iterator_traits<InputIterator>::difference_type
+       Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
+               return std::distance(begin, std::find(begin, end, item));
+       }
+       //! \endcond
        
-       public:
-               bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
-               bool operator() (const LookupData& lhs, const Key_t& k) const        { return keyLess(lhs.first, k); }
-               bool operator() (const Key_t& k, const LookupData& rhs) const        { return keyLess(k, rhs.first); }
-       private:
-               bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
-};
-
-// return index of item if found, else return container.size()
-template < typename InputIterator, typename EqualityComparable >
-typename iterator_traits<InputIterator>::difference_type
-Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
-       return distance(begin, find(begin, end, item));
 }
-