]> git.donarmstrong.com Git - bamtools.git/blob - BamReader.h
Full update to SVN after combining BamReader and BamWriter into cohesive BamTools...
[bamtools.git] / BamReader.h
1 // ***************************************************************************
2 // BamReader.h (c) 2009 Derek Barnett, Michael Strömberg
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 24 June 2009 (DB)
7 // ---------------------------------------------------------------------------
8 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
9 // Institute.
10 // ---------------------------------------------------------------------------
11 // Provides the basic functionality for reading BAM files
12 // ***************************************************************************
13
14 /*! 
15         \file BamReader.h
16         \brief API for reading BAM files.
17 */
18
19 #pragma once
20
21 // C++ includes
22 #include <algorithm>
23 #include <iterator>
24 #include <string>
25 #include <utility>
26 #include <vector>
27
28 // zlib includes
29 #include <zlib.h>
30
31 // BamTools includes
32 #include "BamAux.h"
33
34 namespace BamTools { 
35
36         //! API for reading BAM files.
37         class BamReader {
38                 
39                 public:
40                         
41                         //! Constructor
42                         BamReader(void);
43                         
44                         //! Destructor
45                         ~BamReader(void);
46                 
47                 public:
48                         
49                         /*! 
50                                 \brief Closes the BAM file.
51                                 
52                                 Also closes index file and clears index data, if provided.
53                                 
54                                 \sa Open()
55                         */
56                         void Close(void);
57                         
58                         /*! 
59                                 \brief Access SAM format header data.
60                                 
61                                 See SAM format documentation for detailed header description.
62                                 
63                                 \return Full header text (no parsing of tags)
64                         */
65                         const std::string GetHeaderText(void) const;
66                         
67                         /*! 
68                                 \brief Retrieve next alignment.
69                                 
70                                 Stores result in bAlignment.  
71                                 
72                                 If reference and position are specified by a prior call to Jump(), this method stores the next aligmment that either: 
73                                 a) overlaps, or 
74                                 b) begins on/after that specified position.
75                                 
76                                 Otherwise this simply stores next alignment, if one exists.
77                                 
78                                 Note that this method does not specifiy a 'right bound' position.  
79                                 If a range is desired, you should supply some stopping criteria. For example:
80                                 
81                                 \code
82                                 BamReader bReader;
83                                 bReader.Open(bamFile, bamIndexfile);
84                                 if ( bReader.Jump( someID, somePosition ) {
85                                         BamAlignment bAlignment;
86                                         while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= someUpperBound) ) {
87                                                 // do something
88                                         }
89                                 }
90                                 \endcode
91                                 
92                                 \param bAlignment destination for alignment data
93                                 \return success/failure
94                                 \sa Jump(), Rewind()
95                         */
96                         bool GetNextAlignment(BamAlignment& bAlignment);
97                         
98                         /*! 
99                                 \brief Get number of reference sequences in BAM file.
100                                 \return Number of references
101                                 \sa GetReferenceData(), GetReferenceID()
102                         */
103                         const int GetReferenceCount(void) const;
104                         
105                         /*! 
106                                 \brief Access reference data.
107                                 \return Vector of RefData entry
108                                 \sa GetReferenceCount(), GetReferenceID()
109                         */
110                         const RefVector GetReferenceData(void) const;
111                         
112                         /*! 
113                                 \brief Get reference ID from name.
114                                 \param refName name of reference sequence
115                                 \return reference ID number
116                                 \sa GetReferenceCount(), GetReferenceData()
117                         */
118                         const int GetReferenceID(const std::string& refName) const;
119                         
120                         /*! 
121                                 \brief Random access in BAM file.
122                                 
123                                 Jump to a specified position on reference sequence. 
124                                 Position is optional - defaults to beginning of specified reference.
125                                 
126                                 Reference and position are stored for use by subsequent calls to GetNextAlignment().
127                                 
128                                 \param refID ID number of desired reference
129                                 \param position left-bound position  
130                                 \return success/failure
131                                 \sa GetNextAlignment(), Rewind()
132                         */
133                         bool Jump(int refID, unsigned int position = 0);
134                         
135                         /*! 
136                                 \brief Opens a BAM file.
137                                 
138                                 Index file is optional - sequential reading through a BAM file does not require an index.
139                                 
140                                 However, the index is required to perform random-access alignment retrival (using the Jump() method ).  
141                                 
142                                 See SAMtools documentation for BAM index generation.
143                                 
144                                 \param filename BAM file
145                                 \param indexFilename BAM index file
146                                 \sa Jump(), Close()
147                         */
148                         void Open(const std::string& filename, const std::string& indexFilename = "");
149                         
150                         /*!
151                                 \brief Moves file pointer to beginning of alignment data.
152                                 
153                                 A subsequent call to GetNextAlignment() would retrieve the first alignment in the BAM file.
154                                 Clears any reference and position set by a prior call to Jump()
155                                 
156                                 \return success/failure
157                                 \sa GetNextAlignment(), Jump()
158                         */
159                         bool Rewind(void);
160                 
161                 // --------------------------------------------------------------------------------------
162                 // internal methods
163                 private:
164                         // checks BGZF block header
165                         bool BgzfCheckBlockHeader(char* header);
166                         // closes the BAM file
167                         void BgzfClose(void);
168                         // de-compresses the current block
169                         int BgzfInflateBlock(int blockLength);
170                         // opens the BAM file for reading
171                         void BgzfOpen(const std::string& filename);
172                         // reads BGZF data into a byte buffer
173                         unsigned int BgzfRead(char* data, const unsigned int dataLen);
174                         // reads BGZF block
175                         int BgzfReadBlock(void);
176                         // seek to position in BAM file
177                         bool BgzfSeek(int64_t position);
178                         // get file position in BAM file
179                         int64_t BgzfTell(void);
180                         // unpacks a buffer into an unsigned int
181                         static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
182                         // unpacks a buffer into an unsigned short
183                         static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
184                         // calculate bins that overlap region ( left to reference end for now )
185                         int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
186                         // calculates alignment end position based on starting position and provided CIGAR operations
187                         unsigned int CalculateAlignmentEnd(const unsigned int& position, const std::vector<CigarOp>& cigarData);
188                         // clear out (delete pointers in) index data structure
189                         void ClearIndex(void);
190                         // calculate file offset for first alignment chunk overlapping 'left'
191                         int64_t GetOffset(int refID, unsigned int left);
192                         // checks to see if alignment overlaps current region
193                         bool IsOverlap(BamAlignment& bAlignment);
194                         // retrieves header text from BAM file
195                         void LoadHeaderData(void);
196                         // builds BamIndex data structure from BAM index file
197                         void LoadIndexData(FILE* indexStream);
198                         // retrieves BAM alignment under file pointer
199                         bool LoadNextAlignment(BamAlignment& bAlignment);
200                         // builds reference data structure from BAM file
201                         void LoadReferenceData(void);
202                         // open BAM index file (if successful, loads index)
203                         void OpenIndex(const std::string& indexFilename);
204                 
205                 // aligment file & index file data
206                 private:
207                         BgzfData    m_BGZF;
208                         std::string m_headerText;
209                         BamIndex*   m_index;
210                         RefVector   m_references;
211                         bool        m_isIndexLoaded;
212                         int64_t     m_alignmentsBeginOffset;
213                 
214                 // user-specified region values
215                 private:
216                         bool         m_isRegionSpecified;
217                         int          m_currentRefID;
218                         unsigned int m_currentLeft;
219                 
220                 // BAM character constants
221                 private:
222                         static const char* DNA_LOOKUP;
223                         static const char* CIGAR_LOOKUP;
224         };
225
226         //! \cond
227         // --------------------------------------------------------------------------------------
228         // static inline methods (internal - can exclude from main documentation)
229         
230         // unpacks a buffer into an unsigned int
231         inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
232                 union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
233                 un.valueBuffer[0] = buffer[0];
234                 un.valueBuffer[1] = buffer[1];
235                 un.valueBuffer[2] = buffer[2];
236                 un.valueBuffer[3] = buffer[3];
237                 return un.value;
238         }
239
240         // unpacks a buffer into an unsigned short
241         inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
242                 union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
243                 un.valueBuffer[0] = buffer[0];
244                 un.valueBuffer[1] = buffer[1];
245                 return un.value;
246         }
247
248         // --------------------------------------------------------------------------------------
249         // template classes/methods (internal - can exclude from main documentation)
250         
251         // allows sorting/searching of a vector of pairs (instead of using maps)
252         template <typename Key, typename Value>
253         class LookupKeyCompare {
254
255                 typedef std::pair< Key, Value > LookupData;
256                 typedef typename LookupData::first_type Key_t;
257                 
258                 public:
259                         bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
260                         bool operator() (const LookupData& lhs, const Key_t& k) const        { return keyLess(lhs.first, k); }
261                         bool operator() (const Key_t& k, const LookupData& rhs) const        { return keyLess(k, rhs.first); }
262                 private:
263                         bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
264         };
265
266         // return index of item if found, else return container.size()
267         template < typename InputIterator, typename EqualityComparable >
268         typename std::iterator_traits<InputIterator>::difference_type
269         Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
270                 return std::distance(begin, std::find(begin, end, item));
271         }
272         //! \endcond
273         
274 }