1 // ***************************************************************************
2 // BamReader.h (c) 2009 Derek Barnett, Michael Strömberg
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 24 June 2009 (DB)
7 // ---------------------------------------------------------------------------
8 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
10 // ---------------------------------------------------------------------------
11 // Provides the basic functionality for reading BAM files
12 // ***************************************************************************
16 \brief API for reading BAM files.
36 //! API for reading BAM files.
50 \brief Closes the BAM file.
52 Also closes index file and clears index data, if provided.
59 \brief Access SAM format header data.
61 See SAM format documentation for detailed header description.
63 \return Full header text (no parsing of tags)
65 const std::string GetHeaderText(void) const;
68 \brief Retrieve next alignment.
70 Stores result in bAlignment.
72 If reference and position are specified by a prior call to Jump(), this method stores the next aligmment that either:
74 b) begins on/after that specified position.
76 Otherwise this simply stores next alignment, if one exists.
78 Note that this method does not specifiy a 'right bound' position.
79 If a range is desired, you should supply some stopping criteria. For example:
83 bReader.Open(bamFile, bamIndexfile);
84 if ( bReader.Jump( someID, somePosition ) {
85 BamAlignment bAlignment;
86 while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= someUpperBound) ) {
92 \param bAlignment destination for alignment data
93 \return success/failure
96 bool GetNextAlignment(BamAlignment& bAlignment);
99 \brief Get number of reference sequences in BAM file.
100 \return Number of references
101 \sa GetReferenceData(), GetReferenceID()
103 const int GetReferenceCount(void) const;
106 \brief Access reference data.
107 \return Vector of RefData entry
108 \sa GetReferenceCount(), GetReferenceID()
110 const RefVector GetReferenceData(void) const;
113 \brief Get reference ID from name.
114 \param refName name of reference sequence
115 \return reference ID number
116 \sa GetReferenceCount(), GetReferenceData()
118 const int GetReferenceID(const std::string& refName) const;
121 \brief Random access in BAM file.
123 Jump to a specified position on reference sequence.
124 Position is optional - defaults to beginning of specified reference.
126 Reference and position are stored for use by subsequent calls to GetNextAlignment().
128 \param refID ID number of desired reference
129 \param position left-bound position
130 \return success/failure
131 \sa GetNextAlignment(), Rewind()
133 bool Jump(int refID, unsigned int position = 0);
136 \brief Opens a BAM file.
138 Index file is optional - sequential reading through a BAM file does not require an index.
140 However, the index is required to perform random-access alignment retrival (using the Jump() method ).
142 See SAMtools documentation for BAM index generation.
144 \param filename BAM file
145 \param indexFilename BAM index file
148 void Open(const std::string& filename, const std::string& indexFilename = "");
151 \brief Moves file pointer to beginning of alignment data.
153 A subsequent call to GetNextAlignment() would retrieve the first alignment in the BAM file.
154 Clears any reference and position set by a prior call to Jump()
156 \return success/failure
157 \sa GetNextAlignment(), Jump()
161 // --------------------------------------------------------------------------------------
164 // checks BGZF block header
165 bool BgzfCheckBlockHeader(char* header);
166 // closes the BAM file
167 void BgzfClose(void);
168 // de-compresses the current block
169 int BgzfInflateBlock(int blockLength);
170 // opens the BAM file for reading
171 void BgzfOpen(const std::string& filename);
172 // reads BGZF data into a byte buffer
173 unsigned int BgzfRead(char* data, const unsigned int dataLen);
175 int BgzfReadBlock(void);
176 // seek to position in BAM file
177 bool BgzfSeek(int64_t position);
178 // get file position in BAM file
179 int64_t BgzfTell(void);
180 // unpacks a buffer into an unsigned int
181 static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
182 // unpacks a buffer into an unsigned short
183 static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
184 // calculate bins that overlap region ( left to reference end for now )
185 int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
186 // calculates alignment end position based on starting position and provided CIGAR operations
187 unsigned int CalculateAlignmentEnd(const unsigned int& position, const std::vector<CigarOp>& cigarData);
188 // clear out (delete pointers in) index data structure
189 void ClearIndex(void);
190 // calculate file offset for first alignment chunk overlapping 'left'
191 int64_t GetOffset(int refID, unsigned int left);
192 // checks to see if alignment overlaps current region
193 bool IsOverlap(BamAlignment& bAlignment);
194 // retrieves header text from BAM file
195 void LoadHeaderData(void);
196 // builds BamIndex data structure from BAM index file
197 void LoadIndexData(FILE* indexStream);
198 // retrieves BAM alignment under file pointer
199 bool LoadNextAlignment(BamAlignment& bAlignment);
200 // builds reference data structure from BAM file
201 void LoadReferenceData(void);
202 // open BAM index file (if successful, loads index)
203 void OpenIndex(const std::string& indexFilename);
205 // aligment file & index file data
208 std::string m_headerText;
210 RefVector m_references;
211 bool m_isIndexLoaded;
212 int64_t m_alignmentsBeginOffset;
214 // user-specified region values
216 bool m_isRegionSpecified;
218 unsigned int m_currentLeft;
220 // BAM character constants
222 static const char* DNA_LOOKUP;
223 static const char* CIGAR_LOOKUP;
227 // --------------------------------------------------------------------------------------
228 // static inline methods (internal - can exclude from main documentation)
230 // unpacks a buffer into an unsigned int
231 inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
232 union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
233 un.valueBuffer[0] = buffer[0];
234 un.valueBuffer[1] = buffer[1];
235 un.valueBuffer[2] = buffer[2];
236 un.valueBuffer[3] = buffer[3];
240 // unpacks a buffer into an unsigned short
241 inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
242 union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
243 un.valueBuffer[0] = buffer[0];
244 un.valueBuffer[1] = buffer[1];
248 // --------------------------------------------------------------------------------------
249 // template classes/methods (internal - can exclude from main documentation)
251 // allows sorting/searching of a vector of pairs (instead of using maps)
252 template <typename Key, typename Value>
253 class LookupKeyCompare {
255 typedef std::pair< Key, Value > LookupData;
256 typedef typename LookupData::first_type Key_t;
259 bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
260 bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }
261 bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }
263 bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
266 // return index of item if found, else return container.size()
267 template < typename InputIterator, typename EqualityComparable >
268 typename std::iterator_traits<InputIterator>::difference_type
269 Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
270 return std::distance(begin, std::find(begin, end, item));