]> git.donarmstrong.com Git - bamtools.git/blob - BamReader.h
Major overhaul of BamReader. No longer relying on bgzf.* API. Sped up random-access...
[bamtools.git] / BamReader.h
1 // ***************************************************************************
2 // BamReader (c) 2009 Derek Barnett
3 // Marth Lab, Deptartment of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Provides the basic functionality for reading BAM files
7 // ***************************************************************************
8
9 #pragma once
10
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <zlib.h>
15
16 #include <algorithm>
17 #include <string>
18 #include <utility>
19 #include <vector>
20 using namespace std;
21
22 #include "BamAlignment.h"
23
24 // our zlib constants
25 #define GZIP_ID1             31
26 #define GZIP_ID2            139
27 #define CM_DEFLATE            8
28 #define FLG_FEXTRA            4
29 #define OS_UNKNOWN          255
30 #define BGZF_XLEN             6
31 #define BGZF_ID1             66
32 #define BGZF_ID2             67
33 #define BGZF_LEN              2
34 #define GZIP_WINDOW_BITS    -15
35 #define Z_DEFAULT_MEM_LEVEL   8
36
37 // our BZGF constants
38 #define BLOCK_HEADER_LENGTH    18
39 #define BLOCK_FOOTER_LENGTH     8
40 #define MAX_BLOCK_SIZE      65536
41 #define DEFAULT_BLOCK_SIZE  65536
42
43 // our BAM constants
44 #define BAM_CORE_SIZE  32
45 #define BAM_CMATCH      0
46 #define BAM_CINS        1
47 #define BAM_CDEL        2
48 #define BAM_CREF_SKIP   3
49 #define BAM_CSOFT_CLIP  4
50 #define BAM_CHARD_CLIP  5
51 #define BAM_CPAD        6
52 #define BAM_CIGAR_SHIFT 4
53 #define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
54
55 // BAM indexing constants
56 #define MAX_BIN           37450 // =(8^6-1)/7+1
57 #define BAM_MIN_CHUNK_GAP 32768 
58 #define BAM_LIDX_SHIFT    14
59
60 // our variable sizes
61 #define SIZEOF_INT 4
62
63 // define our BZGF structure
64 #ifndef BGZF_DATA
65 #define BGZF_DATA
66 struct BgzfData {
67         unsigned int UncompressedBlockSize;
68         unsigned int CompressedBlockSize;
69         unsigned int BlockLength;
70         unsigned int BlockOffset;
71         uint64_t BlockAddress;
72         bool IsOpen;
73         FILE* Stream;
74         char* UncompressedBlock;
75         char* CompressedBlock;
76
77         // constructor
78         BgzfData(void)
79                 : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
80                 , CompressedBlockSize(MAX_BLOCK_SIZE)
81                 , BlockLength(0)
82                 , BlockOffset(0)
83                 , BlockAddress(0)
84                 , IsOpen(false)
85                 , Stream(NULL)
86                 , UncompressedBlock(NULL)
87                 , CompressedBlock(NULL)
88         {
89                 try {
90                         CompressedBlock   = new char[CompressedBlockSize];
91                         UncompressedBlock = new char[UncompressedBlockSize];
92                 } catch(bad_alloc&) {
93                         printf("ERROR: Unable to allocate memory for our BGZF object.\n");
94                         exit(1);
95                 }
96         }
97
98         // destructor
99         ~BgzfData(void) {
100                 if(CompressedBlock)   delete [] CompressedBlock;
101                 if(UncompressedBlock) delete [] UncompressedBlock;
102         }
103 };
104 #endif // BGZF_DATA
105
106
107 // --------------------------- //
108 // BamIndex-related typedefs
109 // --------------------------- //
110
111 // offset for linear indexing
112 typedef vector<uint64_t> LinearOffsetVector;
113
114 // alignment 'chunk' boundaries
115 typedef pair<uint64_t, uint64_t> ChunkPair;
116 typedef vector<ChunkPair> ChunkVector;
117
118 // BAM bins.. each contains (binID, alignment 'chunks')
119 typedef pair<uint32_t, ChunkVector*> BamBin;
120 typedef vector<BamBin> BinVector;
121
122 // each reference sequence has a BinVector and LinearOffsetVector
123 typedef pair<BinVector*, LinearOffsetVector*> RefIndex;
124
125 // full BamIndex defined as: 
126 typedef vector<RefIndex*> BamIndex;
127
128 class BamReader {
129
130         // constructor/destructor
131         public:
132                 BamReader(void);
133                 ~BamReader(void);
134
135         // public interface
136         public:
137                 // closes the BAM file
138                 void Close(void);
139                 // retrieves header text
140                 const string GetHeaderText(void) const;
141                 // saves the alignment to the alignment archive
142                 bool GetNextAlignment(BamAlignment& bAlignment);
143                 // return number of reference sequences in BAM file
144                 const int GetReferenceCount(void) const;
145                 // return vector of RefData entries
146                 const RefVector GetReferenceData(void) const;
147                 // get refID from reference name
148                 const int GetReferenceID(const string& refName) const;
149                 // jumps to 'left' position on refID
150                 bool Jump(int refID, unsigned int left = 0);
151                 // opens the BAM file
152                 void Open(const string& filename, const string& indexFilename = "");
153                 // move file pointer back to first alignment
154                 bool Rewind(void);
155
156         // internal methods
157         private:
158                 // checks BGZF block header
159                 bool BgzfCheckBlockHeader(char* header);
160                 // closes the BAM file
161                 void BgzfClose(void);
162                 // de-compresses the current block
163                 int BgzfInflateBlock(int blockLength);
164                 // opens the BAM file for reading
165                 void BgzfOpen(const string& filename);
166                 // reads BGZF data into a byte buffer
167                 unsigned int BgzfRead(char* data, const unsigned int dataLen);
168                 // reads BGZF block
169                 int BgzfReadBlock(void);
170                 // seek to position in BAM file
171                 bool BgzfSeek(int64_t position);
172                 // get file position in BAM file
173                 int64_t BgzfTell(void);
174                 // unpacks a buffer into an unsigned int
175                 static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
176                 // unpacks a buffer into an unsigned short
177                 static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
178                 //      calculate bins that overlap region ( left to reference end for now )
179                 int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
180                 // calculates alignment end position based on starting position and provided CIGAR operations
181                 unsigned int CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData);
182                 // clear out (delete pointers in) index data structure
183                 void ClearIndex(void);
184                 // calculate file offset for first alignment chunk overlapping 'left'
185                 int64_t GetOffset(int refID, unsigned int left);
186                 // checks to see if alignment overlaps current region
187                 bool IsOverlap(BamAlignment& bAlignment);
188                 // retrieves header text from BAM file
189                 void LoadHeaderData(void);
190                 // builds BamIndex data structure from BAM index file
191                 void LoadIndexData(FILE* indexStream);
192                 // retrieves BAM alignment under file pointer
193                 bool LoadNextAlignment(BamAlignment& bAlignment);
194                 // builds reference data structure from BAM file
195                 void LoadReferenceData(void);
196                 // open BAM index file (if successful, loads index)
197                 void OpenIndex(const string& indexFilename);
198         
199         // aligment file & index file data
200         private:
201                 BgzfData  m_BGZF;
202                 string    m_headerText;
203                 BamIndex* m_index;
204                 RefVector m_references;
205                 bool      m_isIndexLoaded;
206                 int64_t   m_alignmentsBeginOffset;
207
208         // user-specified region values
209         private:
210                 bool         m_isRegionSpecified;
211                 int          m_currentRefID;
212                 unsigned int m_currentLeft;
213
214         // BAM character constants
215         private:
216                 static const char* DNA_LOOKUP;
217                 static const char* CIGAR_LOOKUP;
218 };
219
220 // unpacks a buffer into an unsigned int
221 inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
222         union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
223         un.valueBuffer[0] = buffer[0];
224         un.valueBuffer[1] = buffer[1];
225         un.valueBuffer[2] = buffer[2];
226         un.valueBuffer[3] = buffer[3];
227         return un.value;
228 }
229
230 // unpacks a buffer into an unsigned short
231 inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
232         union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
233         un.valueBuffer[0] = buffer[0];
234         un.valueBuffer[1] = buffer[1];
235         return un.value;
236 }
237
238 // allows sorting/searching of a vector of pairs (instead of using maps)
239 template <typename Key, typename Value>
240 class LookupKeyCompare {
241
242         typedef pair< Key, Value > LookupData;
243         typedef typename LookupData::first_type Key_t;
244         
245         public:
246                 bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
247                 bool operator() (const LookupData& lhs, const Key_t& k) const        { return keyLess(lhs.first, k); }
248                 bool operator() (const Key_t& k, const LookupData& rhs) const        { return keyLess(k, rhs.first); }
249         private:
250                 bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
251 };
252
253 // return index of item if found, else return container.size()
254 template < typename InputIterator, typename EqualityComparable >
255 typename iterator_traits<InputIterator>::difference_type
256 Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
257         return distance(begin, find(begin, end, item));
258 }
259