1 // ***************************************************************************
2 // BamReader (c) 2009 Derek Barnett
3 // Marth Lab, Deptartment of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Provides the basic functionality for reading BAM files
7 // ***************************************************************************
22 #include "BamAlignment.h"
29 #define OS_UNKNOWN 255
34 #define GZIP_WINDOW_BITS -15
35 #define Z_DEFAULT_MEM_LEVEL 8
38 #define BLOCK_HEADER_LENGTH 18
39 #define BLOCK_FOOTER_LENGTH 8
40 #define MAX_BLOCK_SIZE 65536
41 #define DEFAULT_BLOCK_SIZE 65536
44 #define BAM_CORE_SIZE 32
48 #define BAM_CREF_SKIP 3
49 #define BAM_CSOFT_CLIP 4
50 #define BAM_CHARD_CLIP 5
52 #define BAM_CIGAR_SHIFT 4
53 #define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
55 // BAM indexing constants
56 #define MAX_BIN 37450 // =(8^6-1)/7+1
57 #define BAM_MIN_CHUNK_GAP 32768
58 #define BAM_LIDX_SHIFT 14
63 // define our BZGF structure
67 unsigned int UncompressedBlockSize;
68 unsigned int CompressedBlockSize;
69 unsigned int BlockLength;
70 unsigned int BlockOffset;
71 uint64_t BlockAddress;
74 char* UncompressedBlock;
75 char* CompressedBlock;
79 : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
80 , CompressedBlockSize(MAX_BLOCK_SIZE)
86 , UncompressedBlock(NULL)
87 , CompressedBlock(NULL)
90 CompressedBlock = new char[CompressedBlockSize];
91 UncompressedBlock = new char[UncompressedBlockSize];
93 printf("ERROR: Unable to allocate memory for our BGZF object.\n");
100 if(CompressedBlock) delete [] CompressedBlock;
101 if(UncompressedBlock) delete [] UncompressedBlock;
107 // --------------------------- //
108 // BamIndex-related typedefs
109 // --------------------------- //
111 // offset for linear indexing
112 typedef vector<uint64_t> LinearOffsetVector;
114 // alignment 'chunk' boundaries
115 typedef pair<uint64_t, uint64_t> ChunkPair;
116 typedef vector<ChunkPair> ChunkVector;
118 // BAM bins.. each contains (binID, alignment 'chunks')
119 typedef pair<uint32_t, ChunkVector*> BamBin;
120 typedef vector<BamBin> BinVector;
122 // each reference sequence has a BinVector and LinearOffsetVector
123 typedef pair<BinVector*, LinearOffsetVector*> RefIndex;
125 // full BamIndex defined as:
126 typedef vector<RefIndex*> BamIndex;
130 // constructor/destructor
137 // closes the BAM file
139 // retrieves header text
140 const string GetHeaderText(void) const;
141 // saves the alignment to the alignment archive
142 bool GetNextAlignment(BamAlignment& bAlignment);
143 // return number of reference sequences in BAM file
144 const int GetReferenceCount(void) const;
145 // return vector of RefData entries
146 const RefVector GetReferenceData(void) const;
147 // get refID from reference name
148 const int GetReferenceID(const string& refName) const;
149 // jumps to 'left' position on refID
150 bool Jump(int refID, unsigned int left = 0);
151 // opens the BAM file
152 void Open(const string& filename, const string& indexFilename = "");
153 // move file pointer back to first alignment
158 // checks BGZF block header
159 bool BgzfCheckBlockHeader(char* header);
160 // closes the BAM file
161 void BgzfClose(void);
162 // de-compresses the current block
163 int BgzfInflateBlock(int blockLength);
164 // opens the BAM file for reading
165 void BgzfOpen(const string& filename);
166 // reads BGZF data into a byte buffer
167 unsigned int BgzfRead(char* data, const unsigned int dataLen);
169 int BgzfReadBlock(void);
170 // seek to position in BAM file
171 bool BgzfSeek(int64_t position);
172 // get file position in BAM file
173 int64_t BgzfTell(void);
174 // unpacks a buffer into an unsigned int
175 static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
176 // unpacks a buffer into an unsigned short
177 static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
178 // calculate bins that overlap region ( left to reference end for now )
179 int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
180 // calculates alignment end position based on starting position and provided CIGAR operations
181 unsigned int CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData);
182 // clear out (delete pointers in) index data structure
183 void ClearIndex(void);
184 // calculate file offset for first alignment chunk overlapping 'left'
185 int64_t GetOffset(int refID, unsigned int left);
186 // checks to see if alignment overlaps current region
187 bool IsOverlap(BamAlignment& bAlignment);
188 // retrieves header text from BAM file
189 void LoadHeaderData(void);
190 // builds BamIndex data structure from BAM index file
191 void LoadIndexData(FILE* indexStream);
192 // retrieves BAM alignment under file pointer
193 bool LoadNextAlignment(BamAlignment& bAlignment);
194 // builds reference data structure from BAM file
195 void LoadReferenceData(void);
196 // open BAM index file (if successful, loads index)
197 void OpenIndex(const string& indexFilename);
199 // aligment file & index file data
204 RefVector m_references;
205 bool m_isIndexLoaded;
206 int64_t m_alignmentsBeginOffset;
208 // user-specified region values
210 bool m_isRegionSpecified;
212 unsigned int m_currentLeft;
214 // BAM character constants
216 static const char* DNA_LOOKUP;
217 static const char* CIGAR_LOOKUP;
220 // unpacks a buffer into an unsigned int
221 inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
222 union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
223 un.valueBuffer[0] = buffer[0];
224 un.valueBuffer[1] = buffer[1];
225 un.valueBuffer[2] = buffer[2];
226 un.valueBuffer[3] = buffer[3];
230 // unpacks a buffer into an unsigned short
231 inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
232 union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
233 un.valueBuffer[0] = buffer[0];
234 un.valueBuffer[1] = buffer[1];
238 // allows sorting/searching of a vector of pairs (instead of using maps)
239 template <typename Key, typename Value>
240 class LookupKeyCompare {
242 typedef pair< Key, Value > LookupData;
243 typedef typename LookupData::first_type Key_t;
246 bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
247 bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }
248 bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }
250 bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
253 // return index of item if found, else return container.size()
254 template < typename InputIterator, typename EqualityComparable >
255 typename iterator_traits<InputIterator>::difference_type
256 Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
257 return distance(begin, find(begin, end, item));