5 Copyright (c) 2008 Genome Research Ltd (GRL).
\r
7 Permission is hereby granted, free of charge, to any person obtaining
\r
8 a copy of this software and associated documentation files (the
\r
9 "Software"), to deal in the Software without restriction, including
\r
10 without limitation the rights to use, copy, modify, merge, publish,
\r
11 distribute, sublicense, and/or sell copies of the Software, and to
\r
12 permit persons to whom the Software is furnished to do so, subject to
\r
13 the following conditions:
\r
15 The above copyright notice and this permission notice shall be
\r
16 included in all copies or substantial portions of the Software.
\r
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
\r
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
\r
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
\r
21 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
\r
22 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
\r
23 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
\r
24 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
\r
29 Implementation of BAM-parsing was translated to C++ directly from Heng Li's SAMtools package
\r
30 (thus the carryover of above MIT license)
\r
31 Contact: Derek Barnett <barnetde@bc.edu>
\r
35 // Marth Lab, Boston College
\r
36 // Last modified: 23 April 2009
\r
42 #include "BamAlignment.h"
\r
43 #include "STLUtilities.h"
\r
45 // C library includes
\r
52 typedef char int8_t;
\r
53 typedef unsigned char uint8_t;
\r
54 typedef short int16_t;
\r
55 typedef unsigned short uint16_t;
\r
56 typedef int int32_t;
\r
57 typedef unsigned int uint32_t;
\r
58 typedef long long int64_t;
\r
59 typedef unsigned long long uint64_t;
\r
64 // BGZF library includes/defines
\r
66 typedef BGZF* BamFile;
\r
67 #define bam_open(f_name, mode) bgzf_open(f_name, mode)
\r
68 #define bam_close(f_ptr) bgzf_close(f_ptr)
\r
69 #define bam_read(f_ptr, buf, size) bgzf_read(f_ptr, buf, size)
\r
70 #define bam_write(f_ptr, buf, size) bgzf_write(f_ptr, buf, size)
\r
71 #define bam_tell(f_ptr) bgzf_tell(f_ptr)
\r
72 #define bam_seek(f_ptr, pos, dir) bgzf_seek(f_ptr, pos, dir)
\r
74 // size of alignment data block in BAM file (bytes)
\r
75 #define BAM_CORE_SIZE 32
\r
77 // BAM indexing constants
\r
78 #define MAX_BIN 37450 // =(8^6-1)/7+1
\r
79 #define BAM_MIN_CHUNK_GAP 32768
\r
80 #define BAM_LIDX_SHIFT 14
\r
82 // CIGAR-retrieval mask/shift constants
\r
83 #define BAM_CIGAR_SHIFT 4
\r
84 #define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
\r
86 // CIGAR-operation types
\r
87 #define BAM_CMATCH 0
\r
90 #define BAM_CREF_SKIP 3
\r
91 #define BAM_CSOFT_CLIP 4
\r
92 #define BAM_CHARD_CLIP 5
\r
95 // --------------------------- //
\r
97 // --------------------------- //
\r
99 // --------------------------- //
\r
100 // BamIndex-related typedefs
\r
101 // --------------------------- //
\r
103 // offset for linear indexing
\r
104 typedef vector<uint64_t> LinearOffsetVector;
\r
106 // chunk boundaries
\r
107 typedef pair<uint64_t, uint64_t> ChunkPair;
\r
108 // list of chunks in a BAM bin
\r
109 typedef vector<ChunkPair> ChunkVector;
\r
111 // BAM bins for a reference sequence
\r
112 // replaces khash - uint32_t is key, ChunkVector is value
\r
113 typedef pair<uint32_t, ChunkVector*> BamBin;
\r
114 typedef vector<BamBin> BinVector;
\r
116 // each reference sequence has a BinVector and LinearOffsetVector
\r
117 typedef pair<BinVector*, LinearOffsetVector*> RefIndex;
\r
119 // full BamIndex defined as:
\r
120 typedef vector<RefIndex*> BamIndex;
\r
122 // ---------------------------------------------------------------------------//
\r
128 BamReader(const char* fileName = NULL, const char* indexFilename = NULL);
\r
134 // BAM interface methods
\r
137 // ----------------------- //
\r
138 // File manipulation
\r
139 // ----------------------- //
\r
141 // open BAM file (automatically opens index if provided)
\r
144 // open BAM index (allows index to be opened separately - i.e. sometime after the BAM file is opened)
\r
145 bool OpenIndex(void);
\r
150 // get BAM filename
\r
151 const char* Filename(void) const;
\r
153 // set BAM filename
\r
154 void SetFilename(const char*);
\r
156 // get BAM Index filename
\r
157 const char* IndexFilename(void) const;
\r
159 // set BAM Index filename
\r
160 void SetIndexFilename(const char*);
\r
162 // ----------------------- //
\r
163 // Access BAM header
\r
164 // ----------------------- //
\r
166 // return full header text
\r
167 const string GetHeaderText(void) const;
\r
169 // --------------------------------- //
\r
170 // Access reference sequence info
\r
171 // --------------------------------- //
\r
173 // return number of reference sequences in BAM file
\r
174 const int GetReferenceCount(void) const;
\r
176 // return vector of RefData entries
\r
177 const RefVector GetReferenceData(void) const;
\r
179 // get refID from reference name
\r
180 const int GetRefID(string refName) const;
\r
182 // ----------------------------------------- //
\r
183 // File position moving
\r
184 // ----------------------------------------- //
\r
186 // jumps to 'left' position on refID
\r
187 // actually jumps before position, so reads that overlap 'left' are included as well
\r
188 // 'left' defaults to reference begin if not specified
\r
189 bool Jump(int refID, unsigned int left = 0);
\r
191 // Jump to beginning of BAM file, clears any region previously set by Jump()
\r
194 // ------------------------------ //
\r
195 // Access alignments
\r
196 // ------------------------------ //
\r
198 // get next alignment
\r
199 bool GetNextAlignment(BamAlignment& read);
\r
201 // allow user to specifiy whether 'AlignedBases' string is calculated when alignment is loaded
\r
202 void SetCalculateAlignedBases(bool);
\r
204 // internal utility methods
\r
206 int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
\r
207 uint32_t CalculateAlignmentEnd(const unsigned int&, const vector<CigarOp>&);
\r
208 int64_t GetOffset(int, unsigned int);
\r
209 bool IsOverlap(BamAlignment&);
\r
210 bool LoadHeader(void);
\r
211 bool LoadIndex(void);
\r
212 bool LoadNextAlignment(BamAlignment&);
\r
215 // main BAM reader components
\r
217 char* m_indexFilename;
\r
220 RefVector m_references;
\r
221 string m_headerText;
\r
224 bool m_isOpen; // BAM file is open for processing
\r
225 bool m_isIndexLoaded; // BAM Index data is loaded and available for processing
\r
226 bool m_isRegionSpecified; // a region has been specified - specifically, a user has called Jump()
\r
227 bool m_isCalculateAlignedBases; // build 'AlignedBases' string when getting an alignment, otherwise skip (default = true)
\r
230 int m_currentRefID;
\r
231 unsigned int m_currentLeft;
\r
233 // file offset of 1st read in BAM file
\r
234 int64_t m_alignmentsBeginOffset;
\r
237 // BAM character constants
\r
238 static const char* DNA_LOOKUP;
\r
239 static const char* CIGAR_LOOKUP;
\r
242 #endif /* BAMREADER_H */
\r