5 Copyright (c) 2008 Genome Research Ltd (GRL).
\r
7 Permission is hereby granted, free of charge, to any person obtaining
\r
8 a copy of this software and associated documentation files (the
\r
9 "Software"), to deal in the Software without restriction, including
\r
10 without limitation the rights to use, copy, modify, merge, publish,
\r
11 distribute, sublicense, and/or sell copies of the Software, and to
\r
12 permit persons to whom the Software is furnished to do so, subject to
\r
13 the following conditions:
\r
15 The above copyright notice and this permission notice shall be
\r
16 included in all copies or substantial portions of the Software.
\r
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
\r
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
\r
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
\r
21 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
\r
22 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
\r
23 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
\r
24 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
\r
29 Implementation of BAM-parsing was translated to C++ directly from Heng Li's SAMtools package
\r
30 (thus the carryover of above MIT license)
\r
31 Contact: Derek Barnett <barnetde@bc.edu>
\r
35 // Marth Lab, Boston College
\r
36 // Last modified: 6 April 2009
\r
42 #include "BamAlignment.h"
\r
43 #include "STLUtilities.h"
\r
45 // C library includes
\r
52 // BGZF library includes/defines
\r
54 typedef BGZF* BamFile;
\r
55 #define bam_open(f_name, mode) bgzf_open(f_name, mode)
\r
56 #define bam_close(f_ptr) bgzf_close(f_ptr)
\r
57 #define bam_read(f_ptr, buf, size) bgzf_read(f_ptr, buf, size)
\r
58 #define bam_write(f_ptr, buf, size) bgzf_write(f_ptr, buf, size)
\r
59 #define bam_tell(f_ptr) bgzf_tell(f_ptr)
\r
60 #define bam_seek(f_ptr, pos, dir) bgzf_seek(f_ptr, pos, dir)
\r
62 // size of alignment data block in BAM file (bytes)
\r
63 #define BAM_CORE_SIZE 32
\r
65 // BAM indexing constants
\r
66 #define MAX_BIN 37450 // =(8^6-1)/7+1
\r
67 #define BAM_MIN_CHUNK_GAP 32768
\r
68 #define BAM_LIDX_SHIFT 14
\r
70 // CIGAR-retrieval mask/shift constants
\r
71 #define BAM_CIGAR_SHIFT 4
\r
72 #define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
\r
74 // CIGAR-operation types
\r
75 #define BAM_CMATCH 0
\r
78 #define BAM_CREF_SKIP 3
\r
79 #define BAM_CSOFT_CLIP 4
\r
80 #define BAM_CHARD_CLIP 5
\r
83 // --------------------------- //
\r
85 // --------------------------- //
\r
87 // --------------------------- //
\r
88 // BamIndex-related typedefs
\r
89 // --------------------------- //
\r
91 // offset for linear indexing
\r
92 typedef vector<uint64_t> LinearOffsetVector;
\r
95 typedef pair<uint64_t, uint64_t> ChunkPair;
\r
96 // list of chunks in a BAM bin
\r
97 typedef vector<ChunkPair> ChunkVector;
\r
99 // BAM bins for a reference sequence
\r
100 // replaces khash - uint32_t is key, ChunkVector is value
\r
101 typedef pair<uint32_t, ChunkVector*> BamBin;
\r
102 typedef vector<BamBin> BinVector;
\r
104 // each reference sequence has a BinVector and LinearOffsetVector
\r
105 typedef pair<BinVector*, LinearOffsetVector*> RefIndex;
\r
107 // full BamIndex defined as:
\r
108 typedef vector<RefIndex*> BamIndex;
\r
110 // ---------------------------------------------------------------------------//
\r
116 BamReader(const char* fileName = NULL, const char* indexFilename = NULL);
\r
122 // BAM interface methods
\r
125 // ----------------------- //
\r
126 // File manipulation
\r
127 // ----------------------- //
\r
129 // open BAM file (automatically opens index if provided)
\r
132 // open BAM index (allows index to be opened separately - i.e. sometime after the BAM file is opened)
\r
133 bool OpenIndex(void);
\r
138 // get BAM filename
\r
139 const char* Filename(void) const;
\r
141 // set BAM filename
\r
142 void SetFilename(const char*);
\r
144 // get BAM Index filename
\r
145 const char* IndexFilename(void) const;
\r
147 // set BAM Index filename
\r
148 void SetIndexFilename(const char*);
\r
150 // ----------------------- //
\r
151 // Access BAM header
\r
152 // ----------------------- //
\r
154 // return full header text
\r
155 const string GetHeaderText(void) const;
\r
157 // --------------------------------- //
\r
158 // Access reference sequence info
\r
159 // --------------------------------- //
\r
161 // return number of reference sequences in BAM file
\r
162 const int GetReferenceCount(void) const;
\r
164 // return vector of RefData entries
\r
165 const RefVector GetReferenceData(void) const;
\r
167 // get refID from reference name
\r
168 const int GetRefID(string refName) const;
\r
170 // ----------------------------------------- //
\r
171 // File position moving
\r
172 // ----------------------------------------- //
\r
174 // jumps to 'left' position on refID
\r
175 // actually jumps before position, so reads that overlap 'left' are included as well
\r
176 // 'left' defaults to reference begin if not specified
\r
177 bool Jump(int refID, unsigned int left = 0);
\r
179 // Jump to beginning of BAM file, clears any region previously set by Jump()
\r
182 // ------------------------------ //
\r
183 // Access alignments
\r
184 // ------------------------------ //
\r
186 // get next alignment
\r
187 bool GetNextAlignment(BamAlignment& read);
\r
189 // allow user to specifiy whether 'AlignedBases' string is calculated when alignment is loaded
\r
190 void SetCalculateAlignedBases(bool);
\r
192 // internal utility methods
\r
194 int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
\r
195 uint32_t CalculateAlignmentEnd(const unsigned int&, const vector<CigarOp>&);
\r
196 uint64_t GetOffset(int, unsigned int);
\r
197 bool IsOverlap(BamAlignment&);
\r
198 bool LoadHeader(void);
\r
199 bool LoadIndex(void);
\r
200 bool LoadNextAlignment(BamAlignment&);
\r
203 // main BAM reader components
\r
205 char* m_indexFilename;
\r
208 RefVector m_references;
\r
209 string m_headerText;
\r
212 bool m_isOpen; // BAM file is open for processing
\r
213 bool m_isIndexLoaded; // BAM Index data is loaded and available for processing
\r
214 bool m_isRegionSpecified; // a region has been specified - specifically, a user has called Jump()
\r
215 bool m_isCalculateAlignedBases; // build 'AlignedBases' string when getting an alignment, otherwise skip (default = true)
\r
218 int m_currentRefID;
\r
219 unsigned int m_currentLeft;
\r
221 // file offset of 1st read in BAM file
\r
222 int64_t m_alignmentsBeginOffset;
\r
225 // BAM character constants
\r
226 static const char* DNA_LOOKUP;
\r
227 static const char* CIGAR_LOOKUP;
\r
230 #endif /* BAMREADER_H */
\r