1 // ***************************************************************************
\r
2 // BamAux.h (c) 2009 Derek Barnett, Michael Strömberg
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 11 Janaury 2010 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // Provides the basic constants, data structures, etc. for using BAM files
\r
9 // ***************************************************************************
\r
20 #include <exception>
\r
26 // Platform-specific type definitions
\r
27 #ifndef BAMTOOLS_TYPES
\r
28 #define BAMTOOLS_TYPES
\r
30 typedef char int8_t;
\r
31 typedef unsigned char uint8_t;
\r
32 typedef short int16_t;
\r
33 typedef unsigned short uint16_t;
\r
34 typedef int int32_t;
\r
35 typedef unsigned int uint32_t;
\r
36 typedef long long int64_t;
\r
37 typedef unsigned long long uint64_t;
\r
41 #endif // BAMTOOLS_TYPES
\r
43 namespace BamTools {
\r
46 const int BAM_CORE_SIZE = 32;
\r
47 const int BAM_CMATCH = 0;
\r
48 const int BAM_CINS = 1;
\r
49 const int BAM_CDEL = 2;
\r
50 const int BAM_CREF_SKIP = 3;
\r
51 const int BAM_CSOFT_CLIP = 4;
\r
52 const int BAM_CHARD_CLIP = 5;
\r
53 const int BAM_CPAD = 6;
\r
54 const int BAM_CIGAR_SHIFT = 4;
\r
55 const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);
\r
57 // BAM index constants
\r
58 const int MAX_BIN = 37450; // =(8^6-1)/7+1
\r
59 const int BAM_MIN_CHUNK_GAP = 32768;
\r
60 const int BAM_LIDX_SHIFT = 14;
\r
62 // Explicit variable sizes
\r
63 const int BT_SIZEOF_INT = 4;
\r
67 struct BamAlignment {
\r
69 // Queries against alignment flag
\r
71 // Returns true if this read is a PCR duplicate (determined by external app)
\r
72 bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }
\r
73 // Returns true if this read failed quality control (determined by external app)
\r
74 bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }
\r
75 // Returns true if alignment is first mate on read
\r
76 bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }
\r
77 // Returns true if alignment is mapped
\r
78 bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }
\r
79 // Returns true if alignment's mate is mapped
\r
80 bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
\r
81 // Returns true if alignment's mate mapped to reverse strand
\r
82 bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }
\r
83 // Returns true if alignment part of paired-end read
\r
84 bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }
\r
85 // Returns true if this position is primary alignment (determined by external app)
\r
86 bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }
\r
87 // Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app)
\r
88 bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }
\r
89 // Returns true if alignment mapped to reverse strand
\r
90 bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }
\r
91 // Returns true if alignment is second mate on read
\r
92 bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }
\r
94 // Manipulate alignment flag
\r
96 // Sets "PCR duplicate" bit
\r
97 void SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; }
\r
98 // Sets "failed quality control" bit
\r
99 void SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; }
\r
100 // Sets "alignment is first mate" bit
\r
101 void SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; }
\r
102 // Sets "alignment's mate is mapped" bit
\r
103 void SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; }
\r
104 // Sets "alignment's mate mapped to reverse strand" bit
\r
105 void SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; }
\r
106 // Sets "alignment part of paired-end read" bit
\r
107 void SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; }
\r
108 // Sets "alignment is part of read that satisfied paired-end resolution" bit
\r
109 void SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; }
\r
110 // Sets "alignment mapped to reverse strand" bit
\r
111 void SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; }
\r
112 // Sets "position is primary alignment (determined by external app)"
\r
113 void SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; }
\r
114 // Sets "alignment is second mate on read" bit
\r
115 void SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; }
\r
116 // Sets "alignment is mapped" bit
\r
117 void SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; }
\r
121 // get "RG" tag data
\r
122 bool GetReadGroup(std::string& readGroup) const {
\r
124 if ( TagData.empty() ) { return false; }
\r
126 // localize the tag data
\r
127 char* pTagData = (char*)TagData.data();
\r
128 const unsigned int tagDataLen = TagData.size();
\r
129 unsigned int numBytesParsed = 0;
\r
131 bool foundReadGroupTag = false;
\r
132 while( numBytesParsed < tagDataLen ) {
\r
134 const char* pTagType = pTagData;
\r
135 const char* pTagStorageType = pTagData + 2;
\r
137 numBytesParsed += 3;
\r
139 // check the current tag
\r
140 if ( std::strncmp(pTagType, "RG", 2) == 0 ) {
\r
141 foundReadGroupTag = true;
\r
145 // get the storage class and find the next tag
\r
146 SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );
\r
149 // return if the read group tag was not present
\r
150 if ( !foundReadGroupTag ) { return false; }
\r
152 // assign the read group
\r
153 const unsigned int readGroupLen = std::strlen(pTagData);
\r
154 readGroup.resize(readGroupLen);
\r
155 std::memcpy( (char*)readGroup.data(), pTagData, readGroupLen );
\r
159 // get "NM" tag data - contributed by Aaron Quinlan
\r
160 bool GetEditDistance(uint8_t& editDistance) const {
\r
162 if ( TagData.empty() ) { return false; }
\r
164 // localize the tag data
\r
165 char* pTagData = (char*)TagData.data();
\r
166 const unsigned int tagDataLen = TagData.size();
\r
167 unsigned int numBytesParsed = 0;
\r
169 bool foundEditDistanceTag = false;
\r
170 while( numBytesParsed < tagDataLen ) {
\r
172 const char* pTagType = pTagData;
\r
173 const char* pTagStorageType = pTagData + 2;
\r
175 numBytesParsed += 3;
\r
177 // check the current tag
\r
178 if ( strncmp(pTagType, "NM", 2) == 0 ) {
\r
179 foundEditDistanceTag = true;
\r
183 // get the storage class and find the next tag
\r
184 SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );
\r
186 // return if the edit distance tag was not present
\r
187 if ( !foundEditDistanceTag ) { return false; }
\r
189 // assign the editDistance value
\r
190 memcpy(&editDistance, pTagData, 1);
\r
195 static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
\r
196 switch(storageType) {
\r
208 numBytesParsed += 2;
\r
214 numBytesParsed += 4;
\r
227 printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData);
\r
234 std::string Name; // Read name
\r
235 int32_t Length; // Query length
\r
236 std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)
\r
237 std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)
\r
238 std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
\r
239 std::string TagData; // Tag data (accessor methods will pull the requested information out)
\r
240 int32_t RefID; // ID number for reference sequence
\r
241 int32_t Position; // Position (0-based) where alignment starts
\r
242 uint16_t Bin; // Bin in BAM file where this alignment resides
\r
243 uint16_t MapQuality; // Mapping quality score
\r
244 uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate
\r
245 std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
\r
246 int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
\r
247 int32_t MatePosition; // Position (0-based) where alignment's mate starts
\r
248 int32_t InsertSize; // Mate-pair insert size
\r
250 // Alignment flag query constants
\r
266 // ----------------------------------------------------------------
\r
267 // Auxiliary data structs & typedefs
\r
270 char Type; // Operation type (MIDNSHP)
\r
271 uint32_t Length; // Operation length (number of bases)
\r
276 std::string RefName; // Name of reference sequence
\r
277 int RefLength; // Length of reference sequence
\r
278 bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence
\r
282 , RefHasAlignments(false)
\r
286 typedef std::vector<RefData> RefVector;
\r
287 typedef std::vector<BamAlignment> BamAlignmentVector;
\r
289 // ----------------------------------------------------------------
\r
290 // Indexing structs & typedefs
\r
297 Chunk(const uint64_t& start = 0, const uint64_t& stop = 0)
\r
304 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
\r
305 return lhs.Start < rhs.Start;
\r
308 typedef std::vector<Chunk> ChunkVector;
\r
309 typedef std::map<uint32_t, ChunkVector> BamBinMap;
\r
310 typedef std::vector<uint64_t> LinearOffsetVector;
\r
312 struct ReferenceIndex {
\r
315 LinearOffsetVector Offsets;
\r
317 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
\r
318 const LinearOffsetVector& offsets = LinearOffsetVector())
\r
324 typedef std::vector<ReferenceIndex> BamIndex;
\r
326 } // namespace BamTools
\r