1 // ***************************************************************************
\r
3 // BamAux.h (c) 2009 Derek Barnett, Michael Strömberg
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 8 June 2010 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // Provides the basic constants, data structures, etc. for using BAM files
\r
9 // ***************************************************************************
\r
20 #include <exception>
\r
26 // Platform-specific type definitions
\r
27 #ifndef BAMTOOLS_TYPES
\r
28 #define BAMTOOLS_TYPES
\r
// Fixed-width integer aliases, defined only once per translation unit via the
// surrounding BAMTOOLS_TYPES guard.
// NOTE(review): the widths assume char/short/int/long long are 8/16/32/64 bits
// on the target compiler - confirm for any new platform.
typedef signed char        int8_t;    // 'signed' is explicit: plain char's signedness is implementation-defined
typedef unsigned char      uint8_t;
typedef short              int16_t;
typedef unsigned short     uint16_t;
typedef int                int32_t;
typedef unsigned int       uint32_t;
typedef long long          int64_t;
typedef unsigned long long uint64_t;
\r
41 #endif // BAMTOOLS_TYPES
\r
43 namespace BamTools {
\r
// Core record size used when reading/writing BAM alignments.
const int BAM_CORE_SIZE = 32;

// Numeric CIGAR operation codes (0..6).
const int BAM_CMATCH     = 0;
const int BAM_CINS       = 1;
const int BAM_CDEL       = 2;
const int BAM_CREF_SKIP  = 3;
const int BAM_CSOFT_CLIP = 4;
const int BAM_CHARD_CLIP = 5;
const int BAM_CPAD       = 6;

// A packed CIGAR value keeps the operation code in its low BAM_CIGAR_SHIFT
// bits; BAM_CIGAR_MASK selects exactly those bits (0x0F).
const int BAM_CIGAR_SHIFT = 4;
const int BAM_CIGAR_MASK  = (1 << BAM_CIGAR_SHIFT) - 1;

// BAM index constants
const int MAX_BIN           = ((1 << 18) - 1) / 7 + 1;  // (8^6 - 1) / 7 + 1 == 37450
const int BAM_MIN_CHUNK_GAP = 1 << 15;                  // 32768
const int BAM_LIDX_SHIFT    = 14;

// Explicit variable sizes
const int BT_SIZEOF_INT = 4;
\r
67 struct BamAlignment {
\r
69 // constructors & destructor
\r
72 BamAlignment(const BamAlignment& other);
\r
73 ~BamAlignment(void);
\r
75 // Queries against alignment flags
\r
77 bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate
\r
78 bool IsFailedQC(void) const; // Returns true if this read failed quality control
\r
79 bool IsFirstMate(void) const; // Returns true if alignment is first mate on read
\r
80 bool IsMapped(void) const; // Returns true if alignment is mapped
\r
81 bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped
\r
82 bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand
\r
83 bool IsPaired(void) const; // Returns true if alignment part of paired-end read
\r
84 bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment
\r
85 bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution
\r
86 bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand
\r
87 bool IsSecondMate(void) const; // Returns true if alignment is second mate on read
\r
89 // Manipulate alignment flags
\r
91 void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag
\r
92 void SetIsFailedQC(bool ok); // Sets "failed quality control" flag
\r
93 void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag
\r
94 void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is unmapped" flag
\r
95 void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag
\r
96 void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag
\r
97 void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag
\r
98 void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag
\r
99 void SetIsSecondaryAlignment(bool ok); // Sets "position is a secondary (not primary) alignment" flag
\r
100 void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag
\r
101 void SetIsUnmapped(bool ok); // Sets "alignment is unmapped" flag
\r
103 // Tag data access methods
\r
105 bool GetEditDistance(uint8_t& editDistance) const; // get "NM" tag data - contributed by Aaron Quinlan
\r
106 bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data
\r
108 bool GetTag(const std::string& tag, std::string& destination);
\r
109 template<typename T> bool GetTag(const std::string& tag, T& destination);
\r
111 // Additional data access methods
\r
113 int GetEndPosition(bool usePadded = false) const; // calculates alignment end position, based on starting position and CIGAR operations
\r
115 // 'internal' utility methods
\r
117 static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);
\r
121 std::string Name; // Read name
\r
122 int32_t Length; // Query length
\r
123 std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)
\r
124 std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)
\r
125 std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
\r
126 std::string TagData; // Tag data (accessor methods will pull the requested information out)
\r
127 int32_t RefID; // ID number for reference sequence
\r
128 int32_t Position; // Position (0-based) where alignment starts
\r
129 uint16_t Bin; // Bin in BAM file where this alignment resides
\r
130 uint16_t MapQuality; // Mapping quality score
\r
131 uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate
\r
132 std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
\r
133 int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
\r
134 int32_t MatePosition; // Position (0-based) where alignment's mate starts
\r
135 int32_t InsertSize; // Mate-pair insert size
\r
137 // Alignment flag query constants
\r
138 // Use the get/set methods above instead
\r
143 , MATE_UNMAPPED = 8
\r
145 , MATE_REVERSE = 32
\r
150 , DUPLICATE = 1024
\r
154 // ----------------------------------------------------------------
\r
155 // Auxiliary data structs & typedefs
\r
157 struct BamAlignmentSupportData {
\r
160 std::string AllCharData;
\r
161 uint32_t BlockLength;
\r
162 uint32_t NumCigarOperations;
\r
163 uint32_t QueryNameLength;
\r
164 uint32_t QuerySequenceLength;
\r
167 BamAlignmentSupportData(void)
\r
169 , NumCigarOperations(0)
\r
170 , QueryNameLength(0)
\r
171 , QuerySequenceLength(0)
\r
178 char Type; // Operation type (MIDNSHP)
\r
179 uint32_t Length; // Operation length (number of bases)
\r
182 CigarOp(const char type = '\0',
\r
183 const uint32_t length = 0)
\r
192 std::string RefName; // Name of reference sequence
\r
193 int32_t RefLength; // Length of reference sequence
\r
194 bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence
\r
197 RefData(const int32_t& length = 0,
\r
199 : RefLength(length)
\r
200 , RefHasAlignments(ok)
\r
204 typedef std::vector<RefData> RefVector;
\r
205 typedef std::vector<BamAlignment> BamAlignmentVector;
\r
207 // ----------------------------------------------------------------
\r
208 // Indexing structs & typedefs
\r
217 Chunk(const uint64_t& start = 0,
\r
218 const uint64_t& stop = 0)
\r
225 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
\r
226 return lhs.Start < rhs.Start;
\r
229 typedef std::vector<Chunk> ChunkVector;
\r
230 typedef std::map<uint32_t, ChunkVector> BamBinMap;
\r
231 typedef std::vector<uint64_t> LinearOffsetVector;
\r
233 struct ReferenceIndex {
\r
236 LinearOffsetVector Offsets;
\r
238 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
\r
239 const LinearOffsetVector& offsets = LinearOffsetVector())
\r
245 typedef std::vector<ReferenceIndex> BamIndex;
\r
247 // ----------------------------------------------------------------
\r
248 // BamAlignment member methods
\r
250 // constructors & destructor
\r
252 BamAlignment::BamAlignment(void) { }
\r
255 BamAlignment::BamAlignment(const BamAlignment& other)
\r
257 , Length(other.Length)
\r
258 , QueryBases(other.QueryBases)
\r
259 , AlignedBases(other.AlignedBases)
\r
260 , Qualities(other.Qualities)
\r
261 , TagData(other.TagData)
\r
262 , RefID(other.RefID)
\r
263 , Position(other.Position)
\r
265 , MapQuality(other.MapQuality)
\r
266 , AlignmentFlag(other.AlignmentFlag)
\r
267 , CigarData(other.CigarData)
\r
268 , MateRefID(other.MateRefID)
\r
269 , MatePosition(other.MatePosition)
\r
270 , InsertSize(other.InsertSize)
\r
274 BamAlignment::~BamAlignment(void) { }
\r
276 // Queries against alignment flags
\r
277 inline bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }
\r
278 inline bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }
\r
279 inline bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }
\r
280 inline bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }
\r
281 inline bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
\r
282 inline bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }
\r
283 inline bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }
\r
284 inline bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }
\r
285 inline bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }
\r
286 inline bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }
\r
287 inline bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }
\r
289 // Manipulate alignment flags
\r
290 inline void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; }
\r
291 inline void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; }
\r
292 inline void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; }
\r
293 inline void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; }
\r
294 inline void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; }
\r
295 inline void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; }
\r
296 inline void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; }
\r
297 inline void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; }
\r
298 inline void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; }
\r
299 inline void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; }
\r
300 inline void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; }
\r
302 // calculates alignment end position, based on starting position and CIGAR operations
\r
304 int BamAlignment::GetEndPosition(bool usePadded) const {
\r
306 // initialize alignment end to starting position
\r
307 int alignEnd = Position;
\r
309 // iterate over cigar operations
\r
310 std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
\r
311 std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
\r
312 for ( ; cigarIter != cigarEnd; ++cigarIter) {
\r
313 const char cigarType = (*cigarIter).Type;
\r
314 if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
\r
315 alignEnd += (*cigarIter).Length;
\r
317 else if ( usePadded && cigarType == 'I' ) {
\r
318 alignEnd += (*cigarIter).Length;
\r
324 // get "NM" tag data - contributed by Aaron Quinlan
\r
325 // stores data in 'editDistance', returns success/fail
\r
327 bool BamAlignment::GetEditDistance(uint8_t& editDistance) const {
\r
329 if ( TagData.empty() ) { return false; }
\r
331 // localize the tag data
\r
332 char* pTagData = (char*)TagData.data();
\r
333 const unsigned int tagDataLen = TagData.size();
\r
334 unsigned int numBytesParsed = 0;
\r
336 bool foundEditDistanceTag = false;
\r
337 while( numBytesParsed < tagDataLen ) {
\r
339 const char* pTagType = pTagData;
\r
340 const char* pTagStorageType = pTagData + 2;
\r
342 numBytesParsed += 3;
\r
344 // check the current tag
\r
345 if ( strncmp(pTagType, "NM", 2) == 0 ) {
\r
346 foundEditDistanceTag = true;
\r
350 // get the storage class and find the next tag
\r
351 if (*pTagStorageType == '\0') { return false; }
\r
352 SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );
\r
353 if (*pTagData == '\0') { return false; }
\r
355 // return if the edit distance tag was not present
\r
356 if ( !foundEditDistanceTag ) { return false; }
\r
358 // assign the editDistance value
\r
359 std::memcpy(&editDistance, pTagData, 1);
\r
363 // get "RG" tag data
\r
364 // stores data in 'readGroup', returns success/fail
\r
366 bool BamAlignment::GetReadGroup(std::string& readGroup) const {
\r
368 if ( TagData.empty() ) { return false; }
\r
370 // localize the tag data
\r
371 char* pTagData = (char*)TagData.data();
\r
372 const unsigned int tagDataLen = TagData.size();
\r
373 unsigned int numBytesParsed = 0;
\r
375 bool foundReadGroupTag = false;
\r
376 while( numBytesParsed < tagDataLen ) {
\r
378 const char* pTagType = pTagData;
\r
379 const char* pTagStorageType = pTagData + 2;
\r
381 numBytesParsed += 3;
\r
383 // check the current tag
\r
384 if ( std::strncmp(pTagType, "RG", 2) == 0 ) {
\r
385 foundReadGroupTag = true;
\r
389 // get the storage class and find the next tag
\r
390 if (*pTagStorageType == '\0') { return false; }
\r
391 SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );
\r
392 if (*pTagData == '\0') { return false; }
\r
395 // return if the read group tag was not present
\r
396 if ( !foundReadGroupTag ) { return false; }
\r
398 // assign the read group
\r
399 const unsigned int readGroupLen = std::strlen(pTagData);
\r
400 readGroup.resize(readGroupLen);
\r
401 std::memcpy( (char*)readGroup.data(), pTagData, readGroupLen );
\r
406 bool BamAlignment::GetTag(const std::string& tag, std::string& destination) {
\r
408 if ( TagData.empty() ) { return false; }
\r
410 // localize the tag data
\r
411 char* pTagData = (char*)TagData.data();
\r
412 const unsigned int tagDataLen = TagData.size();
\r
413 unsigned int numBytesParsed = 0;
\r
415 bool foundReadGroupTag = false;
\r
416 while( numBytesParsed < tagDataLen ) {
\r
418 const char* pTagType = pTagData;
\r
419 const char* pTagStorageType = pTagData + 2;
\r
421 numBytesParsed += 3;
\r
423 // check the current tag
\r
424 if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 ) {
\r
425 foundReadGroupTag = true;
\r
429 // get the storage class and find the next tag
\r
430 if (*pTagStorageType == '\0') { return false; }
\r
431 SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );
\r
432 if (*pTagData == '\0') { return false; }
\r
435 // return if the requested tag was not present
\r
436 if ( !foundReadGroupTag ) { return false; }
\r
438 // copy the tag's string data into 'destination'
\r
439 const unsigned int dataLen = std::strlen(pTagData);
\r
440 destination.resize(dataLen);
\r
441 std::memcpy( (char*)destination.data(), pTagData, dataLen );
\r
445 template<typename T>
\r
446 bool BamAlignment::GetTag(const std::string& tag, T& destination) {
\r
448 if ( TagData.empty() ) { return false; }
\r
450 // localize the tag data
\r
451 char* pTagData = (char*)TagData.data();
\r
452 const unsigned int tagDataLen = TagData.size();
\r
453 unsigned int numBytesParsed = 0;
\r
455 bool foundDesiredTag = false;
\r
456 while( numBytesParsed < tagDataLen ) {
\r
458 const char* pTagType = pTagData;
\r
459 const char* pTagStorageType = pTagData + 2;
\r
461 numBytesParsed += 3;
\r
463 // check the current tag
\r
464 if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) {
\r
465 foundDesiredTag = true;
\r
469 // get the storage class and find the next tag
\r
470 if (*pTagStorageType == '\0') { return false; }
\r
471 SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );
\r
472 if (*pTagData == '\0') { return false; }
\r
474 // return if the requested tag was not present
\r
475 if ( !foundDesiredTag ) { return false; }
\r
477 // copy sizeof(T) bytes of the tag's data into 'destination'
\r
478 std::memcpy(&destination, pTagData, sizeof(T));
\r
483 void BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
\r
485 switch(storageType) {
\r
496 numBytesParsed += 2;
\r
503 numBytesParsed += 4;
\r
513 // ---------------------------
\r
514 // Added: 3-25-2010 DWB
\r
515 // Contributed: ARQ
\r
516 // Fixed: error parsing variable length tag data
\r
518 // ---------------------------
\r
522 printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData);
\r
527 // ----------------------------------------------------------------
\r
528 // Added: 3-35-2010 DWB
\r
529 // Fixed: Routines to provide endian-correctness
\r
530 // ----------------------------------------------------------------
\r
532 // returns true if system is big endian
\r
533 inline bool SystemIsBigEndian(void) {
\r
534 const uint16_t one = 0x0001;
\r
535 return ((*(char*) &one) == 0 );
\r
538 // swaps endianness of 16-bit value 'in place'
\r
539 inline void SwapEndian_16(int16_t& x) {
\r
540 x = ((x >> 8) | (x << 8));
\r
543 inline void SwapEndian_16(uint16_t& x) {
\r
544 x = ((x >> 8) | (x << 8));
\r
547 // swaps endianness of 32-bit value 'in-place'
\r
548 inline void SwapEndian_32(int32_t& x) {
\r
550 ((x << 8) & 0x00FF0000) |
\r
551 ((x >> 8) & 0x0000FF00) |
\r
556 inline void SwapEndian_32(uint32_t& x) {
\r
558 ((x << 8) & 0x00FF0000) |
\r
559 ((x >> 8) & 0x0000FF00) |
\r
564 // swaps endianness of 64-bit value 'in-place'
\r
565 inline void SwapEndian_64(int64_t& x) {
\r
567 ((x << 40) & 0x00FF000000000000ll) |
\r
568 ((x << 24) & 0x0000FF0000000000ll) |
\r
569 ((x << 8) & 0x000000FF00000000ll) |
\r
570 ((x >> 8) & 0x00000000FF000000ll) |
\r
571 ((x >> 24) & 0x0000000000FF0000ll) |
\r
572 ((x >> 40) & 0x000000000000FF00ll) |
\r
577 inline void SwapEndian_64(uint64_t& x) {
\r
579 ((x << 40) & 0x00FF000000000000ll) |
\r
580 ((x << 24) & 0x0000FF0000000000ll) |
\r
581 ((x << 8) & 0x000000FF00000000ll) |
\r
582 ((x >> 8) & 0x00000000FF000000ll) |
\r
583 ((x >> 24) & 0x0000000000FF0000ll) |
\r
584 ((x >> 40) & 0x000000000000FF00ll) |
\r
589 // swaps endianness of 'next 2 bytes' in a char buffer (in-place)
\r
590 inline void SwapEndian_16p(char* data) {
\r
591 uint16_t& value = (uint16_t&)*data;
\r
592 SwapEndian_16(value);
\r
595 // swaps endianness of 'next 4 bytes' in a char buffer (in-place)
\r
596 inline void SwapEndian_32p(char* data) {
\r
597 uint32_t& value = (uint32_t&)*data;
\r
598 SwapEndian_32(value);
\r
601 // swaps endianness of 'next 8 bytes' in a char buffer (in-place)
\r
602 inline void SwapEndian_64p(char* data) {
\r
603 uint64_t& value = (uint64_t&)*data;
\r
604 SwapEndian_64(value);
\r
607 } // namespace BamTools
\r