1 // ***************************************************************************
\r
2 // BamAux.h (c) 2009 Derek Barnett, Michael Strömberg
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 8 December 2009 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // Provides the basic constants, data structures, etc. for using BAM files
\r
9 // ***************************************************************************
\r
19 #include <exception>
\r
25 namespace BamTools {
\r
28 const int BAM_CORE_SIZE = 32;
\r
29 const int BAM_CMATCH = 0;
\r
30 const int BAM_CINS = 1;
\r
31 const int BAM_CDEL = 2;
\r
32 const int BAM_CREF_SKIP = 3;
\r
33 const int BAM_CSOFT_CLIP = 4;
\r
34 const int BAM_CHARD_CLIP = 5;
\r
35 const int BAM_CPAD = 6;
\r
36 const int BAM_CIGAR_SHIFT = 4;
\r
37 const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);
\r
39 // BAM index constants
\r
40 const int MAX_BIN = 37450; // =(8^6-1)/7+1
\r
41 const int BAM_MIN_CHUNK_GAP = 32768;
\r
42 const int BAM_LIDX_SHIFT = 14;
\r
44 // Explicit variable sizes
\r
45 const int BT_SIZEOF_INT = 4;
\r
49 struct BamAlignment {
\r
51 // Queries against alignment flag
\r
53 // Returns true if this read is a PCR duplicate (determined by external app)
\r
54 bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }
\r
55 // Returns true if this read failed quality control (determined by external app)
\r
56 bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }
\r
57 // Returns true if alignment is first mate on read
\r
58 bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }
\r
59 // Returns true if alignment is mapped
\r
60 bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }
\r
61 // Returns true if alignment's mate is mapped
\r
62 bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
\r
63 // Returns true if alignment's mate mapped to reverse strand
\r
64 bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }
\r
65 // Returns true if alignment part of paired-end read
\r
66 bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }
\r
67 // Returns true if this position is primary alignment (determined by external app)
\r
68 bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }
\r
69 // Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app)
\r
70 bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }
\r
71 // Returns true if alignment mapped to reverse strand
\r
72 bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }
\r
73 // Returns true if alignment is second mate on read
\r
74 bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }
\r
78 // get "RG" tag data
\r
79 bool GetReadGroup(std::string& readGroup) const {
\r
81 if ( TagData.empty() ) { return false; }
\r
83 // localize the tag data
\r
84 char* pTagData = (char*)TagData.data();
\r
85 const unsigned int tagDataLen = TagData.size();
\r
86 unsigned int numBytesParsed = 0;
\r
88 bool foundReadGroupTag = false;
\r
89 while( numBytesParsed < tagDataLen ) {
\r
91 const char* pTagType = pTagData;
\r
92 const char* pTagStorageType = pTagData + 2;
\r
94 numBytesParsed += 3;
\r
96 // check the current tag
\r
97 if ( std::strncmp(pTagType, "RG", 2) == 0 ) {
\r
98 foundReadGroupTag = true;
\r
102 // get the storage class and find the next tag
\r
103 SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );
\r
106 // return if the read group tag was not present
\r
107 if ( !foundReadGroupTag ) { return false; }
\r
109 // assign the read group
\r
110 const unsigned int readGroupLen = std::strlen(pTagData);
\r
111 readGroup.resize(readGroupLen);
\r
112 std::memcpy( (char*)readGroup.data(), pTagData, readGroupLen );
\r
116 // get "NM" tag data - contributed by Aaron Quinlan
\r
117 bool GetEditDistance(uint8_t& editDistance) const {
\r
119 if ( TagData.empty() ) { return false; }
\r
121 // localize the tag data
\r
122 char* pTagData = (char*)TagData.data();
\r
123 const unsigned int tagDataLen = TagData.size();
\r
124 unsigned int numBytesParsed = 0;
\r
126 bool foundEditDistanceTag = false;
\r
127 while( numBytesParsed < tagDataLen ) {
\r
129 const char* pTagType = pTagData;
\r
130 const char* pTagStorageType = pTagData + 2;
\r
132 numBytesParsed += 3;
\r
134 // check the current tag
\r
135 if ( strncmp(pTagType, "NM", 2) == 0 ) {
\r
136 foundEditDistanceTag = true;
\r
140 // get the storage class and find the next tag
\r
141 SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );
\r
143 // return if the edit distance tag was not present
\r
144 if ( !foundEditDistanceTag ) { return false; }
\r
146 // assign the editDistance value
\r
147 memcpy(&editDistance, pTagData, 1);
\r
152 static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
\r
153 switch(storageType) {
\r
165 numBytesParsed += 2;
\r
171 numBytesParsed += 4;
\r
184 printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData);
\r
191 std::string Name; // Read name
\r
192 int32_t Length; // Query length
\r
193 std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)
\r
194 std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)
\r
195 std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
\r
196 std::string TagData; // Tag data (accessor methods will pull the requested information out)
\r
197 int32_t RefID; // ID number for reference sequence
\r
198 int32_t Position; // Position (0-based) where alignment starts
\r
199 uint16_t Bin; // Bin in BAM file where this alignment resides
\r
200 uint16_t MapQuality; // Mapping quality score
\r
201 uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods for available queries
\r
202 std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
\r
203 int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
\r
204 int32_t MatePosition; // Position (0-based) where alignment's mate starts
\r
205 int32_t InsertSize; // Mate-pair insert size
\r
207 // Alignment flag query constants
\r
223 // ----------------------------------------------------------------
\r
224 // Auxiliary data structs & typedefs
\r
227 char Type; // Operation type (MIDNSHP)
\r
228 uint32_t Length; // Operation length (number of bases)
\r
233 std::string RefName; // Name of reference sequence
\r
234 unsigned int RefLength; // Length of reference sequence
\r
235 bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence
\r
239 , RefHasAlignments(false)
\r
243 typedef std::vector<RefData> RefVector;
\r
244 typedef std::vector<BamAlignment> BamAlignmentVector;
\r
246 // ----------------------------------------------------------------
\r
247 // Indexing structs & typedefs
\r
254 Chunk(const uint64_t& start = 0, const uint64_t& stop = 0)
\r
261 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
\r
262 return lhs.Start < rhs.Start;
\r
265 typedef std::vector<Chunk> ChunkVector;
\r
266 typedef std::map<uint32_t, ChunkVector> BamBinMap;
\r
267 typedef std::vector<uint64_t> LinearOffsetVector;
\r
269 struct ReferenceIndex {
\r
272 LinearOffsetVector Offsets;
\r
274 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
\r
275 const LinearOffsetVector& offsets = LinearOffsetVector())
\r
281 typedef std::vector<ReferenceIndex> BamIndex;
\r
283 } // namespace BamTools
\r