1 // ***************************************************************************
\r
2 // BamAux.h (c) 2009 Derek Barnett, Michael Strömberg
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 24 June 2009 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
\r
10 // ---------------------------------------------------------------------------
\r
11 // Defines common constants, typedefs, & data structures for BamTools.
\r
12 // ***************************************************************************
\r
16 \brief BamTools constants, typedefs, & data structures
\r
22 #include <exception>
\r
32 // Platform-specific type definitions
\r
34 typedef char int8_t;
\r
35 typedef unsigned char uint8_t;
\r
36 typedef short int16_t;
\r
37 typedef unsigned short uint16_t;
\r
38 typedef int int32_t;
\r
39 typedef unsigned int uint32_t;
\r
40 typedef long long int64_t;
\r
41 typedef unsigned long long uint64_t;
\r
46 //! \namespace BamTools
\r
47 namespace BamTools {
\r
50 // --------------------------------------------------------------------------------------
\r
51 // This section is purely internal and can be excluded from main generated documentation.
\r
54 const int GZIP_ID1 = 31;
\r
55 const int GZIP_ID2 = 139;
\r
56 const int CM_DEFLATE = 8;
\r
57 const int FLG_FEXTRA = 4;
\r
58 const int OS_UNKNOWN = 255;
\r
59 const int BGZF_XLEN = 6;
\r
60 const int BGZF_ID1 = 66;
\r
61 const int BGZF_ID2 = 67;
\r
62 const int BGZF_LEN = 2;
\r
63 const int GZIP_WINDOW_BITS = -15;
\r
64 const int Z_DEFAULT_MEM_LEVEL = 8;
\r
67 const int BLOCK_HEADER_LENGTH = 18;
\r
68 const int BLOCK_FOOTER_LENGTH = 8;
\r
69 const int MAX_BLOCK_SIZE = 65536;
\r
70 const int DEFAULT_BLOCK_SIZE = 65536;
\r
73 const unsigned int BAM_CORE_SIZE = 32;
\r
74 const int BAM_CMATCH = 0;
\r
75 const int BAM_CINS = 1;
\r
76 const int BAM_CDEL = 2;
\r
77 const int BAM_CREF_SKIP = 3;
\r
78 const int BAM_CSOFT_CLIP = 4;
\r
79 const int BAM_CHARD_CLIP = 5;
\r
80 const int BAM_CPAD = 6;
\r
81 const int BAM_CIGAR_SHIFT = 4;
\r
82 const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);
\r
84 // BAM index constants
\r
85 const int MAX_BIN = 37450; // =(8^6-1)/7+1
\r
86 const int BAM_MIN_CHUNK_GAP = 32768;
\r
87 const int BAM_LIDX_SHIFT = 14;
\r
89 // Explicit variable sizes
\r
90 const int SIZEOF_INT = 4;
\r
93 unsigned int UncompressedBlockSize;
\r
94 unsigned int CompressedBlockSize;
\r
95 unsigned int BlockLength;
\r
96 unsigned int BlockOffset;
\r
97 uint64_t BlockAddress;
\r
100 char* UncompressedBlock;
\r
101 char* CompressedBlock;
\r
105 : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
\r
106 , CompressedBlockSize(MAX_BLOCK_SIZE)
\r
112 , UncompressedBlock(NULL)
\r
113 , CompressedBlock(NULL)
\r
116 CompressedBlock = new char[CompressedBlockSize];
\r
117 UncompressedBlock = new char[UncompressedBlockSize];
\r
118 } catch( std::bad_alloc& ba ) {
\r
119 printf("ERROR: Unable to allocate memory for our BGZF object.\n");
\r
126 if(CompressedBlock) delete [] CompressedBlock;
\r
127 if(UncompressedBlock) delete [] UncompressedBlock;
\r
132 // --------------------------------------------------------------------------------------
\r
135 //! \brief Cigar operation data structure
\r
137 char Type; //!< Operation type (MIDNSHP)
\r
138 uint32_t Length; //!< Operation length (number of bases)
\r
142 //! Reference sequence data structure
\r
144 std::string RefName; //!< Name of reference sequence
\r
145 unsigned int RefLength; //!< Length of reference sequence
\r
146 bool RefHasAlignments; //!< True if BAM file contains alignments mapped to reference sequence
\r
151 , RefHasAlignments(false)
\r
155 //! BAM alignment data structure
\r
156 struct BamAlignment {
\r
158 // Queries against alignment flag
\r
160 //! Returns true if this read is a PCR duplicate (determined by external app)
\r
161 bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }
\r
162 //! Returns true if this read failed quality control (determined by external app)
\r
163 bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }
\r
164 //! Returns true if alignment is first mate on read
\r
165 bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }
\r
166 //! Returns true if alignment is mapped
\r
167 bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }
\r
168 //! Returns true if alignment's mate is mapped
\r
169 bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
\r
170 //! Returns true if alignment's mate mapped to reverse strand
\r
171 bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }
\r
172 //! Returns true if alignment part of paired-end read
\r
173 bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }
\r
174 //! Returns true if this position is primary alignment (determined by external app)
\r
175 bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }
\r
176 //! Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app)
\r
177 bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }
\r
178 //! Returns true if alignment mapped to reverse strand
\r
179 bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }
\r
180 //! Returns true if alignment is second mate on read
\r
181 bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }
\r
185 \brief Get alignment's read group text.
\r
187 Assigns read group text to readGroup.
\r
189 \return True if read group data successfully retrieved.
\r
191 bool GetReadGroup(std::string& readGroup) const {
\r
193 if ( TagData.empty() ) { return false; }
\r
195 // localize the tag data
\r
196 char* pTagData = (char*)TagData.data();
\r
197 const unsigned int tagDataLen = TagData.size();
\r
198 unsigned int numBytesParsed = 0;
\r
200 bool foundReadGroupTag = false;
\r
201 while( numBytesParsed < tagDataLen ) {
\r
203 const char* pTagType = pTagData;
\r
204 const char* pTagStorageType = pTagData + 2;
\r
206 numBytesParsed += 3;
\r
208 // check the current tag
\r
209 if ( strncmp(pTagType, "RG", 2) == 0 ) {
\r
210 foundReadGroupTag = true;
\r
214 // get the storage class and find the next tag
\r
215 SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );
\r
218 // return if the read group tag was not present
\r
219 if ( !foundReadGroupTag ) { return false; }
\r
221 // assign the read group
\r
222 const unsigned int readGroupLen = strlen(pTagData);
\r
223 readGroup.resize(readGroupLen);
\r
224 memcpy( (char*)readGroup.data(), pTagData, readGroupLen );
\r
229 // skips to the next tag
\r
230 static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
\r
231 switch(storageType) {
\r
243 numBytesParsed += 2;
\r
249 numBytesParsed += 4;
\r
262 printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData);
\r
269 std::string Name; //!< Read name
\r
270 unsigned int Length; //!< Query length
\r
271 std::string QueryBases; //!< 'Original' sequence (as reported from sequencing machine)
\r
272 std::string AlignedBases; //!< 'Aligned' sequence (includes any indels, padding, clipping)
\r
273 std::string Qualities; //!< FASTQ qualities (ASCII characters, not numeric values)
\r
274 std::string TagData; //!< Tag data (accessor methods will pull the requested information out)
\r
275 unsigned int RefID; //!< ID number for reference sequence
\r
276 unsigned int Position; //!< Position (0-based) where alignment starts
\r
277 unsigned int Bin; //!< Bin in BAM file where this alignment resides
\r
278 unsigned int MapQuality; //!< Mapping quality score
\r
279 unsigned int AlignmentFlag; //!< Alignment bit-flag - see Is<something>() methods for available queries
\r
280 std::vector<CigarOp> CigarData; //!< CIGAR operations for this alignment
\r
281 unsigned int MateRefID; //!< ID number for reference sequence where alignment's mate was aligned
\r
282 unsigned int MatePosition; //!< Position (0-based) where alignment's mate starts
\r
283 unsigned int InsertSize; //!< Mate-pair insert size
\r
285 // Alignment flag query constants
\r
301 // ----------------------------------------------------------------
\r
306 \brief Vector of RefData objects
\r
308 typedef std::vector<RefData> RefVector;
\r
311 \typedef BamAlignmentVector
\r
312 \brief Vector of BamAlignments
\r
314 typedef std::vector< BamAlignment > BamAlignmentVector;
\r
317 // ----------------------------------------------------------------
\r
318 // Typedefs (internal - can exclude from main documentation)
\r
320 //Offsets for linear indexing
\r
321 typedef std::vector<uint64_t> LinearOffsetVector;
\r
323 // Alignment 'chunk' boundaries
\r
324 typedef std::pair<uint64_t, uint64_t> ChunkPair;
\r
325 // Vector of alignment 'chunks'
\r
326 typedef std::vector<ChunkPair> ChunkVector;
\r
328 // BAM bin contains a bin ID & a vector of alignment 'chunks'
\r
329 typedef std::pair<uint32_t, ChunkVector*> BamBin;
\r
330 // Vector of BAM bins
\r
331 typedef std::vector<BamBin> BinVector;
\r
333 // Reference sequence index data
\r
334 typedef std::pair<BinVector*, LinearOffsetVector*> RefIndex;
\r
336 // Full BAM file index data structure
\r
337 typedef std::vector<RefIndex*> BamIndex;
\r
338 // ----------------------------------------------------------------
\r