]> git.donarmstrong.com Git - bamtools.git/blob - BamAux.h
Changed BAM_CORE_SIZE to unsigned int to remove signed comparison warning in BamReade...
[bamtools.git] / BamAux.h
1 // ***************************************************************************\r
2 // BamAux.h (c) 2009 Derek Barnett, Michael Strömberg\r
3 // Marth Lab, Department of Biology, Boston College\r
4 // All rights reserved.\r
5 // ---------------------------------------------------------------------------\r
6 // Last modified: 24 June 2009 (DB)\r
7 // ---------------------------------------------------------------------------\r
8 // The BGZF routines were adapted from the bgzf.c code developed at the Broad\r
9 // Institute.\r
10 // ---------------------------------------------------------------------------\r
11 // Defines common constants, typedefs, & data structures for BamTools.\r
12 // ***************************************************************************\r
13 \r
14 /*! \r
15         \file BamAux.h\r
16         \brief BamTools constants, typedefs, & data structures\r
17 */\r
18 \r
19 #pragma once\r
20 \r
21 // C++ includes\r
22 #include <exception>\r
23 #include <string>\r
24 #include <utility>\r
25 #include <vector>\r
26 \r
27 // C includes\r
28 #include <cstdio>\r
29 #include <cstdlib>\r
30 #include <cstring>\r
31 \r
32 // Platform-specific type definitions\r
33 #ifdef WIN32\r
34         typedef char                 int8_t;\r
35         typedef unsigned char       uint8_t;\r
36         typedef short               int16_t;\r
37         typedef unsigned short     uint16_t;\r
38         typedef int                 int32_t;\r
39         typedef unsigned int       uint32_t;\r
40         typedef long long           int64_t;\r
41         typedef unsigned long long uint64_t;\r
42 #else\r
43         #include <stdint.h>\r
44 #endif\r
45 \r
46 //! \namespace BamTools\r
47 namespace BamTools {\r
48 \r
49         //! \cond\r
50         // --------------------------------------------------------------------------------------\r
51         // This section is purely internal and can be excluded from main generated documentation.\r
52         \r
53         // zlib constants\r
54         const int GZIP_ID1   = 31;\r
55         const int GZIP_ID2   = 139;\r
56         const int CM_DEFLATE = 8;\r
57         const int FLG_FEXTRA = 4;\r
58         const int OS_UNKNOWN = 255;\r
59         const int BGZF_XLEN  = 6;\r
60         const int BGZF_ID1   = 66;\r
61         const int BGZF_ID2   = 67;\r
62         const int BGZF_LEN   = 2;\r
63         const int GZIP_WINDOW_BITS = -15;\r
64         const int Z_DEFAULT_MEM_LEVEL = 8;\r
65 \r
66         // BZGF constants\r
67         const int BLOCK_HEADER_LENGTH = 18;\r
68         const int BLOCK_FOOTER_LENGTH = 8;\r
69         const int MAX_BLOCK_SIZE      = 65536;\r
70         const int DEFAULT_BLOCK_SIZE  = 65536;\r
71 \r
72         // BAM constants\r
73         const unsigned int BAM_CORE_SIZE = 32;\r
74         const int BAM_CMATCH      = 0;\r
75         const int BAM_CINS        = 1;\r
76         const int BAM_CDEL        = 2;\r
77         const int BAM_CREF_SKIP   = 3;\r
78         const int BAM_CSOFT_CLIP  = 4;\r
79         const int BAM_CHARD_CLIP  = 5;\r
80         const int BAM_CPAD        = 6;\r
81         const int BAM_CIGAR_SHIFT = 4;\r
82         const int BAM_CIGAR_MASK  = ((1 << BAM_CIGAR_SHIFT) - 1);\r
83 \r
84         // BAM index constants\r
85         const int MAX_BIN           = 37450;    // =(8^6-1)/7+1\r
86         const int BAM_MIN_CHUNK_GAP = 32768;\r
87         const int BAM_LIDX_SHIFT    = 14;\r
88 \r
89         // Explicit variable sizes\r
90         const int SIZEOF_INT = 4;\r
91         \r
92         struct BgzfData {\r
93                 unsigned int UncompressedBlockSize;\r
94                 unsigned int CompressedBlockSize;\r
95                 unsigned int BlockLength;\r
96                 unsigned int BlockOffset;\r
97                 uint64_t BlockAddress;\r
98                 bool     IsOpen;\r
99                 FILE*    Stream;\r
100                 char*    UncompressedBlock;\r
101                 char*    CompressedBlock;\r
102                 \r
103                 // constructor\r
104                 BgzfData(void)\r
105                         : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)\r
106                         , CompressedBlockSize(MAX_BLOCK_SIZE)\r
107                         , BlockLength(0)\r
108                         , BlockOffset(0)\r
109                         , BlockAddress(0)\r
110                         , IsOpen(false)\r
111                         , Stream(NULL)\r
112                         , UncompressedBlock(NULL)\r
113                         , CompressedBlock(NULL)\r
114                 {\r
115                         try {\r
116                                 CompressedBlock   = new char[CompressedBlockSize];\r
117                                 UncompressedBlock = new char[UncompressedBlockSize];\r
118                         } catch( std::bad_alloc& ba ) {\r
119                                 printf("ERROR: Unable to allocate memory for our BGZF object.\n");\r
120                                 exit(1);\r
121                         }\r
122                 }\r
123                 \r
124                 // destructor\r
125                 ~BgzfData(void) {\r
126                         if(CompressedBlock)   delete [] CompressedBlock;\r
127                         if(UncompressedBlock) delete [] UncompressedBlock;\r
128                 }\r
129         };\r
130         //! \endcond\r
131         \r
132         // --------------------------------------------------------------------------------------\r
133         // Data structures\r
134         \r
135         //! \brief Cigar operation data structure\r
136         struct CigarOp {\r
137                 char     Type;   //!< Operation type (MIDNSHP)\r
138                 uint32_t Length; //!< Operation length (number of bases)\r
139                    \r
140         };\r
141 \r
142         //! Reference sequence data structure\r
143         struct RefData {\r
144                 std::string  RefName;          //!< Name of reference sequence\r
145                 unsigned int RefLength;        //!< Length of reference sequence\r
146                 bool         RefHasAlignments; //!< True if BAM file contains alignments mapped to reference sequence\r
147                 \r
148                 // constructor\r
149                 RefData(void)\r
150                         : RefLength(0)\r
151                         , RefHasAlignments(false)\r
152                 { }\r
153         };\r
154 \r
155         //! BAM alignment data structure\r
156         struct BamAlignment {\r
157                 \r
158                 // Queries against alignment flag\r
159                 public:\r
160                         //! Returns true if this read is a PCR duplicate (determined by external app)\r
161                         bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }\r
162                         //! Returns true if this read failed quality control (determined by external app)\r
163                         bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }    \r
164                         //! Returns true if alignment is first mate on read\r
165                         bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }\r
166                         //! Returns true if alignment is mapped                 \r
167                         bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }\r
168                         //! Returns true if alignment's mate is mapped\r
169                         bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }      \r
170                         //! Returns true if alignment's mate mapped to reverse strand\r
171                         bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE)  != 0 ); }\r
172                         //! Returns true if alignment part of paired-end read\r
173                         bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } \r
174                         //! Returns true if this position is primary alignment (determined by external app)\r
175                         bool IsPrimaryAlignment(void) const  { return ( (AlignmentFlag & SECONDARY) == 0 ); }   \r
176                         //! Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app)\r
177                         bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }        \r
178                         //! Returns true if alignment mapped to reverse strand\r
179                         bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } \r
180                         //! Returns true if alignment is second mate on read\r
181                         bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }     \r
182                 \r
183                 public:\r
184                         /*! \r
185                                 \brief Get alignment's read group text.\r
186                                 \r
187                                 Assigns read group text to readGroup.\r
188                                 \r
189                                 \return True if read group data successfully retrieved.\r
190                         */\r
191                         bool GetReadGroup(std::string& readGroup) const {\r
192                                 \r
193                                 if ( TagData.empty() ) { return false; }\r
194                                 \r
195                                 // localize the tag data\r
196                                 char* pTagData = (char*)TagData.data();\r
197                                 const unsigned int tagDataLen = TagData.size();\r
198                                 unsigned int numBytesParsed = 0;\r
199                                 \r
200                                 bool foundReadGroupTag = false;\r
201                                 while( numBytesParsed < tagDataLen ) {\r
202                                         \r
203                                         const char* pTagType = pTagData;\r
204                                         const char* pTagStorageType = pTagData + 2;\r
205                                         pTagData       += 3;\r
206                                         numBytesParsed += 3;\r
207                                         \r
208                                         // check the current tag\r
209                                         if ( strncmp(pTagType, "RG", 2) == 0 ) {\r
210                                                 foundReadGroupTag = true;\r
211                                                 break;\r
212                                         }\r
213                                         \r
214                                         // get the storage class and find the next tag\r
215                                         SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );\r
216                                 }\r
217                                 \r
218                                 // return if the read group tag was not present\r
219                                 if ( !foundReadGroupTag ) { return false; }\r
220                                 \r
221                                 // assign the read group\r
222                                 const unsigned int readGroupLen = strlen(pTagData);\r
223                                 readGroup.resize(readGroupLen);\r
224                                 memcpy( (char*)readGroup.data(), pTagData, readGroupLen );\r
225                                 return true;\r
226                         }\r
227                 \r
228                 private:\r
229                         // skips to the next tag\r
230                         static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {\r
231                                 switch(storageType) {\r
232                                         \r
233                                         case 'A':\r
234                                         case 'c':\r
235                                         case 'C':\r
236                                                 ++numBytesParsed;\r
237                                                 ++pTagData;\r
238                                                 break;\r
239                                                 \r
240                                         case 's':\r
241                                         case 'S':\r
242                                         case 'f':\r
243                                                 numBytesParsed += 2;\r
244                                                 pTagData       += 2;\r
245                                                 break;\r
246                                                 \r
247                                         case 'i':\r
248                                         case 'I':\r
249                                                 numBytesParsed += 4;\r
250                                                 pTagData       += 4;\r
251                                                 break;\r
252                                                 \r
253                                         case 'Z':\r
254                                         case 'H':\r
255                                                 while(*pTagData) {\r
256                                                         ++numBytesParsed;\r
257                                                         ++pTagData;\r
258                                                 }\r
259                                                 break;\r
260                                                 \r
261                                         default:\r
262                                                 printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData);\r
263                                                 exit(1);\r
264                                 }\r
265                         }\r
266                 \r
267                 // Data members\r
268                 public:\r
269                         std::string  Name;              //!< Read name\r
270                         unsigned int Length;            //!< Query length\r
271                         std::string  QueryBases;        //!< 'Original' sequence (as reported from sequencing machine)\r
272                         std::string  AlignedBases;      //!< 'Aligned' sequence (includes any indels, padding, clipping) \r
273                         std::string  Qualities;         //!< FASTQ qualities (ASCII characters, not numeric values)\r
274                         std::string  TagData;           //!< Tag data (accessor methods will pull the requested information out)\r
275                         unsigned int RefID;             //!< ID number for reference sequence\r
276                         unsigned int Position;          //!< Position (0-based) where alignment starts \r
277                         unsigned int Bin;               //!< Bin in BAM file where this alignment resides\r
278                         unsigned int MapQuality;        //!< Mapping quality score \r
279                         unsigned int AlignmentFlag;     //!< Alignment bit-flag - see Is<something>() methods for available queries\r
280                         std::vector<CigarOp> CigarData; //!< CIGAR operations for this alignment\r
281                         unsigned int MateRefID;         //!< ID number for reference sequence where alignment's mate was aligned\r
282                         unsigned int MatePosition;      //!< Position (0-based) where alignment's mate starts\r
283                         unsigned int InsertSize;        //!< Mate-pair insert size\r
284                 \r
285                 // Alignment flag query constants\r
286                 private:\r
287                         enum { PAIRED        = 1,\r
288                                    PROPER_PAIR   = 2,\r
289                                    UNMAPPED      = 4,\r
290                                    MATE_UNMAPPED = 8,\r
291                                    REVERSE       = 16,\r
292                                    MATE_REVERSE  = 32,\r
293                                    READ_1        = 64,\r
294                                    READ_2        = 128,\r
295                                    SECONDARY     = 256,\r
296                                    QC_FAILED     = 512,\r
297                                    DUPLICATE     = 1024\r
298                              };\r
299         };\r
300 \r
301         // ----------------------------------------------------------------\r
302         // Typedefs\r
303         \r
304         /*!\r
305                 \typedef RefVector\r
306                 \brief Vector of RefData objects\r
307         */\r
308         typedef std::vector<RefData> RefVector;\r
309         \r
310         /*! \r
311                 \typedef BamAlignmentVector\r
312                 \brief Vector of BamAlignments\r
313         */\r
314         typedef std::vector< BamAlignment > BamAlignmentVector;\r
315         \r
316         //! \cond\r
317         // ----------------------------------------------------------------\r
318         // Typedefs (internal - can exclude from main documentation)\r
319         \r
320         //Offsets for linear indexing\r
321         typedef std::vector<uint64_t> LinearOffsetVector;\r
322 \r
323         // Alignment 'chunk' boundaries\r
324         typedef std::pair<uint64_t, uint64_t> ChunkPair;\r
325         // Vector of alignment 'chunks'\r
326         typedef std::vector<ChunkPair> ChunkVector;\r
327 \r
328         // BAM bin contains a bin ID & a vector of alignment 'chunks'\r
329         typedef std::pair<uint32_t, ChunkVector*> BamBin;\r
330         // Vector of BAM bins\r
331         typedef std::vector<BamBin> BinVector;\r
332 \r
333         // Reference sequence index data\r
334         typedef std::pair<BinVector*, LinearOffsetVector*> RefIndex;\r
335 \r
336         // Full BAM file index data structure \r
337         typedef std::vector<RefIndex*> BamIndex;\r
338         // ----------------------------------------------------------------\r
339         //! \endcond\r
340 }\r