]> git.donarmstrong.com Git - bamtools.git/blob - BamAlignment.h
c69229dd6d9ec292b3ae433358fef12c887534eb
[bamtools.git] / BamAlignment.h
1 // BamAlignment.h
2
3 // Derek Barnett
4 // Marth Lab, Boston College
5 // Last modified: 20 March 2009
6
7 #ifndef BAMALIGNMENT_H
8 #define BAMALIGNMENT_H
9
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
12
// Fixed-width integer types.
// When WIN32 is defined the types are declared by hand (presumably because the
// targeted Windows compilers shipped without <stdint.h>); otherwise the
// standard header supplies them.
#ifdef WIN32
typedef signed char          int8_t;   // explicitly signed: the signedness of plain 'char' is implementation-defined
typedef unsigned char       uint8_t;
typedef short               int16_t;
typedef unsigned short     uint16_t;
typedef int                 int32_t;
typedef unsigned int       uint32_t;
typedef long long           int64_t;
typedef unsigned long long uint64_t;
#else
#include <stdint.h>
#endif
25
26 // C++ includes
27 #include <string>
28 using std::string;
29
30 #include <vector>
31 using std::vector;
32
// One CIGAR operation: an operation type paired with its length.
// Plain aggregate with no constructor, so members are uninitialized unless
// the object is brace- or value-initialized.
struct CigarOp {
        uint32_t Length;   // length of the operation
        char     Type;     // operation type, stored as a single character
};
37
// Bookkeeping record for a single reference sequence.
struct RefData {
        std::string  RefName;            // reference sequence name (empty by default)
        unsigned int RefLength;          // reference sequence length (0 by default)
        bool         RefHasAlignments;   // starts out false; presumably set once alignments are seen for this reference

        // default constructor: zero length, no alignments, empty name
        RefData() : RefLength(0), RefHasAlignments(false) { }
};

// list of reference sequence records
typedef std::vector<RefData> RefVector;
51
52 struct BamAlignment {
53
54         // queries against alignment flag - see below for further detail
55 public:
56         bool IsPaired(void) const            { return ( (AlignmentFlag & PAIRED)        != 0 ); }
57         bool IsProperPair(void) const        { return ( (AlignmentFlag & PROPER_PAIR)   != 0 ); }
58         bool IsMapped(void) const            { return ( (AlignmentFlag & UNMAPPED)      == 0 ); }
59         bool IsMateMapped(void) const        { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
60         bool IsReverseStrand(void) const     { return ( (AlignmentFlag & REVERSE)       != 0 ); }
61         bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE)  != 0 ); }
62         bool IsFirstMate(void) const         { return ( (AlignmentFlag & READ_1)        != 0 ); }
63         bool IsSecondMate(void) const        { return ( (AlignmentFlag & READ_2)        != 0 ); }
64         bool IsPrimaryAlignment(void) const  { return ( (AlignmentFlag & SECONDARY)     == 0 ); }
65         bool IsFailedQC(void) const          { return ( (AlignmentFlag & QC_FAILED)     != 0 ); }
66         bool IsDuplicate(void) const         { return ( (AlignmentFlag & DUPLICATE)     != 0 ); }
67
68         // returns true and assigns the read group if present in the tag data
69         bool GetReadGroup(string& readGroup) const {
70
71                 if(TagData.empty()) return false;
72
73                 // localize the tag data
74                 char* pTagData = (char*)TagData.data();
75                 const unsigned int tagDataLen = TagData.size();
76                 unsigned int numBytesParsed = 0;
77
78                 bool foundReadGroupTag = false;
79                 while(numBytesParsed < tagDataLen) {
80
81                         const char* pTagType = pTagData;
82                         const char* pTagStorageType = pTagData + 2;
83                         pTagData       += 3;
84                         numBytesParsed += 3;
85
86                         // check the current tag
87                         if(strncmp(pTagType, "RG", 2) == 0) {
88                                 foundReadGroupTag = true;
89                                 break;
90                         }
91
92                         // get the storage class and find the next tag
93                         SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed);
94                 }
95
96                 // return if the read group tag was not present
97                 if(!foundReadGroupTag) return false;
98
99                 // assign the read group
100                 const unsigned int readGroupLen = strlen(pTagData);
101                 readGroup.resize(readGroupLen);
102                 memcpy((char*)readGroup.data(), pTagData, readGroupLen);
103                 return true;
104         }
105
106         // skips to the next tag
107         static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
108                 switch(storageType) {
109                         case 'A':
110                         case 'c':
111                         case 'C':
112                                 numBytesParsed++;
113                                 pTagData++;
114                                 break;
115                         case 's':
116                         case 'S':
117                         case 'f':
118                                 numBytesParsed += 2;
119                                 pTagData       += 2;
120                                 break;
121                         case 'i':
122                         case 'I':
123                                 numBytesParsed += 4;
124                                 pTagData       += 4;
125                                 break;
126                         case 'Z':
127                         case 'H':
128                                 while(*pTagData) {
129                                         numBytesParsed++;
130                                         pTagData++;
131                                 }
132                                 break;
133                         default:
134                                 printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData);
135                                 exit(1);
136                 }
137         }
138
139         // data members
140 public:
141         string       Name;           // read name
142         unsigned int Length;         // query length
143         string       QueryBases;     // original sequence ( produced from machine )
144         string       AlignedBases;   // aligned sequence ( with indels ) 
145         string       Qualities;      // FASTQ qualities ( still in ASCII characters )
146         string       TagData;        // contains the tag data (accessor methods will pull the requested information out)
147         unsigned int RefID;          // ID for reference sequence
148         unsigned int Position;       // position on reference sequence where alignment starts
149         unsigned int Bin;            // bin in BAM file where this alignment resides
150         unsigned int MapQuality;     // mapping quality 
151         unsigned int AlignmentFlag;  // see above for available queries
152         vector<CigarOp> CigarData;   // vector of CIGAR operations (length & type) )
153         unsigned int MateRefID;      // ID for reference sequence that mate was aligned to
154         unsigned int MatePosition;   // position that mate was aligned to
155         unsigned int InsertSize;     // mate pair insert size
156
157         // alignment flag query constants
158 private:
159         enum { PAIRED        = 1,               // Alignment comes from paired-end data
160                 PROPER_PAIR   = 2,              // Alignment passed paired-end resolution
161                 UNMAPPED      = 4,              // Read is unmapped
162                 MATE_UNMAPPED = 8,              // Mate is unmapped
163                 REVERSE       = 16,             // Read is on reverse strand
164                 MATE_REVERSE  = 32,             // Mate is on reverse strand
165                 READ_1        = 64,             // This alignment is mate 1 of pair
166                 READ_2        = 128,            // This alignment is mate 2 of pair
167                 SECONDARY     = 256,            // This alignment is not the primary (best) alignment for read
168                 QC_FAILED     = 512,            // Read did not pass prior quality control steps
169                 DUPLICATE     = 1024            // Read is PCR duplicate
170         };
171 };
172
173 // commonly used vector in this library
174 typedef vector< BamAlignment > BamAlignmentVector;
175
176 #endif /* BAMALIGNMENT_H */