1 // ***************************************************************************
2 // BamAlignment.h (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 7 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
10 #ifndef BAMALIGNMENT_H
11 #define BAMALIGNMENT_H
13 #include <api/api_global.h>
14 #include <api/BamAux.h>
15 #include <api/BamConstants.h>
23 // forward declaration of BamAlignment's friend classes
25 class BamReaderPrivate;
26 class BamWriterPrivate;
27 } // namespace Internal
29 // BamAlignment data structure
30 struct API_EXPORT BamAlignment {
32 // constructors & destructor
35 BamAlignment(const BamAlignment& other);
38 // queries against alignment flags
40 bool IsDuplicate(void) const; // returns true if this read is a PCR duplicate
41 bool IsFailedQC(void) const; // returns true if this read failed quality control
42 bool IsFirstMate(void) const; // returns true if alignment is first mate on read
43 bool IsMapped(void) const; // returns true if alignment is mapped
44 bool IsMateMapped(void) const; // returns true if alignment's mate is mapped
45 bool IsMateReverseStrand(void) const; // returns true if alignment's mate mapped to reverse strand
46 bool IsPaired(void) const; // returns true if alignment part of paired-end read
47 bool IsPrimaryAlignment(void) const; // returns true if reported position is primary alignment
48 bool IsProperPair(void) const; // returns true if alignment is part of read that satisfied paired-end resolution
49 bool IsReverseStrand(void) const; // returns true if alignment mapped to reverse strand
50 bool IsSecondMate(void) const; // returns true if alignment is second mate on read
52 // manipulate alignment flags
54 void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag
55 void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag
56 void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag
57 void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag
58 void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag
59 void SetIsMateReverseStrand(bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
60 void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag
61 void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag
62 void SetIsProperPair(bool ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag
63 void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag
64 void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag
66 // legacy methods (consider deprecated, but still available)
67 void SetIsMateUnmapped(bool ok); // complement of using SetIsMateMapped()
68 void SetIsSecondaryAlignment(bool ok); // complement of using SetIsPrimaryAlignment()
69 void SetIsUnmapped(bool ok); // complement of using SetIsMapped()
71 // tag data access methods
75 template<typename T> bool AddTag(const std::string& tag, const std::string& type, const T& value);
76 template<typename T> bool AddTag(const std::string& tag, const std::vector<T>& values);
78 // edit (or append) tag
79 template<typename T> bool EditTag(const std::string& tag, const std::string& type, const T& value);
80 template<typename T> bool EditTag(const std::string& tag, const std::vector<T>& values);
83 template<typename T> bool GetTag(const std::string& tag, T& destination) const;
84 template<typename T> bool GetTag(const std::string& tag, std::vector<T>& destination) const;
86 // retrieves the BAM type-code for requested tag
87 // (returns whether or not tag exists, and type-code is valid)
88 bool GetTagType(const std::string& tag, char& type) const;
90 // legacy methods (consider deprecated, but still available)
91 bool GetEditDistance(uint32_t& editDistance) const; // retrieves value of "NM" tag
92 bool GetReadGroup(std::string& readGroup) const; // retrieves value of "RG" tag
94 // returns true if alignment has a record for this tag name
95 bool HasTag(const std::string& tag) const;
98 void RemoveTag(const std::string& tag);
100 // additional methods
102 // populates alignment string fields
103 bool BuildCharData(void);
105 // calculates alignment end position
106 int GetEndPosition(bool usePadded = false, bool zeroBased = true) const;
108 // returns a description of the last error that occurred
109 std::string GetErrorString(void) const;
111 // public data fields
113 std::string Name; // read name
114 int32_t Length; // length of query sequence
115 std::string QueryBases; // 'original' sequence (as reported from sequencing machine)
116 std::string AlignedBases; // 'aligned' sequence (includes any indels, padding, clipping)
117 std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
118 std::string TagData; // tag data (use provided methods to query/modify)
119 int32_t RefID; // ID number for reference sequence
120 int32_t Position; // position (0-based) where alignment starts
121 uint16_t Bin; // BAM (standard) index bin number for this alignment
122 uint16_t MapQuality; // mapping quality score
123 uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
124 std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
125 int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
126 int32_t MatePosition; // position (0-based) where alignment's mate starts
127 int32_t InsertSize; // mate-pair insert size
128 std::string Filename; // name of BAM file which this alignment comes from
131 // internal utility methods
133 bool FindTag(const std::string& tag,
135 const unsigned int& tagDataLength,
136 unsigned int& numBytesParsed) const;
137 bool IsValidSize(const std::string& tag,
138 const std::string& type) const;
139 void SetErrorString(const std::string& where,
140 const std::string& what) const;
141 bool SkipToNextTag(const char storageType,
143 unsigned int& numBytesParsed) const;
148 struct BamAlignmentSupportData {
151 std::string AllCharData;
152 uint32_t BlockLength;
153 uint32_t NumCigarOperations;
154 uint32_t QueryNameLength;
155 uint32_t QuerySequenceLength;
159 BamAlignmentSupportData(void)
161 , NumCigarOperations(0)
163 , QuerySequenceLength(0)
167 BamAlignmentSupportData SupportData;
168 friend class Internal::BamReaderPrivate;
169 friend class Internal::BamWriterPrivate;
171 mutable std::string ErrorString; // mutable to allow updates even in logically const methods
175 // ---------------------------------------------------------
176 // BamAlignment tag access methods
179 inline bool BamAlignment::AddTag(const std::string& tag,
180 const std::string& type,
183 // if char data not populated, do that first
184 if ( SupportData.HasCoreOnly )
187 // check tag/type size
188 if ( !IsValidSize(tag, type) ) {
189 // TODO: set error string?
193 // check that storage type code is OK for T
194 if ( !TagTypeHelper<T>::CanConvertTo(type.at(0)) ) {
195 // TODO: set error string?
199 // localize the tag data
200 char* pTagData = (char*)TagData.data();
201 const unsigned int tagDataLength = TagData.size();
202 unsigned int numBytesParsed = 0;
204 // if tag already exists, return false
205 // use EditTag explicitly instead
206 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
207 // TODO: set error string?
211 // otherwise, convert value to string
212 union { T value; char valueBuffer[sizeof(T)]; } un;
215 // copy original tag data to temp buffer
216 const std::string newTag = tag + type;
217 const size_t newTagDataLength = tagDataLength + newTag.size() + sizeof(T); // leave room for new T
218 RaiiBuffer originalTagData(newTagDataLength);
219 memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
222 strcat(originalTagData.Buffer + tagDataLength, newTag.data());
223 memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T));
225 // store temp buffer back in TagData
226 const char* newTagData = (const char*)originalTagData.Buffer;
227 TagData.assign(newTagData, newTagDataLength);
232 inline bool BamAlignment::AddTag<std::string>(const std::string& tag,
233 const std::string& type,
234 const std::string& value)
236 // if char data not populated, do that first
237 if ( SupportData.HasCoreOnly )
240 // check tag/type size
241 if ( !IsValidSize(tag, type) ) {
242 // TODO: set error string?
246 // check that storage type code is OK for string
247 if ( !TagTypeHelper<std::string>::CanConvertTo(type.at(0)) ) {
248 // TODO: set error string?
252 // localize the tag data
253 char* pTagData = (char*)TagData.data();
254 const unsigned int tagDataLength = TagData.size();
255 unsigned int numBytesParsed = 0;
257 // if tag already exists, return false
258 // use EditTag explicitly instead
259 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
260 // TODO: set error string?
264 // otherwise, copy tag data to temp buffer
265 const std::string newTag = tag + type + value;
266 const size_t newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
267 RaiiBuffer originalTagData(newTagDataLength);
268 memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
270 // append newTag (removes original null-term, then appends newTag + null-term)
271 strcat(originalTagData.Buffer + tagDataLength, newTag.data());
273 // store temp buffer back in TagData
274 const char* newTagData = (const char*)originalTagData.Buffer;
275 TagData.assign(newTagData, newTagDataLength);
280 inline bool BamAlignment::AddTag(const std::string& tag,
281 const std::vector<T>& values)
283 // if char data not populated, do that first
284 if ( SupportData.HasCoreOnly )
287 // check for valid tag name length
288 if ( tag.size() != Constants::BAM_TAG_TAGSIZE )
291 // localize the tag data
292 char* pTagData = (char*)TagData.data();
293 const unsigned int tagDataLength = TagData.size();
294 unsigned int numBytesParsed = 0;
296 // if tag already exists, return false
297 // use EditTag explicitly instead
298 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
299 // TODO: set error string?
303 // build new tag's base information
304 char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
305 memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
306 newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
307 newTagBase[3] = TagTypeHelper<T>::TypeCode();
309 // add number of array elements to newTagBase
310 const int32_t numElements = values.size();
311 memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
313 // copy current TagData string to temp buffer, leaving room for new tag's contents
314 const size_t newTagDataLength = tagDataLength +
315 Constants::BAM_TAG_ARRAYBASE_SIZE +
316 numElements*sizeof(T);
317 RaiiBuffer originalTagData(newTagDataLength);
318 memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
320 // write newTagBase (removes old null term)
321 strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase);
323 // add vector elements to tag
324 int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
325 for ( int i = 0 ; i < numElements; ++i ) {
326 const T& value = values.at(i);
327 memcpy(originalTagData.Buffer + elementsBeginOffset + i*sizeof(T), &value, sizeof(T));
330 // store temp buffer back in TagData
331 const char* newTagData = (const char*)originalTagData.Buffer;
332 TagData.assign(newTagData, newTagDataLength);
337 inline bool BamAlignment::EditTag(const std::string& tag,
338 const std::string& type,
341 // if char data not populated, do that first
342 if ( SupportData.HasCoreOnly )
345 // remove existing tag if present, then append tag with new value
348 return AddTag(tag, type, value);
352 inline bool BamAlignment::EditTag(const std::string& tag,
353 const std::vector<T>& values)
355 // if char data not populated, do that first
356 if ( SupportData.HasCoreOnly )
359 // remove existing tag if present, then append tag with new values
362 return AddTag(tag, values);
366 inline bool BamAlignment::GetTag(const std::string& tag,
367 T& destination) const
369 // skip if alignment is core-only
370 if ( SupportData.HasCoreOnly ) {
371 // TODO: set error string?
375 // skip if no tags present
376 if ( TagData.empty() ) {
377 // TODO: set error string?
381 // localize the tag data
382 char* pTagData = (char*)TagData.data();
383 const unsigned int tagDataLength = TagData.size();
384 unsigned int numBytesParsed = 0;
386 // return failure if tag not found
387 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
388 // TODO: set error string?
393 const char type = *(pTagData - 1);
394 if ( !TagTypeHelper<T>::CanConvertFrom(type) ) {
395 // TODO: set error string ?
399 // determine data length
400 int destinationLength = 0;
404 case (Constants::BAM_TAG_TYPE_ASCII) :
405 case (Constants::BAM_TAG_TYPE_INT8) :
406 case (Constants::BAM_TAG_TYPE_UINT8) :
407 destinationLength = 1;
411 case (Constants::BAM_TAG_TYPE_INT16) :
412 case (Constants::BAM_TAG_TYPE_UINT16) :
413 destinationLength = 2;
417 case (Constants::BAM_TAG_TYPE_INT32) :
418 case (Constants::BAM_TAG_TYPE_UINT32) :
419 case (Constants::BAM_TAG_TYPE_FLOAT) :
420 destinationLength = 4;
423 // var-length types not supported for numeric destination
424 case (Constants::BAM_TAG_TYPE_STRING) :
425 case (Constants::BAM_TAG_TYPE_HEX) :
426 case (Constants::BAM_TAG_TYPE_ARRAY) :
427 SetErrorString("BamAlignment::GetTag",
428 "cannot store variable length tag data into a numeric destination");
431 // unrecognized tag type
433 const std::string message = std::string("invalid tag type: ") + type;
434 SetErrorString("BamAlignment::GetTag", message);
438 // store data in destination
440 memcpy(&destination, pTagData, destinationLength);
447 inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
448 std::string& destination) const
450 // skip if alignment is core-only
451 if ( SupportData.HasCoreOnly ) {
452 // TODO: set error string?
456 // skip if no tags present
457 if ( TagData.empty() ) {
458 // TODO: set error string?
462 // localize the tag data
463 char* pTagData = (char*)TagData.data();
464 const unsigned int tagDataLength = TagData.size();
465 unsigned int numBytesParsed = 0;
467 // return failure if tag not found
468 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
469 // TODO: set error string?
473 // otherwise copy data into destination
474 const unsigned int dataLength = strlen(pTagData);
476 destination.resize(dataLength);
477 memcpy( (char*)destination.data(), pTagData, dataLength );
483 // retrieves "binary-array" tag data
485 inline bool BamAlignment::GetTag(const std::string& tag,
486 std::vector<T>& destination) const
488 // skip if alignment is core-only
489 if ( SupportData.HasCoreOnly ) {
490 // TODO: set error string?
494 // skip if no tags present
495 if ( TagData.empty() ) {
496 // TODO: set error string?
500 // localize the tag data
501 char* pTagData = (char*)TagData.data();
502 const unsigned int tagDataLength = TagData.size();
503 unsigned int numBytesParsed = 0;
505 // return false if tag not found
506 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
507 // TODO: set error string?
511 // check that tag is array type
512 const char tagType = *(pTagData - 1);
513 if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) {
514 SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination");
518 // fetch element type
519 const char elementType = *pTagData;
520 if ( !TagTypeHelper<T>::CanConvertFrom(elementType) ) {
521 // TODO: set error string ?
526 // calculate length of each element in tag's array
527 int elementLength = 0;
528 switch ( elementType ) {
529 case (Constants::BAM_TAG_TYPE_ASCII) :
530 case (Constants::BAM_TAG_TYPE_INT8) :
531 case (Constants::BAM_TAG_TYPE_UINT8) :
532 elementLength = sizeof(uint8_t);
535 case (Constants::BAM_TAG_TYPE_INT16) :
536 case (Constants::BAM_TAG_TYPE_UINT16) :
537 elementLength = sizeof(uint16_t);
540 case (Constants::BAM_TAG_TYPE_INT32) :
541 case (Constants::BAM_TAG_TYPE_UINT32) :
542 case (Constants::BAM_TAG_TYPE_FLOAT) :
543 elementLength = sizeof(uint32_t);
546 // var-length types not supported for numeric destination
547 case (Constants::BAM_TAG_TYPE_STRING) :
548 case (Constants::BAM_TAG_TYPE_HEX) :
549 case (Constants::BAM_TAG_TYPE_ARRAY) :
550 SetErrorString("BamAlignment::GetTag",
551 "invalid array data, variable-length elements are not allowed");
556 const std::string message = std::string("invalid array element type: ") + elementType;
557 SetErrorString("BamAlignment::GetTag", message);
561 // get number of elements
563 memcpy(&numElements, pTagData, sizeof(int32_t));
566 destination.reserve(numElements);
570 for ( int i = 0 ; i < numElements; ++i ) {
571 memcpy(&value, pTagData, sizeof(T));
572 pTagData += sizeof(T);
573 destination.push_back(value);
580 typedef std::vector<BamAlignment> BamAlignmentVector;
582 } // namespace BamTools
584 #endif // BAMALIGNMENT_H