1 // ***************************************************************************
2 // BamAlignment.h (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 4 December 2012 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
10 #ifndef BAMALIGNMENT_H
11 #define BAMALIGNMENT_H
13 #include "api/api_global.h"
14 #include "api/BamAux.h"
15 #include "api/BamConstants.h"
24 // forward declaration of BamAlignment's "friends"
26 class BamReaderPrivate;
27 class BamWriterPrivate;
28 } // namespace Internal
31 // BamAlignment data structure
32 struct API_EXPORT BamAlignment {
34 // constructors & destructor
37 BamAlignment(const BamAlignment& other);
40 // queries against alignment flags
42 bool IsDuplicate(void) const; // returns true if this read is a PCR duplicate
43 bool IsFailedQC(void) const; // returns true if this read failed quality control
44 bool IsFirstMate(void) const; // returns true if alignment is first mate on read
45 bool IsMapped(void) const; // returns true if alignment is mapped
46 bool IsMateMapped(void) const; // returns true if alignment's mate is mapped
47 bool IsMateReverseStrand(void) const; // returns true if alignment's mate mapped to reverse strand
48 bool IsPaired(void) const; // returns true if alignment part of paired-end read
49 bool IsPrimaryAlignment(void) const; // returns true if reported position is primary alignment
50 bool IsProperPair(void) const; // returns true if alignment is part of read that satisfied paired-end resolution
51 bool IsReverseStrand(void) const; // returns true if alignment mapped to reverse strand
52 bool IsSecondMate(void) const; // returns true if alignment is second mate on read
54 // manipulate alignment flags
56 void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag
57 void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag
58 void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag
59 void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag
60 void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag
61 void SetIsMateReverseStrand(bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
62 void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag
63 void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag
64 void SetIsProperPair(bool ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag
65 void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag
66 void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag
68 // tag data access methods
72 template<typename T> bool AddTag(const std::string& tag, const std::string& type, const T& value);
73 template<typename T> bool AddTag(const std::string& tag, const std::vector<T>& values);
75 // edit (or append) tag
76 template<typename T> bool EditTag(const std::string& tag, const std::string& type, const T& value);
77 template<typename T> bool EditTag(const std::string& tag, const std::vector<T>& values);
80 template<typename T> bool GetTag(const std::string& tag, T& destination) const;
81 template<typename T> bool GetTag(const std::string& tag, std::vector<T>& destination) const;
83 // retrieves all current tag names
84 std::vector<std::string> GetTagNames(void) const;
86 // retrieves the SAM/BAM type-code for requested tag name
87 bool GetTagType(const std::string& tag, char& type) const;
89 // retrieves the SAM/BAM type-code for the data elements in an array tag
90 bool GetArrayTagType(const std::string& tag, char& type) const;
92 // returns true if alignment has a record for this tag name
93 bool HasTag(const std::string& tag) const;
96 void RemoveTag(const std::string& tag);
100 // populates alignment string fields
101 bool BuildCharData(void);
103 // calculates alignment end position
104 int GetEndPosition(bool usePadded = false, bool closedInterval = false) const;
106 // returns a description of the last error that occurred
107 std::string GetErrorString(void) const;
109 // retrieves the size, read locations and reference locations of soft-clip operations
110 bool GetSoftClips(std::vector<int>& clipSizes,
111 std::vector<int>& readPositions,
112 std::vector<int>& genomePositions,
113 bool usePadded = false) const;
115 // public data fields
117 std::string Name; // read name
118 int32_t Length; // length of query sequence
119 std::string QueryBases; // 'original' sequence (as reported from sequencing machine)
120 std::string AlignedBases; // 'aligned' sequence (includes any indels, padding, clipping)
121 std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
122 std::string TagData; // tag data (use provided methods to query/modify)
123 int32_t RefID; // ID number for reference sequence
124 int32_t Position; // position (0-based) where alignment starts
125 uint16_t Bin; // BAM (standard) index bin number for this alignment
126 uint16_t MapQuality; // mapping quality score
127 uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
128 std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
129 int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
130 int32_t MatePosition; // position (0-based) where alignment's mate starts
131 int32_t InsertSize; // mate-pair insert size
132 std::string Filename; // name of BAM file which this alignment comes from
135 // internal utility methods
137 bool FindTag(const std::string& tag,
139 const unsigned int& tagDataLength,
140 unsigned int& numBytesParsed) const;
141 bool IsValidSize(const std::string& tag, const std::string& type) const;
142 void SetErrorString(const std::string& where, const std::string& what) const;
143 bool SkipToNextTag(const char storageType,
145 unsigned int& numBytesParsed) const;
150 struct BamAlignmentSupportData {
153 std::string AllCharData;
154 uint32_t BlockLength;
155 uint32_t NumCigarOperations;
156 uint32_t QueryNameLength;
157 uint32_t QuerySequenceLength;
161 BamAlignmentSupportData(void)
163 , NumCigarOperations(0)
165 , QuerySequenceLength(0)
169 BamAlignmentSupportData SupportData;
170 friend class Internal::BamReaderPrivate;
171 friend class Internal::BamWriterPrivate;
173 mutable std::string ErrorString; // mutable to allow updates even in logically const methods
177 // ---------------------------------------------------------
178 // BamAlignment tag access methods
180 /*! \fn bool AddTag(const std::string& tag, const std::string& type, const T& value)
181 \brief Adds a field to the BAM tags.
183 Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
185 \param[in] tag 2-character tag name
186 \param[in] type 1-character tag type
187 \param[in] value data to store
188 \return \c true if the \b new tag was added successfully
189 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
192 inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const T& value) {
194 // if char data not populated, do that first
195 if ( SupportData.HasCoreOnly )
198 // check tag/type size
199 if ( !IsValidSize(tag, type) ) {
200 // TODO: set error string?
204 // check that storage type code is OK for T
205 if ( !TagTypeHelper<T>::CanConvertTo(type.at(0)) ) {
206 // TODO: set error string?
210 // localize the tag data
211 char* pTagData = (char*)TagData.data();
212 const unsigned int tagDataLength = TagData.size();
213 unsigned int numBytesParsed = 0;
215 // if tag already exists, return false
216 // use EditTag explicitly instead
217 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
218 // TODO: set error string?
222 // otherwise, convert value to string
223 union { T value; char valueBuffer[sizeof(T)]; } un;
226 // copy original tag data to temp buffer
227 const std::string newTag = tag + type;
228 const size_t newTagDataLength = tagDataLength + newTag.size() + sizeof(T); // leave room for new T
229 RaiiBuffer originalTagData(newTagDataLength);
230 memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
233 strcat(originalTagData.Buffer + tagDataLength, newTag.data());
234 memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T));
236 // store temp buffer back in TagData
237 const char* newTagData = (const char*)originalTagData.Buffer;
238 TagData.assign(newTagData, newTagDataLength);
243 inline bool BamAlignment::AddTag<std::string>(const std::string& tag,
244 const std::string& type,
245 const std::string& value)
247 // if char data not populated, do that first
248 if ( SupportData.HasCoreOnly )
251 // check tag/type size
252 if ( !IsValidSize(tag, type) ) {
253 // TODO: set error string?
257 // check that storage type code is OK for string
258 if ( !TagTypeHelper<std::string>::CanConvertTo(type.at(0)) ) {
259 // TODO: set error string?
263 // localize the tag data
264 char* pTagData = (char*)TagData.data();
265 const unsigned int tagDataLength = TagData.size();
266 unsigned int numBytesParsed = 0;
268 // if tag already exists, return false
269 // use EditTag explicitly instead
270 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
271 // TODO: set error string?
275 // otherwise, copy tag data to temp buffer
276 const std::string newTag = tag + type + value;
277 const size_t newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
278 RaiiBuffer originalTagData(newTagDataLength);
279 memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
281 // append newTag (removes original null-term, then appends newTag + null-term)
282 strcat(originalTagData.Buffer + tagDataLength, newTag.data());
284 // store temp buffer back in TagData
285 const char* newTagData = (const char*)originalTagData.Buffer;
286 TagData.assign(newTagData, newTagDataLength);
290 /*! \fn template<typename T> bool AddTag(const std::string& tag, const std::vector<T>& values)
291 \brief Adds a numeric array field to the BAM tags.
293 Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
295 \param[in] tag 2-character tag name
296 \param[in] values vector of data values to store
297 \return \c true if the \b new tag was added successfully
298 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
301 inline bool BamAlignment::AddTag(const std::string& tag, const std::vector<T>& values) {
303 // if char data not populated, do that first
304 if ( SupportData.HasCoreOnly )
307 // check for valid tag name length
308 if ( tag.size() != Constants::BAM_TAG_TAGSIZE )
311 // localize the tag data
312 char* pTagData = (char*)TagData.data();
313 const unsigned int tagDataLength = TagData.size();
314 unsigned int numBytesParsed = 0;
316 // if tag already exists, return false
317 // use EditTag explicitly instead
318 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
319 // TODO: set error string?
323 // build new tag's base information
324 char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
325 memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
326 newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
327 newTagBase[3] = TagTypeHelper<T>::TypeCode();
329 // add number of array elements to newTagBase
330 const int32_t numElements = values.size();
331 memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
333 // copy current TagData string to temp buffer, leaving room for new tag's contents
334 const size_t newTagDataLength = tagDataLength +
335 Constants::BAM_TAG_ARRAYBASE_SIZE +
336 numElements*sizeof(T);
337 RaiiBuffer originalTagData(newTagDataLength);
338 memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
340 // write newTagBase (removes old null term)
341 strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase);
343 // add vector elements to tag
344 int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
345 for ( int i = 0 ; i < numElements; ++i ) {
346 const T& value = values.at(i);
347 memcpy(originalTagData.Buffer + elementsBeginOffset + i*sizeof(T), &value, sizeof(T));
350 // store temp buffer back in TagData
351 const char* newTagData = (const char*)originalTagData.Buffer;
352 TagData.assign(newTagData, newTagDataLength);
356 /*! \fn template<typename T> bool EditTag(const std::string& tag, const std::string& type, const T& value)
357 \brief Edits a BAM tag field.
359 If \a tag does not exist, a new entry is created.
361 \param tag[in] 2-character tag name
362 \param type[in] 1-character tag type (must be "Z" or "H")
363 \param value[in] new data value
365 \return \c true if the tag was modified/created successfully
367 \sa BamAlignment::RemoveTag()
368 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
371 inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const T& value) {
373 // if char data not populated, do that first
374 if ( SupportData.HasCoreOnly )
377 // remove existing tag if present, then append tag with new value
380 return AddTag(tag, type, value);
383 /*! \fn template<typename T> bool EditTag(const std::string& tag, const std::vector<T>& values)
384 \brief Edits a BAM tag field containing a numeric array.
386 If \a tag does not exist, a new entry is created.
388 \param tag[in] 2-character tag name
389 \param value[in] vector of data values
391 \return \c true if the tag was modified/created successfully
392 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
395 inline bool BamAlignment::EditTag(const std::string& tag, const std::vector<T>& values) {
397 // if char data not populated, do that first
398 if ( SupportData.HasCoreOnly )
401 // remove existing tag if present, then append tag with new values
404 return AddTag(tag, values);
408 /*! \fn template<typename T> bool GetTag(const std::string& tag, T& destination) const
409 \brief Retrieves the value associated with a BAM tag.
411 \param tag[in] 2-character tag name
412 \param destination[out] retrieved value
413 \return \c true if found
416 inline bool BamAlignment::GetTag(const std::string& tag, T& destination) const {
418 // skip if alignment is core-only
419 if ( SupportData.HasCoreOnly ) {
420 // TODO: set error string?
424 // skip if no tags present
425 if ( TagData.empty() ) {
426 // TODO: set error string?
430 // localize the tag data
431 char* pTagData = (char*)TagData.data();
432 const unsigned int tagDataLength = TagData.size();
433 unsigned int numBytesParsed = 0;
435 // return failure if tag not found
436 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
437 // TODO: set error string?
442 const char type = *(pTagData - 1);
443 if ( !TagTypeHelper<T>::CanConvertFrom(type) ) {
444 // TODO: set error string ?
448 // determine data length
449 int destinationLength = 0;
453 case (Constants::BAM_TAG_TYPE_ASCII) :
454 case (Constants::BAM_TAG_TYPE_INT8) :
455 case (Constants::BAM_TAG_TYPE_UINT8) :
456 destinationLength = 1;
460 case (Constants::BAM_TAG_TYPE_INT16) :
461 case (Constants::BAM_TAG_TYPE_UINT16) :
462 destinationLength = 2;
466 case (Constants::BAM_TAG_TYPE_INT32) :
467 case (Constants::BAM_TAG_TYPE_UINT32) :
468 case (Constants::BAM_TAG_TYPE_FLOAT) :
469 destinationLength = 4;
472 // var-length types not supported for numeric destination
473 case (Constants::BAM_TAG_TYPE_STRING) :
474 case (Constants::BAM_TAG_TYPE_HEX) :
475 case (Constants::BAM_TAG_TYPE_ARRAY) :
476 SetErrorString("BamAlignment::GetTag",
477 "cannot store variable length tag data into a numeric destination");
480 // unrecognized tag type
482 const std::string message = std::string("invalid tag type: ") + type;
483 SetErrorString("BamAlignment::GetTag", message);
487 // store data in destination
489 memcpy(&destination, pTagData, destinationLength);
496 inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
497 std::string& destination) const
499 // skip if alignment is core-only
500 if ( SupportData.HasCoreOnly ) {
501 // TODO: set error string?
505 // skip if no tags present
506 if ( TagData.empty() ) {
507 // TODO: set error string?
511 // localize the tag data
512 char* pTagData = (char*)TagData.data();
513 const unsigned int tagDataLength = TagData.size();
514 unsigned int numBytesParsed = 0;
516 // return failure if tag not found
517 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
518 // TODO: set error string?
522 // otherwise copy data into destination
523 const unsigned int dataLength = strlen(pTagData);
525 destination.resize(dataLength);
526 memcpy( (char*)destination.data(), pTagData, dataLength );
532 /*! \fn template<typename T> bool GetTag(const std::string& tag, std::vector<T>& destination) const
533 \brief Retrieves the numeric array associated with a BAM tag.
535 \param tag[in] 2-character tag name
536 \param destination[out] retrieved values
537 \return \c true if found
540 inline bool BamAlignment::GetTag(const std::string& tag, std::vector<T>& destination) const {
542 // skip if alignment is core-only
543 if ( SupportData.HasCoreOnly ) {
544 // TODO: set error string?
548 // skip if no tags present
549 if ( TagData.empty() ) {
550 // TODO: set error string?
554 // localize the tag data
555 char* pTagData = (char*)TagData.data();
556 const unsigned int tagDataLength = TagData.size();
557 unsigned int numBytesParsed = 0;
559 // return false if tag not found
560 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
561 // TODO: set error string?
565 // check that tag is array type
566 const char tagType = *(pTagData - 1);
567 if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) {
568 SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination");
572 // fetch element type
573 const char elementType = *pTagData;
574 if ( !TagTypeHelper<T>::CanConvertFrom(elementType) ) {
575 // TODO: set error string ?
580 // calculate length of each element in tag's array
581 int elementLength = 0;
582 switch ( elementType ) {
583 case (Constants::BAM_TAG_TYPE_ASCII) :
584 case (Constants::BAM_TAG_TYPE_INT8) :
585 case (Constants::BAM_TAG_TYPE_UINT8) :
586 elementLength = sizeof(uint8_t);
589 case (Constants::BAM_TAG_TYPE_INT16) :
590 case (Constants::BAM_TAG_TYPE_UINT16) :
591 elementLength = sizeof(uint16_t);
594 case (Constants::BAM_TAG_TYPE_INT32) :
595 case (Constants::BAM_TAG_TYPE_UINT32) :
596 case (Constants::BAM_TAG_TYPE_FLOAT) :
597 elementLength = sizeof(uint32_t);
600 // var-length types not supported for numeric destination
601 case (Constants::BAM_TAG_TYPE_STRING) :
602 case (Constants::BAM_TAG_TYPE_HEX) :
603 case (Constants::BAM_TAG_TYPE_ARRAY) :
604 SetErrorString("BamAlignment::GetTag",
605 "invalid array data, variable-length elements are not allowed");
610 const std::string message = std::string("invalid array element type: ") + elementType;
611 SetErrorString("BamAlignment::GetTag", message);
615 // get number of elements
617 memcpy(&numElements, pTagData, sizeof(int32_t));
620 destination.reserve(numElements);
624 for ( int i = 0 ; i < numElements; ++i ) {
625 memcpy(&value, pTagData, sizeof(T));
626 pTagData += sizeof(T);
627 destination.push_back(value);
634 typedef std::vector<BamAlignment> BamAlignmentVector;
636 } // namespace BamTools
638 #endif // BAMALIGNMENT_H