1 // ***************************************************************************
2 // BamAlignment.h (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 18 November 2012 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
10 #ifndef BAMALIGNMENT_H
11 #define BAMALIGNMENT_H
13 #include "api/api_global.h"
14 #include "api/BamAux.h"
15 #include "api/BamConstants.h"
24 // forward declaration of BamAlignment's "friends"
26 class BamReaderPrivate;
27 class BamWriterPrivate;
28 } // namespace Internal
31 // BamAlignment data structure
32 struct API_EXPORT BamAlignment {
34 // constructors & destructor
37 BamAlignment(const BamAlignment& other);
40 // queries against alignment flags
42 bool IsDuplicate(void) const; // returns true if this read is a PCR duplicate
43 bool IsFailedQC(void) const; // returns true if this read failed quality control
44 bool IsFirstMate(void) const; // returns true if alignment is first mate on read
45 bool IsMapped(void) const; // returns true if alignment is mapped
46 bool IsMateMapped(void) const; // returns true if alignment's mate is mapped
47 bool IsMateReverseStrand(void) const; // returns true if alignment's mate mapped to reverse strand
48 bool IsPaired(void) const; // returns true if alignment part of paired-end read
49 bool IsPrimaryAlignment(void) const; // returns true if reported position is primary alignment
50 bool IsProperPair(void) const; // returns true if alignment is part of read that satisfied paired-end resolution
51 bool IsReverseStrand(void) const; // returns true if alignment mapped to reverse strand
52 bool IsSecondMate(void) const; // returns true if alignment is second mate on read
54 // manipulate alignment flags
56 void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag
57 void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag
58 void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag
59 void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag
60 void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag
61 void SetIsMateReverseStrand(bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
62 void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag
63 void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag
64 void SetIsProperPair(bool ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag
65 void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag
66 void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag
68 // tag data access methods
72 template<typename T> bool AddTag(const std::string& tag, const std::string& type, const T& value);
73 template<typename T> bool AddTag(const std::string& tag, const std::vector<T>& values);
75 // edit (or append) tag
76 template<typename T> bool EditTag(const std::string& tag, const std::string& type, const T& value);
77 template<typename T> bool EditTag(const std::string& tag, const std::vector<T>& values);
80 template<typename T> bool GetTag(const std::string& tag, T& destination) const;
81 template<typename T> bool GetTag(const std::string& tag, std::vector<T>& destination) const;
83 // retrieves all current tag names
84 std::vector<std::string> GetTagNames(void) const;
86 // retrieves the SAM/BAM type-code for requested tag name
87 bool GetTagType(const std::string& tag, char& type) const;
89 // returns true if alignment has a record for this tag name
90 bool HasTag(const std::string& tag) const;
93 void RemoveTag(const std::string& tag);
97 // populates alignment string fields
98 bool BuildCharData(void);
100 // calculates alignment end position
101 int GetEndPosition(bool usePadded = false, bool closedInterval = false) const;
103 // returns a description of the last error that occurred
104 std::string GetErrorString(void) const;
106 // retrieves the size, read locations and reference locations of soft-clip operations
107 bool GetSoftClips(std::vector<int>& clipSizes,
108 std::vector<int>& readPositions,
109 std::vector<int>& genomePositions,
110 bool usePadded = false) const;
112 // public data fields
114 std::string Name; // read name
115 int32_t Length; // length of query sequence
116 std::string QueryBases; // 'original' sequence (as reported from sequencing machine)
117 std::string AlignedBases; // 'aligned' sequence (includes any indels, padding, clipping)
118 std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
119 std::string TagData; // tag data (use provided methods to query/modify)
120 int32_t RefID; // ID number for reference sequence
121 int32_t Position; // position (0-based) where alignment starts
122 uint16_t Bin; // BAM (standard) index bin number for this alignment
123 uint16_t MapQuality; // mapping quality score
124 uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
125 std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
126 int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
127 int32_t MatePosition; // position (0-based) where alignment's mate starts
128 int32_t InsertSize; // mate-pair insert size
129 std::string Filename; // name of BAM file which this alignment comes from
132 // internal utility methods
134 bool FindTag(const std::string& tag,
136 const unsigned int& tagDataLength,
137 unsigned int& numBytesParsed) const;
138 bool IsValidSize(const std::string& tag, const std::string& type) const;
139 void SetErrorString(const std::string& where, const std::string& what) const;
140 bool SkipToNextTag(const char storageType,
142 unsigned int& numBytesParsed) const;
147 struct BamAlignmentSupportData {
150 std::string AllCharData;
151 uint32_t BlockLength;
152 uint32_t NumCigarOperations;
153 uint32_t QueryNameLength;
154 uint32_t QuerySequenceLength;
158 BamAlignmentSupportData(void)
160 , NumCigarOperations(0)
162 , QuerySequenceLength(0)
166 BamAlignmentSupportData SupportData;
167 friend class Internal::BamReaderPrivate;
168 friend class Internal::BamWriterPrivate;
170 mutable std::string ErrorString; // mutable to allow updates even in logically const methods
174 // ---------------------------------------------------------
175 // BamAlignment tag access methods
177 /*! \fn bool AddTag(const std::string& tag, const std::string& type, const T& value)
178 \brief Adds a field to the BAM tags.
180 Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
182 \param[in] tag 2-character tag name
183 \param[in] type 1-character tag type
184 \param[in] value data to store
185 \return \c true if the \b new tag was added successfully
186 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
189 inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const T& value) {
191 // if char data not populated, do that first
192 if ( SupportData.HasCoreOnly )
195 // check tag/type size
196 if ( !IsValidSize(tag, type) ) {
197 // TODO: set error string?
201 // check that storage type code is OK for T
202 if ( !TagTypeHelper<T>::CanConvertTo(type.at(0)) ) {
203 // TODO: set error string?
207 // localize the tag data
208 char* pTagData = (char*)TagData.data();
209 const unsigned int tagDataLength = TagData.size();
210 unsigned int numBytesParsed = 0;
212 // if tag already exists, return false
213 // use EditTag explicitly instead
214 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
215 // TODO: set error string?
219 // otherwise, convert value to string
220 union { T value; char valueBuffer[sizeof(T)]; } un;
223 // copy original tag data to temp buffer
224 const std::string newTag = tag + type;
225 const size_t newTagDataLength = tagDataLength + newTag.size() + sizeof(T); // leave room for new T
226 RaiiBuffer originalTagData(newTagDataLength);
227 memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
230 strcat(originalTagData.Buffer + tagDataLength, newTag.data());
231 memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T));
233 // store temp buffer back in TagData
234 const char* newTagData = (const char*)originalTagData.Buffer;
235 TagData.assign(newTagData, newTagDataLength);
240 inline bool BamAlignment::AddTag<std::string>(const std::string& tag,
241 const std::string& type,
242 const std::string& value)
244 // if char data not populated, do that first
245 if ( SupportData.HasCoreOnly )
248 // check tag/type size
249 if ( !IsValidSize(tag, type) ) {
250 // TODO: set error string?
254 // check that storage type code is OK for string
255 if ( !TagTypeHelper<std::string>::CanConvertTo(type.at(0)) ) {
256 // TODO: set error string?
260 // localize the tag data
261 char* pTagData = (char*)TagData.data();
262 const unsigned int tagDataLength = TagData.size();
263 unsigned int numBytesParsed = 0;
265 // if tag already exists, return false
266 // use EditTag explicitly instead
267 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
268 // TODO: set error string?
272 // otherwise, copy tag data to temp buffer
273 const std::string newTag = tag + type + value;
274 const size_t newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
275 RaiiBuffer originalTagData(newTagDataLength);
276 memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
278 // append newTag (removes original null-term, then appends newTag + null-term)
279 strcat(originalTagData.Buffer + tagDataLength, newTag.data());
281 // store temp buffer back in TagData
282 const char* newTagData = (const char*)originalTagData.Buffer;
283 TagData.assign(newTagData, newTagDataLength);
287 /*! \fn template<typename T> bool AddTag(const std::string& tag, const std::vector<T>& values)
288 \brief Adds a numeric array field to the BAM tags.
290 Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
292 \param[in] tag 2-character tag name
293 \param[in] values vector of data values to store
294 \return \c true if the \b new tag was added successfully
295 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
298 inline bool BamAlignment::AddTag(const std::string& tag, const std::vector<T>& values) {
300 // if char data not populated, do that first
301 if ( SupportData.HasCoreOnly )
304 // check for valid tag name length
305 if ( tag.size() != Constants::BAM_TAG_TAGSIZE )
308 // localize the tag data
309 char* pTagData = (char*)TagData.data();
310 const unsigned int tagDataLength = TagData.size();
311 unsigned int numBytesParsed = 0;
313 // if tag already exists, return false
314 // use EditTag explicitly instead
315 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
316 // TODO: set error string?
320 // build new tag's base information
321 char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
322 memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
323 newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
324 newTagBase[3] = TagTypeHelper<T>::TypeCode();
326 // add number of array elements to newTagBase
327 const int32_t numElements = values.size();
328 memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
330 // copy current TagData string to temp buffer, leaving room for new tag's contents
331 const size_t newTagDataLength = tagDataLength +
332 Constants::BAM_TAG_ARRAYBASE_SIZE +
333 numElements*sizeof(T);
334 RaiiBuffer originalTagData(newTagDataLength);
335 memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
337 // write newTagBase (removes old null term)
338 strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase);
340 // add vector elements to tag
341 int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
342 for ( int i = 0 ; i < numElements; ++i ) {
343 const T& value = values.at(i);
344 memcpy(originalTagData.Buffer + elementsBeginOffset + i*sizeof(T), &value, sizeof(T));
347 // store temp buffer back in TagData
348 const char* newTagData = (const char*)originalTagData.Buffer;
349 TagData.assign(newTagData, newTagDataLength);
353 /*! \fn template<typename T> bool EditTag(const std::string& tag, const std::string& type, const T& value)
354 \brief Edits a BAM tag field.
356 If \a tag does not exist, a new entry is created.
358 \param tag[in] 2-character tag name
359 \param type[in] 1-character tag type (must be "Z" or "H")
360 \param value[in] new data value
362 \return \c true if the tag was modified/created successfully
364 \sa BamAlignment::RemoveTag()
365 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
368 inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const T& value) {
370 // if char data not populated, do that first
371 if ( SupportData.HasCoreOnly )
374 // remove existing tag if present, then append tag with new value
377 return AddTag(tag, type, value);
380 /*! \fn template<typename T> bool EditTag(const std::string& tag, const std::vector<T>& values)
381 \brief Edits a BAM tag field containing a numeric array.
383 If \a tag does not exist, a new entry is created.
385 \param tag[in] 2-character tag name
386 \param value[in] vector of data values
388 \return \c true if the tag was modified/created successfully
389 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
392 inline bool BamAlignment::EditTag(const std::string& tag, const std::vector<T>& values) {
394 // if char data not populated, do that first
395 if ( SupportData.HasCoreOnly )
398 // remove existing tag if present, then append tag with new values
401 return AddTag(tag, values);
405 /*! \fn template<typename T> bool GetTag(const std::string& tag, T& destination) const
406 \brief Retrieves the value associated with a BAM tag.
408 \param tag[in] 2-character tag name
409 \param destination[out] retrieved value
410 \return \c true if found
413 inline bool BamAlignment::GetTag(const std::string& tag, T& destination) const {
415 // skip if alignment is core-only
416 if ( SupportData.HasCoreOnly ) {
417 // TODO: set error string?
421 // skip if no tags present
422 if ( TagData.empty() ) {
423 // TODO: set error string?
427 // localize the tag data
428 char* pTagData = (char*)TagData.data();
429 const unsigned int tagDataLength = TagData.size();
430 unsigned int numBytesParsed = 0;
432 // return failure if tag not found
433 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
434 // TODO: set error string?
439 const char type = *(pTagData - 1);
440 if ( !TagTypeHelper<T>::CanConvertFrom(type) ) {
441 // TODO: set error string ?
445 // determine data length
446 int destinationLength = 0;
450 case (Constants::BAM_TAG_TYPE_ASCII) :
451 case (Constants::BAM_TAG_TYPE_INT8) :
452 case (Constants::BAM_TAG_TYPE_UINT8) :
453 destinationLength = 1;
457 case (Constants::BAM_TAG_TYPE_INT16) :
458 case (Constants::BAM_TAG_TYPE_UINT16) :
459 destinationLength = 2;
463 case (Constants::BAM_TAG_TYPE_INT32) :
464 case (Constants::BAM_TAG_TYPE_UINT32) :
465 case (Constants::BAM_TAG_TYPE_FLOAT) :
466 destinationLength = 4;
469 // var-length types not supported for numeric destination
470 case (Constants::BAM_TAG_TYPE_STRING) :
471 case (Constants::BAM_TAG_TYPE_HEX) :
472 case (Constants::BAM_TAG_TYPE_ARRAY) :
473 SetErrorString("BamAlignment::GetTag",
474 "cannot store variable length tag data into a numeric destination");
477 // unrecognized tag type
479 const std::string message = std::string("invalid tag type: ") + type;
480 SetErrorString("BamAlignment::GetTag", message);
484 // store data in destination
486 memcpy(&destination, pTagData, destinationLength);
493 inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
494 std::string& destination) const
496 // skip if alignment is core-only
497 if ( SupportData.HasCoreOnly ) {
498 // TODO: set error string?
502 // skip if no tags present
503 if ( TagData.empty() ) {
504 // TODO: set error string?
508 // localize the tag data
509 char* pTagData = (char*)TagData.data();
510 const unsigned int tagDataLength = TagData.size();
511 unsigned int numBytesParsed = 0;
513 // return failure if tag not found
514 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
515 // TODO: set error string?
519 // otherwise copy data into destination
520 const unsigned int dataLength = strlen(pTagData);
522 destination.resize(dataLength);
523 memcpy( (char*)destination.data(), pTagData, dataLength );
529 /*! \fn template<typename T> bool GetTag(const std::string& tag, std::vector<T>& destination) const
530 \brief Retrieves the numeric array associated with a BAM tag.
532 \param tag[in] 2-character tag name
533 \param destination[out] retrieved values
534 \return \c true if found
537 inline bool BamAlignment::GetTag(const std::string& tag, std::vector<T>& destination) const {
539 // skip if alignment is core-only
540 if ( SupportData.HasCoreOnly ) {
541 // TODO: set error string?
545 // skip if no tags present
546 if ( TagData.empty() ) {
547 // TODO: set error string?
551 // localize the tag data
552 char* pTagData = (char*)TagData.data();
553 const unsigned int tagDataLength = TagData.size();
554 unsigned int numBytesParsed = 0;
556 // return false if tag not found
557 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
558 // TODO: set error string?
562 // check that tag is array type
563 const char tagType = *(pTagData - 1);
564 if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) {
565 SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination");
569 // fetch element type
570 const char elementType = *pTagData;
571 if ( !TagTypeHelper<T>::CanConvertFrom(elementType) ) {
572 // TODO: set error string ?
577 // calculate length of each element in tag's array
578 int elementLength = 0;
579 switch ( elementType ) {
580 case (Constants::BAM_TAG_TYPE_ASCII) :
581 case (Constants::BAM_TAG_TYPE_INT8) :
582 case (Constants::BAM_TAG_TYPE_UINT8) :
583 elementLength = sizeof(uint8_t);
586 case (Constants::BAM_TAG_TYPE_INT16) :
587 case (Constants::BAM_TAG_TYPE_UINT16) :
588 elementLength = sizeof(uint16_t);
591 case (Constants::BAM_TAG_TYPE_INT32) :
592 case (Constants::BAM_TAG_TYPE_UINT32) :
593 case (Constants::BAM_TAG_TYPE_FLOAT) :
594 elementLength = sizeof(uint32_t);
597 // var-length types not supported for numeric destination
598 case (Constants::BAM_TAG_TYPE_STRING) :
599 case (Constants::BAM_TAG_TYPE_HEX) :
600 case (Constants::BAM_TAG_TYPE_ARRAY) :
601 SetErrorString("BamAlignment::GetTag",
602 "invalid array data, variable-length elements are not allowed");
607 const std::string message = std::string("invalid array element type: ") + elementType;
608 SetErrorString("BamAlignment::GetTag", message);
612 // get number of elements
614 memcpy(&numElements, pTagData, sizeof(int32_t));
617 destination.reserve(numElements);
621 for ( int i = 0 ; i < numElements; ++i ) {
622 memcpy(&value, pTagData, sizeof(T));
623 pTagData += sizeof(T);
624 destination.push_back(value);
631 typedef std::vector<BamAlignment> BamAlignmentVector;
633 } // namespace BamTools
635 #endif // BAMALIGNMENT_H