1 // ***************************************************************************
2 // BamAlignment.h (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 16 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
10 #ifndef BAMALIGNMENT_H
11 #define BAMALIGNMENT_H
13 #include "api/api_global.h"
14 #include "api/BamAux.h"
15 #include "api/BamConstants.h"
24 // forward declaration of BamAlignment's "friends"
26 class BamReaderPrivate;
27 class BamWriterPrivate;
28 } // namespace Internal
31 // BamAlignment data structure
32 struct API_EXPORT BamAlignment {
34 // constructors & destructor
37 BamAlignment(const BamAlignment& other);
40 // queries against alignment flags
42 bool IsDuplicate(void) const; // returns true if this read is a PCR duplicate
43 bool IsFailedQC(void) const; // returns true if this read failed quality control
44 bool IsFirstMate(void) const; // returns true if alignment is first mate on read
45 bool IsMapped(void) const; // returns true if alignment is mapped
46 bool IsMateMapped(void) const; // returns true if alignment's mate is mapped
47 bool IsMateReverseStrand(void) const; // returns true if alignment's mate mapped to reverse strand
48 bool IsPaired(void) const; // returns true if alignment part of paired-end read
49 bool IsPrimaryAlignment(void) const; // returns true if reported position is primary alignment
50 bool IsProperPair(void) const; // returns true if alignment is part of read that satisfied paired-end resolution
51 bool IsReverseStrand(void) const; // returns true if alignment mapped to reverse strand
52 bool IsSecondMate(void) const; // returns true if alignment is second mate on read
54 // manipulate alignment flags
56 void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag
57 void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag
58 void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag
59 void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag
60 void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag
61 void SetIsMateReverseStrand(bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
62 void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag
63 void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag
64 void SetIsProperPair(bool ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag
65 void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag
66 void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag
68 // tag data access methods
72 template<typename T> bool AddTag(const std::string& tag, const std::string& type, const T& value);
73 template<typename T> bool AddTag(const std::string& tag, const std::vector<T>& values);
75 // edit (or append) tag
76 template<typename T> bool EditTag(const std::string& tag, const std::string& type, const T& value);
77 template<typename T> bool EditTag(const std::string& tag, const std::vector<T>& values);
80 template<typename T> bool GetTag(const std::string& tag, T& destination) const;
81 template<typename T> bool GetTag(const std::string& tag, std::vector<T>& destination) const;
83 // retrieves the SAM/BAM type-code for requested tag name
84 bool GetTagType(const std::string& tag, char& type) const;
86 // returns true if alignment has a record for this tag name
87 bool HasTag(const std::string& tag) const;
90 void RemoveTag(const std::string& tag);
94 // populates alignment string fields
95 bool BuildCharData(void);
97 // calculates alignment end position
98 int GetEndPosition(bool usePadded = false, bool closedInterval = false) const;
100 // returns a description of the last error that occurred
101 std::string GetErrorString(void) const;
103 // retrieves the size, read locations and reference locations of soft-clip operations
104 bool GetSoftClips(std::vector<int>& clipSizes,
105 std::vector<int>& readPositions,
106 std::vector<int>& genomePositions,
107 bool usePadded = false) const;
109 // public data fields
111 std::string Name; // read name
112 int32_t Length; // length of query sequence
113 std::string QueryBases; // 'original' sequence (as reported from sequencing machine)
114 std::string AlignedBases; // 'aligned' sequence (includes any indels, padding, clipping)
115 std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
116 std::string TagData; // tag data (use provided methods to query/modify)
117 int32_t RefID; // ID number for reference sequence
118 int32_t Position; // position (0-based) where alignment starts
119 uint16_t Bin; // BAM (standard) index bin number for this alignment
120 uint16_t MapQuality; // mapping quality score
121 uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
122 std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
123 int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
124 int32_t MatePosition; // position (0-based) where alignment's mate starts
125 int32_t InsertSize; // mate-pair insert size
126 std::string Filename; // name of BAM file which this alignment comes from
129 // internal utility methods
131 bool FindTag(const std::string& tag,
133 const unsigned int& tagDataLength,
134 unsigned int& numBytesParsed) const;
135 bool IsValidSize(const std::string& tag, const std::string& type) const;
136 void SetErrorString(const std::string& where, const std::string& what) const;
137 bool SkipToNextTag(const char storageType,
139 unsigned int& numBytesParsed) const;
144 struct BamAlignmentSupportData {
147 std::string AllCharData;
148 uint32_t BlockLength;
149 uint32_t NumCigarOperations;
150 uint32_t QueryNameLength;
151 uint32_t QuerySequenceLength;
155 BamAlignmentSupportData(void)
157 , NumCigarOperations(0)
159 , QuerySequenceLength(0)
163 BamAlignmentSupportData SupportData;
164 friend class Internal::BamReaderPrivate;
165 friend class Internal::BamWriterPrivate;
167 mutable std::string ErrorString; // mutable to allow updates even in logically const methods
171 // ---------------------------------------------------------
172 // BamAlignment tag access methods
174 /*! \fn bool AddTag(const std::string& tag, const std::string& type, const T& value)
175 \brief Adds a field to the BAM tags.
177 Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
179 \param[in] tag 2-character tag name
180 \param[in] type 1-character tag type
181 \param[in] value data to store
182 \return \c true if the \b new tag was added successfully
183 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
186 inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const T& value) {
188 // if char data not populated, do that first
189 if ( SupportData.HasCoreOnly )
192 // check tag/type size
193 if ( !IsValidSize(tag, type) ) {
194 // TODO: set error string?
198 // check that storage type code is OK for T
199 if ( !TagTypeHelper<T>::CanConvertTo(type.at(0)) ) {
200 // TODO: set error string?
204 // localize the tag data
205 char* pTagData = (char*)TagData.data();
206 const unsigned int tagDataLength = TagData.size();
207 unsigned int numBytesParsed = 0;
209 // if tag already exists, return false
210 // use EditTag explicitly instead
211 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
212 // TODO: set error string?
216 // otherwise, convert value to string
217 union { T value; char valueBuffer[sizeof(T)]; } un;
220 // copy original tag data to temp buffer
221 const std::string newTag = tag + type;
222 const size_t newTagDataLength = tagDataLength + newTag.size() + sizeof(T); // leave room for new T
223 RaiiBuffer originalTagData(newTagDataLength);
224 memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
227 strcat(originalTagData.Buffer + tagDataLength, newTag.data());
228 memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T));
230 // store temp buffer back in TagData
231 const char* newTagData = (const char*)originalTagData.Buffer;
232 TagData.assign(newTagData, newTagDataLength);
237 inline bool BamAlignment::AddTag<std::string>(const std::string& tag,
238 const std::string& type,
239 const std::string& value)
241 // if char data not populated, do that first
242 if ( SupportData.HasCoreOnly )
245 // check tag/type size
246 if ( !IsValidSize(tag, type) ) {
247 // TODO: set error string?
251 // check that storage type code is OK for string
252 if ( !TagTypeHelper<std::string>::CanConvertTo(type.at(0)) ) {
253 // TODO: set error string?
257 // localize the tag data
258 char* pTagData = (char*)TagData.data();
259 const unsigned int tagDataLength = TagData.size();
260 unsigned int numBytesParsed = 0;
262 // if tag already exists, return false
263 // use EditTag explicitly instead
264 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
265 // TODO: set error string?
269 // otherwise, copy tag data to temp buffer
270 const std::string newTag = tag + type + value;
271 const size_t newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
272 RaiiBuffer originalTagData(newTagDataLength);
273 memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
275 // append newTag (removes original null-term, then appends newTag + null-term)
276 strcat(originalTagData.Buffer + tagDataLength, newTag.data());
278 // store temp buffer back in TagData
279 const char* newTagData = (const char*)originalTagData.Buffer;
280 TagData.assign(newTagData, newTagDataLength);
284 /*! \fn template<typename T> bool AddTag(const std::string& tag, const std::vector<T>& values)
285 \brief Adds a numeric array field to the BAM tags.
287 Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
289 \param[in] tag 2-character tag name
290 \param[in] values vector of data values to store
291 \return \c true if the \b new tag was added successfully
292 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
295 inline bool BamAlignment::AddTag(const std::string& tag, const std::vector<T>& values) {
297 // if char data not populated, do that first
298 if ( SupportData.HasCoreOnly )
301 // check for valid tag name length
302 if ( tag.size() != Constants::BAM_TAG_TAGSIZE )
305 // localize the tag data
306 char* pTagData = (char*)TagData.data();
307 const unsigned int tagDataLength = TagData.size();
308 unsigned int numBytesParsed = 0;
310 // if tag already exists, return false
311 // use EditTag explicitly instead
312 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
313 // TODO: set error string?
317 // build new tag's base information
318 char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
319 memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
320 newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
321 newTagBase[3] = TagTypeHelper<T>::TypeCode();
323 // add number of array elements to newTagBase
324 const int32_t numElements = values.size();
325 memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
327 // copy current TagData string to temp buffer, leaving room for new tag's contents
328 const size_t newTagDataLength = tagDataLength +
329 Constants::BAM_TAG_ARRAYBASE_SIZE +
330 numElements*sizeof(T);
331 RaiiBuffer originalTagData(newTagDataLength);
332 memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
334 // write newTagBase (removes old null term)
335 strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase);
337 // add vector elements to tag
338 int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
339 for ( int i = 0 ; i < numElements; ++i ) {
340 const T& value = values.at(i);
341 memcpy(originalTagData.Buffer + elementsBeginOffset + i*sizeof(T), &value, sizeof(T));
344 // store temp buffer back in TagData
345 const char* newTagData = (const char*)originalTagData.Buffer;
346 TagData.assign(newTagData, newTagDataLength);
350 /*! \fn template<typename T> bool EditTag(const std::string& tag, const std::string& type, const T& value)
351 \brief Edits a BAM tag field.
353 If \a tag does not exist, a new entry is created.
355 \param tag[in] 2-character tag name
356 \param type[in] 1-character tag type (must be "Z" or "H")
357 \param value[in] new data value
359 \return \c true if the tag was modified/created successfully
361 \sa BamAlignment::RemoveTag()
362 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
365 inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const T& value) {
367 // if char data not populated, do that first
368 if ( SupportData.HasCoreOnly )
371 // remove existing tag if present, then append tag with new value
374 return AddTag(tag, type, value);
377 /*! \fn template<typename T> bool EditTag(const std::string& tag, const std::vector<T>& values)
378 \brief Edits a BAM tag field containing a numeric array.
380 If \a tag does not exist, a new entry is created.
382 \param tag[in] 2-character tag name
383 \param value[in] vector of data values
385 \return \c true if the tag was modified/created successfully
386 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
389 inline bool BamAlignment::EditTag(const std::string& tag, const std::vector<T>& values) {
391 // if char data not populated, do that first
392 if ( SupportData.HasCoreOnly )
395 // remove existing tag if present, then append tag with new values
398 return AddTag(tag, values);
402 /*! \fn template<typename T> bool GetTag(const std::string& tag, T& destination) const
403 \brief Retrieves the value associated with a BAM tag.
405 \param tag[in] 2-character tag name
406 \param destination[out] retrieved value
407 \return \c true if found
410 inline bool BamAlignment::GetTag(const std::string& tag, T& destination) const {
412 // skip if alignment is core-only
413 if ( SupportData.HasCoreOnly ) {
414 // TODO: set error string?
418 // skip if no tags present
419 if ( TagData.empty() ) {
420 // TODO: set error string?
424 // localize the tag data
425 char* pTagData = (char*)TagData.data();
426 const unsigned int tagDataLength = TagData.size();
427 unsigned int numBytesParsed = 0;
429 // return failure if tag not found
430 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
431 // TODO: set error string?
436 const char type = *(pTagData - 1);
437 if ( !TagTypeHelper<T>::CanConvertFrom(type) ) {
438 // TODO: set error string ?
442 // determine data length
443 int destinationLength = 0;
447 case (Constants::BAM_TAG_TYPE_ASCII) :
448 case (Constants::BAM_TAG_TYPE_INT8) :
449 case (Constants::BAM_TAG_TYPE_UINT8) :
450 destinationLength = 1;
454 case (Constants::BAM_TAG_TYPE_INT16) :
455 case (Constants::BAM_TAG_TYPE_UINT16) :
456 destinationLength = 2;
460 case (Constants::BAM_TAG_TYPE_INT32) :
461 case (Constants::BAM_TAG_TYPE_UINT32) :
462 case (Constants::BAM_TAG_TYPE_FLOAT) :
463 destinationLength = 4;
466 // var-length types not supported for numeric destination
467 case (Constants::BAM_TAG_TYPE_STRING) :
468 case (Constants::BAM_TAG_TYPE_HEX) :
469 case (Constants::BAM_TAG_TYPE_ARRAY) :
470 SetErrorString("BamAlignment::GetTag",
471 "cannot store variable length tag data into a numeric destination");
474 // unrecognized tag type
476 const std::string message = std::string("invalid tag type: ") + type;
477 SetErrorString("BamAlignment::GetTag", message);
481 // store data in destination
483 memcpy(&destination, pTagData, destinationLength);
490 inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
491 std::string& destination) const
493 // skip if alignment is core-only
494 if ( SupportData.HasCoreOnly ) {
495 // TODO: set error string?
499 // skip if no tags present
500 if ( TagData.empty() ) {
501 // TODO: set error string?
505 // localize the tag data
506 char* pTagData = (char*)TagData.data();
507 const unsigned int tagDataLength = TagData.size();
508 unsigned int numBytesParsed = 0;
510 // return failure if tag not found
511 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
512 // TODO: set error string?
516 // otherwise copy data into destination
517 const unsigned int dataLength = strlen(pTagData);
519 destination.resize(dataLength);
520 memcpy( (char*)destination.data(), pTagData, dataLength );
526 /*! \fn template<typename T> bool GetTag(const std::string& tag, std::vector<T>& destination) const
527 \brief Retrieves the numeric array associated with a BAM tag.
529 \param tag[in] 2-character tag name
530 \param destination[out] retrieved values
531 \return \c true if found
534 inline bool BamAlignment::GetTag(const std::string& tag, std::vector<T>& destination) const {
536 // skip if alignment is core-only
537 if ( SupportData.HasCoreOnly ) {
538 // TODO: set error string?
542 // skip if no tags present
543 if ( TagData.empty() ) {
544 // TODO: set error string?
548 // localize the tag data
549 char* pTagData = (char*)TagData.data();
550 const unsigned int tagDataLength = TagData.size();
551 unsigned int numBytesParsed = 0;
553 // return false if tag not found
554 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
555 // TODO: set error string?
559 // check that tag is array type
560 const char tagType = *(pTagData - 1);
561 if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) {
562 SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination");
566 // fetch element type
567 const char elementType = *pTagData;
568 if ( !TagTypeHelper<T>::CanConvertFrom(elementType) ) {
569 // TODO: set error string ?
574 // calculate length of each element in tag's array
575 int elementLength = 0;
576 switch ( elementType ) {
577 case (Constants::BAM_TAG_TYPE_ASCII) :
578 case (Constants::BAM_TAG_TYPE_INT8) :
579 case (Constants::BAM_TAG_TYPE_UINT8) :
580 elementLength = sizeof(uint8_t);
583 case (Constants::BAM_TAG_TYPE_INT16) :
584 case (Constants::BAM_TAG_TYPE_UINT16) :
585 elementLength = sizeof(uint16_t);
588 case (Constants::BAM_TAG_TYPE_INT32) :
589 case (Constants::BAM_TAG_TYPE_UINT32) :
590 case (Constants::BAM_TAG_TYPE_FLOAT) :
591 elementLength = sizeof(uint32_t);
594 // var-length types not supported for numeric destination
595 case (Constants::BAM_TAG_TYPE_STRING) :
596 case (Constants::BAM_TAG_TYPE_HEX) :
597 case (Constants::BAM_TAG_TYPE_ARRAY) :
598 SetErrorString("BamAlignment::GetTag",
599 "invalid array data, variable-length elements are not allowed");
604 const std::string message = std::string("invalid array element type: ") + elementType;
605 SetErrorString("BamAlignment::GetTag", message);
609 // get number of elements
611 memcpy(&numElements, pTagData, sizeof(int32_t));
614 destination.reserve(numElements);
618 for ( int i = 0 ; i < numElements; ++i ) {
619 memcpy(&value, pTagData, sizeof(T));
620 pTagData += sizeof(T);
621 destination.push_back(value);
628 typedef std::vector<BamAlignment> BamAlignmentVector;
630 } // namespace BamTools
632 #endif // BAMALIGNMENT_H