1 // ***************************************************************************
2 // BamAlignment.h (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 4 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
10 #ifndef BAMALIGNMENT_H
11 #define BAMALIGNMENT_H
13 #include <api/api_global.h>
14 #include <api/BamAux.h>
15 #include <api/BamConstants.h>
26 // forward declaration of BamAlignment's friend classes
// These two names are the ones granted friendship inside BamAlignment
// (as Internal::BamReaderPrivate / Internal::BamWriterPrivate).
// NOTE(review): the line opening this namespace is not visible in this listing.
28 class BamReaderPrivate;
29 class BamWriterPrivate;
30 } // namespace Internal
32 // BamAlignment data structure
// Holds one alignment record: the public data fields declared below, plus
// member functions for querying/setting the bits of AlignmentFlag and for
// adding, editing, reading, and removing entries stored in TagData.
// NOTE(review): the embedded line numbers in this listing skip values, so some
// declarations (and the struct's closing brace) are not visible here - confirm
// against the complete header before treating this as the full interface.
33 struct API_EXPORT BamAlignment {
35 // constructors & destructor
38 BamAlignment(const BamAlignment& other);
41 // queries against alignment flags
43 bool IsDuplicate(void) const; // returns true if this read is a PCR duplicate
44 bool IsFailedQC(void) const; // returns true if this read failed quality control
45 bool IsFirstMate(void) const; // returns true if alignment is first mate on read
46 bool IsMapped(void) const; // returns true if alignment is mapped
47 bool IsMateMapped(void) const; // returns true if alignment's mate is mapped
48 bool IsMateReverseStrand(void) const; // returns true if alignment's mate mapped to reverse strand
49 bool IsPaired(void) const; // returns true if alignment part of paired-end read
50 bool IsPrimaryAlignment(void) const; // returns true if reported position is primary alignment
51 bool IsProperPair(void) const; // returns true if alignment is part of read that satisfied paired-end resolution
52 bool IsReverseStrand(void) const; // returns true if alignment mapped to reverse strand
53 bool IsSecondMate(void) const; // returns true if alignment is second mate on read
55 // manipulate alignment flags
57 void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag
58 void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag
59 void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag
60 void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag
61 void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag
62 void SetIsMateReverseStrand(bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
63 void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag
64 void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag
65 void SetIsProperPair(bool ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag
66 void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag
67 void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag
69 // legacy methods (consider deprecated, but still available)
70 void SetIsMateUnmapped(bool ok); // complement of using SetIsMateMapped()
71 void SetIsSecondaryAlignment(bool ok); // complement of using SetIsPrimaryAlignment()
72 void SetIsUnmapped(bool ok); // complement of using SetIsMapped()
74 // tag data access methods
// add a new tag: one overload takes a tag name plus a storage type code, the
// other takes a tag name plus a vector of values; the definitions further down
// return false when the tag already exists (EditTag is the replace-or-add form)
// NOTE(review): the final parameter line of the (tag, type, ...) overloads is
// not visible in this listing.
78 template<typename T> bool AddTag(const std::string& tag,
79 const std::string& type,
81 template<typename T> bool AddTag(const std::string& tag,
82 const std::vector<T>& values);
84 // edit (or append) tag
85 template<typename T> bool EditTag(const std::string& tag,
86 const std::string& type,
88 template<typename T> bool EditTag(const std::string& tag,
89 const std::vector<T>& values);
// read a tag's value into 'destination' (single-value and vector forms);
// the bool result reports whether the read succeeded
92 template<typename T> bool GetTag(const std::string& tag,
93 T& destination) const;
94 template<typename T> bool GetTag(const std::string& tag,
95 std::vector<T>& destination) const;
97 // retrieves the BAM type-code for requested tag
98 // (returns whether or not tag exists, and type-code is valid)
99 bool GetTagType(const std::string& tag, char& type) const;
101 // legacy methods (consider deprecated, but still available)
102 bool GetEditDistance(uint32_t& editDistance) const; // retrieves value of "NM" tag
103 bool GetReadGroup(std::string& readGroup) const; // retrieves value of "RG" tag
105 // returns true if alignment has a record for this tag name
106 bool HasTag(const std::string& tag) const;
// presumably removes the record for this tag name (implementation is not part
// of this header listing) - TODO confirm what the bool result signals
109 bool RemoveTag(const std::string& tag);
111 // additional methods
113 // populates alignment string fields
114 bool BuildCharData(void);
116 // calculates alignment end position
// usePadded defaults to false and zeroBased defaults to true
117 int GetEndPosition(bool usePadded = false, bool zeroBased = true) const;
119 // public data fields
121 std::string Name; // read name
122 int32_t Length; // length of query sequence
123 std::string QueryBases; // 'original' sequence (as reported from sequencing machine)
124 std::string AlignedBases; // 'aligned' sequence (includes any indels, padding, clipping)
125 std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
126 std::string TagData; // tag data (use provided methods to query/modify)
127 int32_t RefID; // ID number for reference sequence
128 int32_t Position; // position (0-based) where alignment starts
129 uint16_t Bin; // BAM (standard) index bin number for this alignment
130 uint16_t MapQuality; // mapping quality score
131 uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
132 std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
133 int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
134 int32_t MatePosition; // position (0-based) where alignment's mate starts
135 int32_t InsertSize; // mate-pair insert size
136 std::string Filename; // name of BAM file which this alignment comes from
139 // internal utility methods
// Static helpers used by the tag accessors defined later in this header.
// FindTag is used there as: returns true when 'tag' is present, after which the
// caller reads the tag's type code at *(pTagData - 1) and its value at pTagData.
// NOTE(review): one parameter line of FindTag and of SkipToNextTag is not
// visible in this listing.
141 static bool FindTag(const std::string& tag,
143 const unsigned int& tagDataLength,
144 unsigned int& numBytesParsed);
145 static bool IsValidSize(const std::string& tag,
146 const std::string& type);
147 static bool SkipToNextTag(const char storageType,
149 unsigned int& numBytesParsed);
// Bookkeeping bundled with each alignment. The tag accessors below check its
// HasCoreOnly member (declared on a line not visible in this listing) before
// touching character data.
154 struct BamAlignmentSupportData {
157 std::string AllCharData;
158 uint32_t BlockLength;
159 uint32_t NumCigarOperations;
160 uint32_t QueryNameLength;
161 uint32_t QuerySequenceLength;
// constructor: the visible initializers set the counters to 0
165 BamAlignmentSupportData(void)
167 , NumCigarOperations(0)
169 , QuerySequenceLength(0)
173 BamAlignmentSupportData SupportData;
// the reader/writer implementation classes are granted friend access
174 friend class Internal::BamReaderPrivate;
175 friend class Internal::BamWriterPrivate;
179 // ---------------------------------------------------------
180 // BamAlignment tag access methods
// Adds a new single-value tag to TagData.
//   tag  - tag name (its size is validated together with 'type' by IsValidSize)
//   type - storage type code; its first character must be convertible from T
//          according to TagTypeHelper<T>::CanConvertTo
// Returns false if IsValidSize(tag, type) fails. Per the comments below it is
// also meant to fail when the tag already exists (use EditTag in that case).
// NOTE(review): this listing skips some original lines (template header, the
// value parameter, braces and several statements), so parts of the body are not
// visible here.
183 inline bool BamAlignment::AddTag(const std::string& tag,
184 const std::string& type,
187 // if char data not populated, do that first
188 if ( SupportData.HasCoreOnly )
191 // validate tag/type size & that storage type code is OK for T
192 if ( !IsValidSize(tag, type) ) return false;
193 if ( !TagTypeHelper<T>::CanConvertTo(type.at(0)) )
196 // localize the tag data
197 char* pTagData = (char*)TagData.data();
198 const unsigned int tagDataLength = TagData.size();
199 unsigned int numBytesParsed = 0;
201 // if tag already exists, return false
202 // use EditTag explicitly instead
203 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
206 // otherwise, convert value to string
// (the union gives byte-wise access to a T through valueBuffer, which is what
// gets memcpy'd into the new tag data below)
207 union { T value; char valueBuffer[sizeof(T)]; } un;
210 // copy original tag data to temp buffer
211 const std::string newTag = tag + type;
212 const int newTagDataLength = tagDataLength + newTag.size() + sizeof(T); // leave room for new T
213 char* originalTagData = new char[newTagDataLength];
214 memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
// append tag name + type code starting at the old terminator, then the raw
// bytes of the value immediately after them
217 strcat(originalTagData + tagDataLength, newTag.data());
218 memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T));
220 // store temp buffer back in TagData
221 const char* newTagData = (const char*)originalTagData;
222 TagData.assign(newTagData, newTagDataLength);
224 // clean up & return success
225 delete[] originalTagData;
// std::string form of AddTag: appends tag name + type code + the string value
// (followed by a terminating null) to TagData.
//   tag   - tag name (size validated together with 'type' by IsValidSize)
//   type  - storage type code; its first character must be accepted by
//           TagTypeHelper<std::string>::CanConvertTo
//   value - string payload to store
// Returns false if IsValidSize(tag, type) fails. Per the comments below it is
// also meant to fail when the tag already exists (use EditTag in that case).
// NOTE(review): this listing skips some original lines (specialization header,
// braces and several statements), so parts of the body are not visible here.
230 inline bool BamAlignment::AddTag<std::string>(const std::string& tag,
231 const std::string& type,
232 const std::string& value)
234 // if char data not populated, do that first
235 if ( SupportData.HasCoreOnly )
238 // validate tag/type size & that storage type code is OK for string
239 if ( !IsValidSize(tag, type) ) return false;
240 if ( !TagTypeHelper<std::string>::CanConvertTo(type.at(0)) )
243 // localize the tag data
244 char* pTagData = (char*)TagData.data();
245 const unsigned int tagDataLength = TagData.size();
246 unsigned int numBytesParsed = 0;
248 // if tag already exists, return false
249 // use EditTag explicitly instead
250 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
253 // otherwise, copy tag data to temp buffer
254 const std::string newTag = tag + type + value;
255 const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
256 char* originalTagData = new char[newTagDataLength];
257 memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
260 strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term
262 // store temp buffer back in TagData
// (newTagDataLength includes the trailing null, so it becomes part of TagData)
263 const char* newTagData = (const char*)originalTagData;
264 TagData.assign(newTagData, newTagDataLength);
266 // clean up & return success
267 delete[] originalTagData;
// Adds a new array tag to TagData.
//   tag    - tag name; must be exactly Constants::BAM_TAG_TAGSIZE characters
//   values - elements to store; the element type code comes from
//            TagTypeHelper<T>::TypeCode()
// The appended record is an array header of Constants::BAM_TAG_ARRAYBASE_SIZE
// bytes (tag name, array type code, element type code, int32 element count)
// followed by the raw bytes of each element.
// Per the comments below it is meant to fail when the tag already exists
// (use EditTag in that case).
// NOTE(review): this listing skips some original lines (template header, braces
// and the statements following several 'if' lines), so parts of the body are
// not visible here.
272 inline bool BamAlignment::AddTag(const std::string& tag,
273 const std::vector<T>& values)
275 // if char data not populated, do that first
276 if ( SupportData.HasCoreOnly )
279 // check for valid tag name length
280 if ( tag.size() != Constants::BAM_TAG_TAGSIZE )
283 // localize the tag data
284 char* pTagData = (char*)TagData.data();
285 const unsigned int tagDataLength = TagData.size();
286 unsigned int numBytesParsed = 0;
288 // if tag already exists, return false
289 // use EditTag explicitly instead
290 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
293 // build new tag's base information
294 char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
295 memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
296 newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
297 newTagBase[3] = TagTypeHelper<T>::TypeCode();
299 // add number of array elements to newTagBase
300 const int32_t numElements = values.size();
301 memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
303 // copy current TagData string to temp buffer, leaving room for new tag's contents
304 const int newTagDataLength = tagDataLength +
305 Constants::BAM_TAG_ARRAYBASE_SIZE +
306 numElements*sizeof(T);
307 char* originalTagData = new char[newTagDataLength];
308 memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
310 // write newTagBase (overwrites old null term)
// newTagBase is binary data, not a C string: the element count usually contains
// zero bytes and nothing guarantees a terminator, so it is copied with a
// fixed-size memcpy. A string copy would stop at the first zero byte, leaving
// the rest of the header unwritten, or read past the array if none is found.
311 memcpy(originalTagData + tagDataLength, newTagBase, Constants::BAM_TAG_ARRAYBASE_SIZE);
313 // add vector elements to tag
314 int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
315 for ( int i = 0 ; i < numElements; ++i ) {
316 const T& value = values.at(i);
317 memcpy(originalTagData + elementsBeginOffset + i*sizeof(T), &value, sizeof(T));
320 // store temp buffer back in TagData
321 const char* newTagData = (const char*)originalTagData;
322 TagData.assign(newTagData, newTagDataLength);
324 // cleanup & return success
325 delete[] originalTagData;
// Replace-or-add form for a single-value tag: per the comment below, any
// existing record for 'tag' is removed first, then the new value is appended
// through AddTag, whose result is returned.
// NOTE(review): the template header, the value parameter, and the statements
// that perform the removal are on lines not visible in this listing.
330 inline bool BamAlignment::EditTag(const std::string& tag,
331 const std::string& type,
334 // if char data not populated, do that first
335 if ( SupportData.HasCoreOnly )
338 // remove existing tag if present, then append tag with new value
341 return AddTag(tag, type, value);
// Replace-or-add form for an array tag: per the comment below, any existing
// record for 'tag' is removed first, then the new values are appended through
// the vector overload of AddTag, whose result is returned.
// NOTE(review): the template header and the statements that perform the
// removal are on lines not visible in this listing.
345 inline bool BamAlignment::EditTag(const std::string& tag,
346 const std::vector<T>& values)
348 // if char data not populated, do that first
349 if ( SupportData.HasCoreOnly )
352 // remove existing tag if present, then append tag with new values
355 return AddTag(tag, values);
// Reads a fixed-width tag value into 'destination'.
//   tag         - name of the tag to look up in TagData
//   destination - receives the stored bytes (1, 2 or 4 of them, depending on
//                 the tag's stored type code)
// Fails when only core data is loaded, when TagData is empty, when the tag is
// not found, or when the stored type cannot be converted to T according to
// TagTypeHelper<T>::CanConvertFrom. String, hex and array tags are reported
// on std::cerr as unsupported here.
// NOTE(review): this listing skips some original lines (template header,
// braces, the switch header, break/return statements), so parts of the body
// are not visible here.
359 inline bool BamAlignment::GetTag(const std::string& tag,
360 T& destination) const
362 // skip if core-only or no tags present
363 if ( SupportData.HasCoreOnly || TagData.empty() )
366 // localize the tag data
367 char* pTagData = (char*)TagData.data();
368 const unsigned int tagDataLength = TagData.size();
369 unsigned int numBytesParsed = 0;
371 // return failure if tag not found
// (the negation matters: everything below reads relative to pTagData and is
// only meaningful once FindTag has located the tag)
372 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
375 // otherwise try to copy data into destination
// the stored type code sits in the byte just before the tag's value
376 const char type = *(pTagData - 1);
377 if ( !TagTypeHelper<T>::CanConvertFrom(type) )
379 int destinationLength = 0;
// 1-byte types
383 case (Constants::BAM_TAG_TYPE_ASCII) :
384 case (Constants::BAM_TAG_TYPE_INT8) :
385 case (Constants::BAM_TAG_TYPE_UINT8) :
386 destinationLength = 1;
// 2-byte types
390 case (Constants::BAM_TAG_TYPE_INT16) :
391 case (Constants::BAM_TAG_TYPE_UINT16) :
392 destinationLength = 2;
// 4-byte types
396 case (Constants::BAM_TAG_TYPE_INT32) :
397 case (Constants::BAM_TAG_TYPE_UINT32) :
398 case (Constants::BAM_TAG_TYPE_FLOAT) :
399 destinationLength = 4;
402 // var-length types not supported for numeric destination
403 case (Constants::BAM_TAG_TYPE_STRING) :
404 case (Constants::BAM_TAG_TYPE_HEX) :
405 case (Constants::BAM_TAG_TYPE_ARRAY) :
406 std::cerr << "BamAlignment ERROR: cannot store tag of type " << type
407 << " in integer destination" << std::endl;
410 // unrecognized tag type
412 std::cerr << "BamAlignment ERROR: unknown tag type encountered: "
413 << type << std::endl;
417 // store in destination
// only destinationLength bytes are copied, which may be fewer than sizeof(T)
419 memcpy(&destination, pTagData, destinationLength);
// std::string form of GetTag: copies the tag's null-terminated value into
// 'destination'.
//   tag         - name of the tag to look up in TagData
//   destination - resized to the value's length and filled with its characters
// Fails when only core data is loaded, when TagData is empty, or when the tag
// is not found.
// NOTE(review): this listing skips some original lines (specialization header,
// braces, return statements), so parts of the body are not visible here.
426 inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
427 std::string& destination) const
429 // skip if core-only or no tags present
430 if ( SupportData.HasCoreOnly || TagData.empty() )
433 // localize the tag data
434 char* pTagData = (char*)TagData.data();
435 const unsigned int tagDataLength = TagData.size();
436 unsigned int numBytesParsed = 0;
438 // return failure if tag not found
// (the negation matters: strlen/memcpy below must only run on a located tag)
439 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
442 // otherwise copy data into destination
// length is measured up to the value's terminating null
443 const unsigned int dataLength = strlen(pTagData);
445 destination.resize(dataLength);
446 memcpy( (char*)destination.data(), pTagData, dataLength );
452 // retrieves "binary-array" tag data
// Reads an array tag into 'destination'.
//   tag         - name of the tag to look up in TagData
//   destination - receives one T per stored element (via push_back)
// Fails when only core data is loaded, when TagData is empty, or when the tag
// is not found; reports on std::cerr when the tag is not an array or when the
// element type is variable-length or unknown. The element type must also be
// accepted by TagTypeHelper<T>::CanConvertFrom.
// NOTE(review): this listing skips some original lines (template header,
// braces, break/return statements, the declarations of numElements and value,
// and presumably the statements that advance pTagData past the element type
// code and the count), so parts of the body are not visible here.
454 inline bool BamAlignment::GetTag(const std::string& tag,
455 std::vector<T>& destination) const
457 // skip if core-only or no tags present
458 if ( SupportData.HasCoreOnly || TagData.empty() )
461 // localize the tag data
462 char* pTagData = (char*)TagData.data();
463 const unsigned int tagDataLength = TagData.size();
464 unsigned int numBytesParsed = 0;
466 // return false if tag not found
467 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
470 // check that tag is array type
// the tag's type code sits in the byte just before pTagData
471 const char tagType = *(pTagData - 1);
472 if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) {
473 std::cerr << "BamAlignment ERROR: Cannot store non-array data from tag: "
474 << tag << " in array destination" << std::endl;
478 // calculate length of each element in tag's array
// for array tags the first byte at pTagData is the element type code
479 const char elementType = *pTagData;
480 if ( !TagTypeHelper<T>::CanConvertFrom(elementType) )
483 int elementLength = 0;
484 switch ( elementType ) {
485 case (Constants::BAM_TAG_TYPE_ASCII) :
486 case (Constants::BAM_TAG_TYPE_INT8) :
487 case (Constants::BAM_TAG_TYPE_UINT8) :
488 elementLength = sizeof(uint8_t);
491 case (Constants::BAM_TAG_TYPE_INT16) :
492 case (Constants::BAM_TAG_TYPE_UINT16) :
493 elementLength = sizeof(uint16_t);
496 case (Constants::BAM_TAG_TYPE_INT32) :
497 case (Constants::BAM_TAG_TYPE_UINT32) :
498 case (Constants::BAM_TAG_TYPE_FLOAT) :
499 elementLength = sizeof(uint32_t);
502 // var-length types not supported for numeric destination
503 case (Constants::BAM_TAG_TYPE_STRING) :
504 case (Constants::BAM_TAG_TYPE_HEX) :
505 case (Constants::BAM_TAG_TYPE_ARRAY) :
506 std::cerr << "BamAlignment ERROR: array element type: " << elementType
507 << " cannot be stored in integer value" << std::endl;
512 std::cerr << "BamAlignment ERROR: unknown element type encountered: "
513 << elementType << std::endl;
517 // get number of elements
519 memcpy(&numElements, pTagData, sizeof(int32_t));
522 destination.reserve(numElements);
// copy the elements out one at a time
// NOTE(review): the loop steps by sizeof(T), not by the elementLength computed
// above - confirm the two always agree for the T / element-type pairs that
// CanConvertFrom accepts.
526 for ( int i = 0 ; i < numElements; ++i ) {
527 memcpy(&value, pTagData, sizeof(T));
528 pTagData += sizeof(T);
529 destination.push_back(value);
// convenience alias for a vector of alignments
536 typedef std::vector<BamAlignment> BamAlignmentVector;
538 } // namespace BamTools
540 #endif // BAMALIGNMENT_H