1 // ***************************************************************************
2 // BamAlignment.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 19 September 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides the BamAlignment data structure
9 // ***************************************************************************
18 #include "BamAlignment.h"
19 using namespace BamTools;
23 BamAlignment::BamAlignment(void)
32 BamAlignment::BamAlignment(const BamAlignment& other)
34 , Length(other.Length)
35 , QueryBases(other.QueryBases)
36 , AlignedBases(other.AlignedBases)
37 , Qualities(other.Qualities)
38 , TagData(other.TagData)
40 , Position(other.Position)
42 , MapQuality(other.MapQuality)
43 , AlignmentFlag(other.AlignmentFlag)
44 , CigarData(other.CigarData)
45 , MateRefID(other.MateRefID)
46 , MatePosition(other.MatePosition)
47 , InsertSize(other.InsertSize)
48 , SupportData(other.SupportData)
52 BamAlignment::~BamAlignment(void) { }
54 // Queries against alignment flags
55 bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }
56 bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }
57 bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }
58 bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }
59 bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
60 bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }
61 bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }
62 bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }
63 bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }
64 bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }
65 bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }
67 // Manipulate alignment flags
68 void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; }
69 void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; }
70 void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; }
71 void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; }
72 void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; }
73 void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; }
74 void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; }
75 void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; }
76 void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; }
77 void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; }
78 void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; }
80 // calculates alignment end position, based on starting position and CIGAR operations
81 int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const {
83 // initialize alignment end to starting position
84 int alignEnd = Position;
86 // iterate over cigar operations
87 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
88 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
89 for ( ; cigarIter != cigarEnd; ++cigarIter) {
90 const char cigarType = (*cigarIter).Type;
91 if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' )
92 alignEnd += (*cigarIter).Length;
93 else if ( usePadded && cigarType == 'I' )
94 alignEnd += (*cigarIter).Length;
97 // adjust for zeroBased, if necessary
104 bool BamAlignment::AddTag(const string& tag, const string& type, const string& value) {
106 if ( SupportData.HasCoreOnly ) return false;
107 if ( tag.size() != 2 || type.size() != 1 ) return false;
108 if ( type != "Z" && type != "H" ) return false;
110 // localize the tag data
111 char* pTagData = (char*)TagData.data();
112 const unsigned int tagDataLength = TagData.size();
113 unsigned int numBytesParsed = 0;
115 // if tag already exists, return false
116 // use EditTag explicitly instead
117 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
119 // otherwise, copy tag data to temp buffer
120 string newTag = tag + type + value;
121 const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
122 char originalTagData[newTagDataLength];
123 memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
126 strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term
128 // store temp buffer back in TagData
129 const char* newTagData = (const char*)originalTagData;
130 TagData.assign(newTagData, newTagDataLength);
136 bool BamAlignment::AddTag(const string& tag, const string& type, const uint32_t& value) {
138 if ( SupportData.HasCoreOnly ) return false;
139 if ( tag.size() != 2 || type.size() != 1 ) return false;
140 if ( type == "f" || type == "Z" || type == "H" ) return false;
142 // localize the tag data
143 char* pTagData = (char*)TagData.data();
144 const unsigned int tagDataLength = TagData.size();
145 unsigned int numBytesParsed = 0;
147 // if tag already exists, return false
148 // use EditTag explicitly instead
149 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
151 // otherwise, convert value to string
152 union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;
155 // copy original tag data to temp buffer
156 string newTag = tag + type;
157 const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer
158 char originalTagData[newTagDataLength];
159 memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
162 strcat(originalTagData + tagDataLength, newTag.data());
163 memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int));
165 // store temp buffer back in TagData
166 const char* newTagData = (const char*)originalTagData;
167 TagData.assign(newTagData, newTagDataLength);
173 bool BamAlignment::AddTag(const string& tag, const string& type, const int32_t& value) {
174 return AddTag(tag, type, (const uint32_t&)value);
177 bool BamAlignment::AddTag(const string& tag, const string& type, const float& value) {
179 if ( SupportData.HasCoreOnly ) return false;
180 if ( tag.size() != 2 || type.size() != 1 ) return false;
181 if ( type == "Z" || type == "H" ) return false;
183 // localize the tag data
184 char* pTagData = (char*)TagData.data();
185 const unsigned int tagDataLength = TagData.size();
186 unsigned int numBytesParsed = 0;
188 // if tag already exists, return false
189 // use EditTag explicitly instead
190 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
192 // otherwise, convert value to string
193 union { float value; char valueBuffer[sizeof(float)]; } un;
196 // copy original tag data to temp buffer
197 string newTag = tag + type;
198 const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float
199 char originalTagData[newTagDataLength];
200 memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
203 strcat(originalTagData + tagDataLength, newTag.data());
204 memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float));
206 // store temp buffer back in TagData
207 const char* newTagData = (const char*)originalTagData;
208 TagData.assign(newTagData, newTagDataLength);
214 bool BamAlignment::EditTag(const string& tag, const string& type, const string& value) {
216 if ( SupportData.HasCoreOnly ) return false;
217 if ( tag.size() != 2 || type.size() != 1 ) return false;
218 if ( type != "Z" && type != "H" ) return false;
220 // localize the tag data
221 char* pOriginalTagData = (char*)TagData.data();
222 char* pTagData = pOriginalTagData;
223 const unsigned int originalTagDataLength = TagData.size();
225 unsigned int newTagDataLength = 0;
226 unsigned int numBytesParsed = 0;
228 // if tag found, store data in readGroup, return success
229 if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
231 // make sure array is more than big enough
232 char newTagData[originalTagDataLength + value.size()];
234 // copy original tag data up til desired tag
235 const unsigned int beginningTagDataLength = numBytesParsed;
236 newTagDataLength += beginningTagDataLength;
237 memcpy(newTagData, pOriginalTagData, numBytesParsed);
239 // copy new VALUE in place of current tag data
240 const unsigned int dataLength = strlen(value.c_str());
241 memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 );
243 // skip to next tag (if tag for removal is last, return true)
244 const char* pTagStorageType = pTagData - 1;
245 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;
247 // copy everything from current tag (the next one after tag for removal) to end
248 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
249 const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1;
250 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
251 memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
253 // ensure null-terminator
254 newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
257 TagData.assign(newTagData, endTagOffset + endTagDataLength);
261 // tag not found, attempt AddTag
262 else return AddTag(tag, type, value);
265 bool BamAlignment::EditTag(const string& tag, const string& type, const uint32_t& value) {
267 if ( SupportData.HasCoreOnly ) return false;
268 if ( tag.size() != 2 || type.size() != 1 ) return false;
269 if ( type == "f" || type == "Z" || type == "H" ) return false;
271 // localize the tag data
272 char* pOriginalTagData = (char*)TagData.data();
273 char* pTagData = pOriginalTagData;
274 const unsigned int originalTagDataLength = TagData.size();
276 unsigned int newTagDataLength = 0;
277 unsigned int numBytesParsed = 0;
279 // if tag found, store data in readGroup, return success
280 if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
282 // make sure array is more than big enough
283 char newTagData[originalTagDataLength + sizeof(value)];
285 // copy original tag data up til desired tag
286 const unsigned int beginningTagDataLength = numBytesParsed;
287 newTagDataLength += beginningTagDataLength;
288 memcpy(newTagData, pOriginalTagData, numBytesParsed);
290 // copy new VALUE in place of current tag data
291 union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;
293 memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int));
295 // skip to next tag (if tag for removal is last, return true)
296 const char* pTagStorageType = pTagData - 1;
297 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;
299 // copy everything from current tag (the next one after tag for removal) to end
300 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
301 const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int);
302 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
303 memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
305 // ensure null-terminator
306 newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
309 TagData.assign(newTagData, endTagOffset + endTagDataLength);
313 // tag not found, attempt AddTag
314 else return AddTag(tag, type, value);
317 bool BamAlignment::EditTag(const string& tag, const string& type, const int32_t& value) {
318 return EditTag(tag, type, (const uint32_t&)value);
321 bool BamAlignment::EditTag(const string& tag, const string& type, const float& value) {
323 if ( SupportData.HasCoreOnly ) return false;
324 if ( tag.size() != 2 || type.size() != 1 ) return false;
325 if ( type == "Z" || type == "H" ) return false;
327 // localize the tag data
328 char* pOriginalTagData = (char*)TagData.data();
329 char* pTagData = pOriginalTagData;
330 const unsigned int originalTagDataLength = TagData.size();
332 unsigned int newTagDataLength = 0;
333 unsigned int numBytesParsed = 0;
335 // if tag found, store data in readGroup, return success
336 if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
338 // make sure array is more than big enough
339 char newTagData[originalTagDataLength + sizeof(value)];
341 // copy original tag data up til desired tag
342 const unsigned int beginningTagDataLength = numBytesParsed;
343 newTagDataLength += beginningTagDataLength;
344 memcpy(newTagData, pOriginalTagData, numBytesParsed);
346 // copy new VALUE in place of current tag data
347 union { float value; char valueBuffer[sizeof(float)]; } un;
349 memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float));
351 // skip to next tag (if tag for removal is last, return true)
352 const char* pTagStorageType = pTagData - 1;
353 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;
355 // copy everything from current tag (the next one after tag for removal) to end
356 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
357 const unsigned int endTagOffset = beginningTagDataLength + sizeof(float);
358 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
359 memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
361 // ensure null-terminator
362 newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
365 TagData.assign(newTagData, endTagOffset + endTagDataLength);
369 // tag not found, attempt AddTag
370 else return AddTag(tag, type, value);
373 // get "NM" tag data - originally contributed by Aaron Quinlan
374 // stores data in 'editDistance', returns success/fail
375 bool BamAlignment::GetEditDistance(uint32_t& editDistance) const {
376 return GetTag("NM", (uint32_t&)editDistance);
380 // stores data in 'readGroup', returns success/fail
381 bool BamAlignment::GetReadGroup(string& readGroup) const {
382 return GetTag("RG", readGroup);
385 bool BamAlignment::GetTag(const string& tag, string& destination) const {
387 // make sure tag data exists
388 if ( SupportData.HasCoreOnly || TagData.empty() )
391 // localize the tag data
392 char* pTagData = (char*)TagData.data();
393 const unsigned int tagDataLength = TagData.size();
394 unsigned int numBytesParsed = 0;
396 // if tag found, store data in readGroup, return success
397 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
398 const unsigned int dataLength = strlen(pTagData);
400 destination.resize(dataLength);
401 memcpy( (char*)destination.data(), pTagData, dataLength );
405 // tag not found, return failure
409 bool BamAlignment::GetTag(const string& tag, uint32_t& destination) const {
411 // make sure tag data exists
412 if ( SupportData.HasCoreOnly || TagData.empty() )
415 // localize the tag data
416 char* pTagData = (char*)TagData.data();
417 const unsigned int tagDataLength = TagData.size();
418 unsigned int numBytesParsed = 0;
420 // if tag found, determine data byte-length, store data in readGroup, return success
421 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
423 // determine data byte-length
424 const char type = *(pTagData - 1);
425 int destinationLength = 0;
431 destinationLength = 1;
437 destinationLength = 2;
443 destinationLength = 4;
446 // unsupported type for integer destination (float or var-length strings)
450 fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type);
455 fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
459 // store in destination
461 memcpy(&destination, pTagData, destinationLength);
465 // tag not found, return failure
469 bool BamAlignment::GetTag(const string& tag, int32_t& destination) const {
470 return GetTag(tag, (uint32_t&)destination);
473 bool BamAlignment::GetTag(const string& tag, float& destination) const {
475 // make sure tag data exists
476 if ( SupportData.HasCoreOnly || TagData.empty() )
479 // localize the tag data
480 char* pTagData = (char*)TagData.data();
481 const unsigned int tagDataLength = TagData.size();
482 unsigned int numBytesParsed = 0;
484 // if tag found, determine data byte-length, store data in readGroup, return success
485 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
487 // determine data byte-length
488 const char type = *(pTagData - 1);
489 int destinationLength = 0;
496 destinationLength = 1;
502 destinationLength = 2;
509 destinationLength = 4;
512 // unsupported type (var-length strings)
515 fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type);
520 fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
524 // store in destination
526 memcpy(&destination, pTagData, destinationLength);
530 // tag not found, return failure
534 bool BamAlignment::GetTagType(const string& tag, char& type) const {
536 // make sure tag data exists
537 if ( SupportData.HasCoreOnly || TagData.empty() )
540 // localize the tag data
541 char* pTagData = (char*)TagData.data();
542 const unsigned int tagDataLength = TagData.size();
543 unsigned int numBytesParsed = 0;
546 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
548 // retrieve tag type code
549 type = *(pTagData - 1);
551 // validate that type is a proper BAM tag type
567 fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
572 // tag not found, return failure
576 bool BamAlignment::RemoveTag(const string& tag) {
578 // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed
579 // also, return false if no data present to remove
580 if ( SupportData.HasCoreOnly || TagData.empty() ) return false;
582 // localize the tag data
583 char* pOriginalTagData = (char*)TagData.data();
584 char* pTagData = pOriginalTagData;
585 const unsigned int originalTagDataLength = TagData.size();
586 unsigned int newTagDataLength = 0;
587 unsigned int numBytesParsed = 0;
589 // if tag found, store data in readGroup, return success
590 if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
592 char newTagData[originalTagDataLength];
594 // copy original tag data up til desired tag
597 const unsigned int beginningTagDataLength = numBytesParsed;
598 newTagDataLength += beginningTagDataLength;
599 memcpy(newTagData, pOriginalTagData, numBytesParsed);
601 // skip to next tag (if tag for removal is last, return true)
602 const char* pTagStorageType = pTagData + 2;
605 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;
607 // copy everything from current tag (the next one after tag for removal) to end
608 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
609 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
610 memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength );
613 TagData.assign(newTagData, beginningTagDataLength + endTagDataLength);
617 // tag not found, no removal - return failure
621 bool BamAlignment::FindTag(const string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) {
623 while ( numBytesParsed < tagDataLength ) {
625 const char* pTagType = pTagData;
626 const char* pTagStorageType = pTagData + 2;
630 // check the current tag, return true on match
631 if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
634 // get the storage class and find the next tag
635 if ( *pTagStorageType == '\0' ) return false;
636 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
637 if ( *pTagData == '\0' ) return false;
640 // checked all tags, none match
644 bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
646 switch(storageType) {
674 // increment for null-terminator
681 fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType);