1 // ***************************************************************************
2 // BamAlignment.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 22 February 2012 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
10 #include "api/BamAlignment.h"
11 #include "api/BamConstants.h"
12 using namespace BamTools;
15 /*! \class BamTools::BamAlignment
16 \brief The main BAM alignment data structure.
18 Provides methods to query/modify BAM alignment data fields.
20 /*! \var BamAlignment::Name
23 /*! \var BamAlignment::Length
24 \brief length of query sequence
26 /*! \var BamAlignment::QueryBases
27 \brief 'original' sequence (as reported from sequencing machine)
29 /*! \var BamAlignment::AlignedBases
30 \brief 'aligned' sequence (includes any indels, padding, clipping)
32 /*! \var BamAlignment::Qualities
33 \brief FASTQ qualities (ASCII characters, not numeric values)
35 /*! \var BamAlignment::TagData
36 \brief tag data (use the provided methods to query/modify)
38 /*! \var BamAlignment::RefID
39 \brief ID number for reference sequence
41 /*! \var BamAlignment::Position
42 \brief position (0-based) where alignment starts
44 /*! \var BamAlignment::Bin
45 \brief BAM (standard) index bin number for this alignment
47 /*! \var BamAlignment::MapQuality
48 \brief mapping quality score
50 /*! \var BamAlignment::AlignmentFlag
51 \brief alignment bit-flag (use the provided methods to query/modify)
53 /*! \var BamAlignment::CigarData
54 \brief CIGAR operations for this alignment
56 /*! \var BamAlignment::MateRefID
57 \brief ID number for reference sequence where alignment's mate was aligned
59 /*! \var BamAlignment::MatePosition
60 \brief position (0-based) where alignment's mate starts
62 /*! \var BamAlignment::InsertSize
63 \brief mate-pair insert size
65 /*! \var BamAlignment::Filename
66 \brief name of BAM file which this alignment comes from
69 /*! \fn BamAlignment::BamAlignment(void)
72 BamAlignment::BamAlignment(void)
84 /*! \fn BamAlignment::BamAlignment(const BamAlignment& other)
85 \brief copy constructor
87 BamAlignment::BamAlignment(const BamAlignment& other)
89 , Length(other.Length)
90 , QueryBases(other.QueryBases)
91 , AlignedBases(other.AlignedBases)
92 , Qualities(other.Qualities)
93 , TagData(other.TagData)
95 , Position(other.Position)
97 , MapQuality(other.MapQuality)
98 , AlignmentFlag(other.AlignmentFlag)
99 , CigarData(other.CigarData)
100 , MateRefID(other.MateRefID)
101 , MatePosition(other.MatePosition)
102 , InsertSize(other.InsertSize)
103 , Filename(other.Filename)
104 , SupportData(other.SupportData)
107 /*! \fn BamAlignment::~BamAlignment(void)
110 BamAlignment::~BamAlignment(void) { }
112 /*! \fn bool BamAlignment::BuildCharData(void)
113 \brief Populates alignment string fields (read name, bases, qualities, tag data).
115 An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data.
116 Using that method makes parsing much quicker when only positional data is required.
118 However, if you later want to access the character data fields from such an alignment,
119 use this method to populate those fields. Provides ability to do 'lazy evaluation' of
122 \return \c true if character data populated successfully (or was already available to begin with)
124 bool BamAlignment::BuildCharData(void) {
126 // skip if char data already parsed
127 if ( !SupportData.HasCoreOnly )
130 // check system endianness
131 bool IsBigEndian = BamTools::SystemIsBigEndian();
133 // calculate character lengths/offsets
134 const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE;
135 const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations*4);
136 const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2;
137 const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength;
138 const unsigned int tagDataLength = dataLength - tagDataOffset;
140 // check offsets to see what char data exists
141 const bool hasSeqData = ( seqDataOffset < dataLength );
142 const bool hasQualData = ( qualDataOffset < dataLength );
143 const bool hasTagData = ( tagDataOffset < dataLength );
145 // set up char buffers
146 const char* allCharData = SupportData.AllCharData.data();
147 const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 );
148 const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 );
149 char* tagData = ( hasTagData ? (((char*)allCharData) + tagDataOffset) : (char*)0 );
151 // store alignment name (relies on null char in name as terminator)
152 Name.assign((const char*)(allCharData));
154 // save query sequence
157 QueryBases.reserve(SupportData.QuerySequenceLength);
158 for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
159 const char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
160 QueryBases.append(1, singleBase);
164 // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
167 Qualities.reserve(SupportData.QuerySequenceLength);
168 for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
169 const char singleQuality = static_cast<const char>(qualData[i]+33);
170 Qualities.append(1, singleQuality);
174 // clear previous AlignedBases
175 AlignedBases.clear();
177 // if QueryBases has data, build AlignedBases using CIGAR data
178 // otherwise, AlignedBases will remain empty (this case IS allowed)
179 if ( !QueryBases.empty() ) {
181 // resize AlignedBases
182 AlignedBases.reserve(SupportData.QuerySequenceLength);
184 // iterate over CigarOps
186 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
187 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
188 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
189 const CigarOp& op = (*cigarIter);
193 // for 'M', 'I', '=', 'X' - write bases
194 case (Constants::BAM_CIGAR_MATCH_CHAR) :
195 case (Constants::BAM_CIGAR_INS_CHAR) :
196 case (Constants::BAM_CIGAR_SEQMATCH_CHAR) :
197 case (Constants::BAM_CIGAR_MISMATCH_CHAR) :
198 AlignedBases.append(QueryBases.substr(k, op.Length));
201 // for 'S' - soft clip, do not write bases
202 // but increment placeholder 'k'
203 case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) :
207 // for 'D' - write gap character
208 case (Constants::BAM_CIGAR_DEL_CHAR) :
209 AlignedBases.append(op.Length, Constants::BAM_DNA_DEL);
212 // for 'P' - write padding character
213 case (Constants::BAM_CIGAR_PAD_CHAR) :
214 AlignedBases.append( op.Length, Constants::BAM_DNA_PAD );
217 // for 'N' - write N's, skip bases in original query sequence
218 case (Constants::BAM_CIGAR_REFSKIP_CHAR) :
219 AlignedBases.append( op.Length, Constants::BAM_DNA_N );
222 // for 'H' - hard clip, do nothing to AlignedBases, move to next op
223 case (Constants::BAM_CIGAR_HARDCLIP_CHAR) :
226 // invalid CIGAR op-code
228 const string message = string("invalid CIGAR operation type: ") + op.Type;
229 SetErrorString("BamAlignment::BuildCharData", message);
240 while ( i < tagDataLength ) {
242 i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
243 const char type = tagData[i]; // get tag type at position i
244 ++i; // move i past tag type
248 case(Constants::BAM_TAG_TYPE_ASCII) :
249 case(Constants::BAM_TAG_TYPE_INT8) :
250 case(Constants::BAM_TAG_TYPE_UINT8) :
251 // no endian swapping necessary for single-byte data
255 case(Constants::BAM_TAG_TYPE_INT16) :
256 case(Constants::BAM_TAG_TYPE_UINT16) :
257 BamTools::SwapEndian_16p(&tagData[i]);
258 i += sizeof(uint16_t);
261 case(Constants::BAM_TAG_TYPE_FLOAT) :
262 case(Constants::BAM_TAG_TYPE_INT32) :
263 case(Constants::BAM_TAG_TYPE_UINT32) :
264 BamTools::SwapEndian_32p(&tagData[i]);
265 i += sizeof(uint32_t);
268 case(Constants::BAM_TAG_TYPE_HEX) :
269 case(Constants::BAM_TAG_TYPE_STRING) :
270 // no endian swapping necessary for hex-string/string data
273 // increment one more for null terminator
277 case(Constants::BAM_TAG_TYPE_ARRAY) :
281 const char arrayType = tagData[i];
284 // swap endian-ness of number of elements in place, then retrieve for loop
285 BamTools::SwapEndian_32p(&tagData[i]);
286 uint32_t numElements;
287 memcpy(&numElements, &tagData[i], sizeof(uint32_t));
288 i += sizeof(uint32_t);
290 // swap endian-ness of array elements
291 for ( size_t j = 0; j < numElements; ++j ) {
293 case (Constants::BAM_TAG_TYPE_INT8) :
294 case (Constants::BAM_TAG_TYPE_UINT8) :
295 // no endian-swapping necessary
298 case (Constants::BAM_TAG_TYPE_INT16) :
299 case (Constants::BAM_TAG_TYPE_UINT16) :
300 BamTools::SwapEndian_16p(&tagData[i]);
301 i += sizeof(uint16_t);
303 case (Constants::BAM_TAG_TYPE_FLOAT) :
304 case (Constants::BAM_TAG_TYPE_INT32) :
305 case (Constants::BAM_TAG_TYPE_UINT32) :
306 BamTools::SwapEndian_32p(&tagData[i]);
307 i += sizeof(uint32_t);
310 const string message = string("invalid binary array type: ") + arrayType;
311 SetErrorString("BamAlignment::BuildCharData", message);
319 // invalid tag type-code
321 const string message = string("invalid tag type: ") + type;
322 SetErrorString("BamAlignment::BuildCharData", message);
328 // store tagData in alignment
329 TagData.resize(tagDataLength);
330 memcpy((char*)(TagData.data()), tagData, tagDataLength);
333 // clear core-only flag & return success
334 SupportData.HasCoreOnly = false;
338 /*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) const
341 Searches for requested tag in BAM tag data.
343 \param[in] tag requested 2-character tag name
344 \param[in,out] pTagData pointer to current position in BamAlignment::TagData
345 \param[in] tagDataLength length of BamAlignment::TagData
346 \param[in,out] numBytesParsed number of bytes parsed so far
348 \return \c true if found
350 \post If \a tag is found, \a pTagData will point to the byte where the tag data begins.
351 \a numBytesParsed will correspond to the position in the full TagData string.
354 bool BamAlignment::FindTag(const std::string& tag,
356 const unsigned int& tagDataLength,
357 unsigned int& numBytesParsed) const
360 while ( numBytesParsed < tagDataLength ) {
362 const char* pTagType = pTagData;
363 const char* pTagStorageType = pTagData + 2;
367 // check the current tag, return true on match
368 if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
371 // get the storage class and find the next tag
372 if ( *pTagStorageType == '\0' ) return false;
373 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
374 if ( *pTagData == '\0' ) return false;
377 // checked all tags, none match
381 /*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = false) const
382 \brief Calculates alignment end position, based on its starting position and CIGAR data.
384 \warning The position returned now represents a zero-based, HALF-OPEN interval.
385 In previous versions of BamTools (0.x & 1.x) all intervals were treated
386 as zero-based, CLOSED.
388 \param[in] usePadded Allow inserted bases to affect the reported position. Default is
389 false, so that reported position stays synced with reference
391 \param[in] closedInterval Setting this to true will return a 0-based end coordinate. Default is
392 false, so that his value represents a standard, half-open interval.
394 \return alignment end position
396 int BamAlignment::GetEndPosition(bool usePadded, bool closedInterval) const {
398 // initialize alignment end to starting position
399 int alignEnd = Position;
401 // iterate over cigar operations
402 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
403 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
404 for ( ; cigarIter != cigarEnd; ++cigarIter) {
405 const CigarOp& op = (*cigarIter);
409 // increase end position on CIGAR chars [DMXN=]
410 case Constants::BAM_CIGAR_DEL_CHAR :
411 case Constants::BAM_CIGAR_MATCH_CHAR :
412 case Constants::BAM_CIGAR_MISMATCH_CHAR :
413 case Constants::BAM_CIGAR_REFSKIP_CHAR :
414 case Constants::BAM_CIGAR_SEQMATCH_CHAR :
415 alignEnd += op.Length;
418 // increase end position on insertion, only if @usePadded is true
419 case Constants::BAM_CIGAR_INS_CHAR :
421 alignEnd += op.Length;
424 // all other CIGAR chars do not affect end position
430 // adjust for closedInterval, if requested
431 if ( closedInterval )
438 /*! \fn std::string BamAlignment::GetErrorString(void) const
439 \brief Returns a human-readable description of the last error that occurred
441 This method allows elimination of STDERR pollution. Developers of client code
442 may choose how the messages are displayed to the user, if at all.
444 \return error description
446 std::string BamAlignment::GetErrorString(void) const {
450 /*! \fn bool BamAlignment::GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions, std::vector<int>& genomePositions, bool usePadded = false) const
451 \brief Identifies if an alignment has a soft clip. If so, identifies the
452 sizes of the soft clips, as well as their positions in the read and reference.
454 \param[out] clipSizes vector of the sizes of each soft clip in the alignment
455 \param[out] readPositions vector of the 0-based read locations of each soft clip in the alignment.
456 These positions are basically indexes within the read, not genomic positions.
457 \param[out] genomePositions vector of the 0-based genome locations of each soft clip in the alignment
458 \param[in] usePadded inserted bases affect reported position. Default is false, so that
459 reported position stays 'sync-ed' with reference coordinates.
461 \return \c true if any soft clips were found in the alignment
463 bool BamAlignment::GetSoftClips(vector<int>& clipSizes,
464 vector<int>& readPositions,
465 vector<int>& genomePositions,
466 bool usePadded) const
468 // initialize positions & flags
469 int refPosition = Position;
470 int readPosition = 0;
471 bool softClipFound = false;
472 bool firstCigarOp = true;
474 // iterate over cigar operations
475 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
476 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
477 for ( ; cigarIter != cigarEnd; ++cigarIter) {
478 const CigarOp& op = (*cigarIter);
482 // increase both read & genome positions on CIGAR chars [DMXN=]
483 case Constants::BAM_CIGAR_DEL_CHAR :
484 case Constants::BAM_CIGAR_MATCH_CHAR :
485 case Constants::BAM_CIGAR_MISMATCH_CHAR :
486 case Constants::BAM_CIGAR_REFSKIP_CHAR :
487 case Constants::BAM_CIGAR_SEQMATCH_CHAR :
488 refPosition += op.Length;
489 readPosition += op.Length;
492 // increase read position on insertion, genome position only if @usePadded is true
493 case Constants::BAM_CIGAR_INS_CHAR :
494 readPosition += op.Length;
496 refPosition += op.Length;
499 case Constants::BAM_CIGAR_SOFTCLIP_CHAR :
501 softClipFound = true;
503 //////////////////////////////////////////////////////////////////////////////
504 // if we are dealing with the *first* CIGAR operation
505 // for this alignment, we increment the read position so that
506 // the read and genome position of the clip are referring to the same base.
507 // For example, in the alignment below, the ref position would be 4, yet
508 // the read position would be 0. Thus, to "sync" the two,
509 // we need to increment the read position by the length of the
511 // Read: ATCGTTTCGTCCCTGC
512 // Ref: GGGATTTCGTCCCTGC
513 // Cigar: SSSSMMMMMMMMMMMM
515 // NOTE: This only needs to be done if the soft clip is the _first_ CIGAR op.
516 //////////////////////////////////////////////////////////////////////////////
518 readPosition += op.Length;
520 // track the soft clip's size, read position, and genome position
521 clipSizes.push_back(op.Length);
522 readPositions.push_back(readPosition);
523 genomePositions.push_back(refPosition);
525 // any other CIGAR operations have no effect
530 // clear our "first pass" flag
531 firstCigarOp = false;
534 // return whether any soft clips found
535 return softClipFound;
538 /*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const
539 \brief Retrieves the BAM tag type-code associated with requested tag name.
541 \param[in] tag 2-character tag name
542 \param[out] type retrieved (1-character) type-code
544 \return \c true if found
545 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
547 bool BamAlignment::GetTagType(const std::string& tag, char& type) const {
549 // skip if alignment is core-only
550 if ( SupportData.HasCoreOnly ) {
551 // TODO: set error string?
555 // skip if no tags present
556 if ( TagData.empty() ) {
557 // TODO: set error string?
561 // localize the tag data
562 char* pTagData = (char*)TagData.data();
563 const unsigned int tagDataLength = TagData.size();
564 unsigned int numBytesParsed = 0;
566 // if tag not found, return failure
567 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ){
568 // TODO: set error string?
572 // otherwise, retrieve & validate tag type code
573 type = *(pTagData - 1);
575 case (Constants::BAM_TAG_TYPE_ASCII) :
576 case (Constants::BAM_TAG_TYPE_INT8) :
577 case (Constants::BAM_TAG_TYPE_UINT8) :
578 case (Constants::BAM_TAG_TYPE_INT16) :
579 case (Constants::BAM_TAG_TYPE_UINT16) :
580 case (Constants::BAM_TAG_TYPE_INT32) :
581 case (Constants::BAM_TAG_TYPE_UINT32) :
582 case (Constants::BAM_TAG_TYPE_FLOAT) :
583 case (Constants::BAM_TAG_TYPE_STRING) :
584 case (Constants::BAM_TAG_TYPE_HEX) :
585 case (Constants::BAM_TAG_TYPE_ARRAY) :
590 const string message = string("invalid tag type: ") + type;
591 SetErrorString("BamAlignment::GetTagType", message);
596 /*! \fn bool BamAlignment::HasTag(const std::string& tag) const
597 \brief Returns true if alignment has a record for requested tag.
599 \param[in] tag 2-character tag name
600 \return \c true if alignment has a record for tag
602 bool BamAlignment::HasTag(const std::string& tag) const {
604 // return false if no tag data present
605 if ( SupportData.HasCoreOnly || TagData.empty() )
608 // localize the tag data for lookup
609 char* pTagData = (char*)TagData.data();
610 const unsigned int tagDataLength = TagData.size();
611 unsigned int numBytesParsed = 0;
613 // if result of tag lookup
614 return FindTag(tag, pTagData, tagDataLength, numBytesParsed);
617 /*! \fn bool BamAlignment::IsDuplicate(void) const
618 \return \c true if this read is a PCR duplicate
620 bool BamAlignment::IsDuplicate(void) const {
621 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_DUPLICATE) != 0 );
624 /*! \fn bool BamAlignment::IsFailedQC(void) const
625 \return \c true if this read failed quality control
627 bool BamAlignment::IsFailedQC(void) const {
628 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_QC_FAILED) != 0 );
631 /*! \fn bool BamAlignment::IsFirstMate(void) const
632 \return \c true if alignment is first mate on paired-end read
634 bool BamAlignment::IsFirstMate(void) const {
635 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_1) != 0 );
638 /*! \fn bool BamAlignment::IsMapped(void) const
639 \return \c true if alignment is mapped
641 bool BamAlignment::IsMapped(void) const {
642 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_UNMAPPED) == 0 );
645 /*! \fn bool BamAlignment::IsMateMapped(void) const
646 \return \c true if alignment's mate is mapped
648 bool BamAlignment::IsMateMapped(void) const {
649 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_UNMAPPED) == 0 );
652 /*! \fn bool BamAlignment::IsMateReverseStrand(void) const
653 \return \c true if alignment's mate mapped to reverse strand
655 bool BamAlignment::IsMateReverseStrand(void) const {
656 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND) != 0 );
659 /*! \fn bool BamAlignment::IsPaired(void) const
660 \return \c true if alignment part of paired-end read
662 bool BamAlignment::IsPaired(void) const {
663 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PAIRED) != 0 );
666 /*! \fn bool BamAlignment::IsPrimaryAlignment(void) const
667 \return \c true if reported position is primary alignment
669 bool BamAlignment::IsPrimaryAlignment(void) const {
670 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_SECONDARY) == 0 );
673 /*! \fn bool BamAlignment::IsProperPair(void) const
674 \return \c true if alignment is part of read that satisfied paired-end resolution
676 bool BamAlignment::IsProperPair(void) const {
677 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PROPER_PAIR) != 0 );
680 /*! \fn bool BamAlignment::IsReverseStrand(void) const
681 \return \c true if alignment mapped to reverse strand
683 bool BamAlignment::IsReverseStrand(void) const {
684 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_REVERSE_STRAND) != 0 );
687 /*! \fn bool BamAlignment::IsSecondMate(void) const
688 \return \c true if alignment is second mate on read
690 bool BamAlignment::IsSecondMate(void) const {
691 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 );
694 /*! \fn bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const
697 Checks that tag name & type strings are expected sizes.
699 \param tag[in] BAM tag name
700 \param type[in] BAM tag type-code
701 \return \c true if both input strings are valid sizes
703 bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const {
704 return (tag.size() == Constants::BAM_TAG_TAGSIZE) &&
705 (type.size() == Constants::BAM_TAG_TYPESIZE);
708 /*! \fn void BamAlignment::RemoveTag(const std::string& tag)
709 \brief Removes field from BAM tags.
711 \param[in] tag 2-character name of field to remove
713 void BamAlignment::RemoveTag(const std::string& tag) {
715 // if char data not populated, do that first
716 if ( SupportData.HasCoreOnly )
719 // skip if no tags available
720 if ( TagData.empty() )
723 // localize the tag data
724 char* pOriginalTagData = (char*)TagData.data();
725 char* pTagData = pOriginalTagData;
726 const unsigned int originalTagDataLength = TagData.size();
727 unsigned int newTagDataLength = 0;
728 unsigned int numBytesParsed = 0;
730 // skip if tag not found
731 if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
734 // otherwise, remove it
735 RaiiBuffer newTagData(originalTagDataLength);
737 // copy original tag data up til desired tag
740 const unsigned int beginningTagDataLength = numBytesParsed;
741 newTagDataLength += beginningTagDataLength;
742 memcpy(newTagData.Buffer, pOriginalTagData, numBytesParsed);
744 // attemp to skip to next tag
745 const char* pTagStorageType = pTagData + 2;
748 if ( SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) {
750 // squeeze remaining tag data
751 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
752 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
753 memcpy(newTagData.Buffer + beginningTagDataLength, pTagData, endTagDataLength );
755 // save modified tag data in alignment
756 TagData.assign(newTagData.Buffer, beginningTagDataLength + endTagDataLength);
760 /*! \fn void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const
763 Sets a formatted error string for this alignment.
765 \param[in] where class/method where error occurred
766 \param[in] what description of error
768 void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const {
769 static const string SEPARATOR = ": ";
770 ErrorString = where + SEPARATOR + what;
773 /*! \fn void BamAlignment::SetIsDuplicate(bool ok)
774 \brief Sets value of "PCR duplicate" flag to \a ok.
776 void BamAlignment::SetIsDuplicate(bool ok) {
777 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_DUPLICATE;
778 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_DUPLICATE;
781 /*! \fn void BamAlignment::SetIsFailedQC(bool ok)
782 \brief Sets "failed quality control" flag to \a ok.
784 void BamAlignment::SetIsFailedQC(bool ok) {
785 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_QC_FAILED;
786 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_QC_FAILED;
789 /*! \fn void BamAlignment::SetIsFirstMate(bool ok)
790 \brief Sets "alignment is first mate" flag to \a ok.
792 void BamAlignment::SetIsFirstMate(bool ok) {
793 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_1;
794 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_1;
797 /*! \fn void BamAlignment::SetIsMapped(bool ok)
798 \brief Sets "alignment is mapped" flag to \a ok.
800 void BamAlignment::SetIsMapped(bool ok) {
801 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_UNMAPPED;
802 else AlignmentFlag |= Constants::BAM_ALIGNMENT_UNMAPPED;
805 /*! \fn void BamAlignment::SetIsMateMapped(bool ok)
806 \brief Sets "alignment's mate is mapped" flag to \a ok.
808 void BamAlignment::SetIsMateMapped(bool ok) {
809 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
810 else AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
813 /*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok)
814 \brief Sets "alignment's mate mapped to reverse strand" flag to \a ok.
816 void BamAlignment::SetIsMateReverseStrand(bool ok) {
817 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND;
818 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND;
821 /*! \fn void BamAlignment::SetIsPaired(bool ok)
822 \brief Sets "alignment part of paired-end read" flag to \a ok.
824 void BamAlignment::SetIsPaired(bool ok) {
825 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PAIRED;
826 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PAIRED;
829 /*! \fn void BamAlignment::SetIsPrimaryAlignment(bool ok)
830 \brief Sets "position is primary alignment" flag to \a ok.
832 void BamAlignment::SetIsPrimaryAlignment(bool ok) {
833 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_SECONDARY;
834 else AlignmentFlag |= Constants::BAM_ALIGNMENT_SECONDARY;
837 /*! \fn void BamAlignment::SetIsProperPair(bool ok)
838 \brief Sets "alignment is part of read that satisfied paired-end resolution" flag to \a ok.
840 void BamAlignment::SetIsProperPair(bool ok) {
841 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PROPER_PAIR;
842 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PROPER_PAIR;
845 /*! \fn void BamAlignment::SetIsReverseStrand(bool ok)
846 \brief Sets "alignment mapped to reverse strand" flag to \a ok.
848 void BamAlignment::SetIsReverseStrand(bool ok) {
849 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_REVERSE_STRAND;
850 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_REVERSE_STRAND;
853 /*! \fn void BamAlignment::SetIsSecondMate(bool ok)
854 \brief Sets "alignment is second mate on read" flag to \a ok.
856 void BamAlignment::SetIsSecondMate(bool ok) {
857 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_2;
858 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_2;
861 /*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const
864 Moves to next available tag in tag data string
866 \param[in] storageType BAM tag type-code that determines how far to move cursor
867 \param[in,out] pTagData pointer to current position (cursor) in tag string
868 \param[in,out] numBytesParsed report of how many bytes were parsed (cumulatively)
870 \return \c if storageType was a recognized BAM tag type
872 \post \a pTagData will point to the byte where the next tag data begins.
873 \a numBytesParsed will correspond to the cursor's position in the full TagData string.
875 bool BamAlignment::SkipToNextTag(const char storageType,
877 unsigned int& numBytesParsed) const
879 switch (storageType) {
881 case (Constants::BAM_TAG_TYPE_ASCII) :
882 case (Constants::BAM_TAG_TYPE_INT8) :
883 case (Constants::BAM_TAG_TYPE_UINT8) :
888 case (Constants::BAM_TAG_TYPE_INT16) :
889 case (Constants::BAM_TAG_TYPE_UINT16) :
890 numBytesParsed += sizeof(uint16_t);
891 pTagData += sizeof(uint16_t);
894 case (Constants::BAM_TAG_TYPE_FLOAT) :
895 case (Constants::BAM_TAG_TYPE_INT32) :
896 case (Constants::BAM_TAG_TYPE_UINT32) :
897 numBytesParsed += sizeof(uint32_t);
898 pTagData += sizeof(uint32_t);
901 case (Constants::BAM_TAG_TYPE_STRING) :
902 case (Constants::BAM_TAG_TYPE_HEX) :
907 // increment for null-terminator
912 case (Constants::BAM_TAG_TYPE_ARRAY) :
916 const char arrayType = *pTagData;
920 // read number of elements
922 memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped, if needed
923 numBytesParsed += sizeof(uint32_t);
924 pTagData += sizeof(uint32_t);
926 // calculate number of bytes to skip
929 case (Constants::BAM_TAG_TYPE_INT8) :
930 case (Constants::BAM_TAG_TYPE_UINT8) :
931 bytesToSkip = numElements;
933 case (Constants::BAM_TAG_TYPE_INT16) :
934 case (Constants::BAM_TAG_TYPE_UINT16) :
935 bytesToSkip = numElements*sizeof(uint16_t);
937 case (Constants::BAM_TAG_TYPE_FLOAT) :
938 case (Constants::BAM_TAG_TYPE_INT32) :
939 case (Constants::BAM_TAG_TYPE_UINT32) :
940 bytesToSkip = numElements*sizeof(uint32_t);
943 const string message = string("invalid binary array type: ") + arrayType;
944 SetErrorString("BamAlignment::SkipToNextTag", message);
948 // skip binary array contents
949 numBytesParsed += bytesToSkip;
950 pTagData += bytesToSkip;
955 const string message = string("invalid tag type: ") + storageType;
956 SetErrorString("BamAlignment::SkipToNextTag", message);
960 // if we get here, tag skipped OK - return success