1 // ***************************************************************************
2 // BamAlignment.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 13 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
10 #include "api/BamAlignment.h"
11 #include "api/BamConstants.h"
12 using namespace BamTools;
15 /*! \class BamTools::BamAlignment
16 \brief The main BAM alignment data structure.
18 Provides methods to query/modify BAM alignment data fields.
20 /*! \var BamAlignment::Name
23 /*! \var BamAlignment::Length
24 \brief length of query sequence
26 /*! \var BamAlignment::QueryBases
27 \brief 'original' sequence (as reported from sequencing machine)
29 /*! \var BamAlignment::AlignedBases
30 \brief 'aligned' sequence (includes any indels, padding, clipping)
32 /*! \var BamAlignment::Qualities
33 \brief FASTQ qualities (ASCII characters, not numeric values)
35 /*! \var BamAlignment::TagData
36 \brief tag data (use the provided methods to query/modify)
38 /*! \var BamAlignment::RefID
39 \brief ID number for reference sequence
41 /*! \var BamAlignment::Position
42 \brief position (0-based) where alignment starts
44 /*! \var BamAlignment::Bin
45 \brief BAM (standard) index bin number for this alignment
47 /*! \var BamAlignment::MapQuality
48 \brief mapping quality score
50 /*! \var BamAlignment::AlignmentFlag
51 \brief alignment bit-flag (use the provided methods to query/modify)
53 /*! \var BamAlignment::CigarData
54 \brief CIGAR operations for this alignment
56 /*! \var BamAlignment::MateRefID
57 \brief ID number for reference sequence where alignment's mate was aligned
59 /*! \var BamAlignment::MatePosition
60 \brief position (0-based) where alignment's mate starts
62 /*! \var BamAlignment::InsertSize
63 \brief mate-pair insert size
65 /*! \var BamAlignment::Filename
66 \brief name of BAM file which this alignment comes from
69 /*! \fn BamAlignment::BamAlignment(void)
72 BamAlignment::BamAlignment(void)
80 /*! \fn BamAlignment::BamAlignment(const BamAlignment& other)
81 \brief copy constructor
83 BamAlignment::BamAlignment(const BamAlignment& other)
85 , Length(other.Length)
86 , QueryBases(other.QueryBases)
87 , AlignedBases(other.AlignedBases)
88 , Qualities(other.Qualities)
89 , TagData(other.TagData)
91 , Position(other.Position)
93 , MapQuality(other.MapQuality)
94 , AlignmentFlag(other.AlignmentFlag)
95 , CigarData(other.CigarData)
96 , MateRefID(other.MateRefID)
97 , MatePosition(other.MatePosition)
98 , InsertSize(other.InsertSize)
99 , Filename(other.Filename)
100 , SupportData(other.SupportData)
103 /*! \fn BamAlignment::~BamAlignment(void)
106 BamAlignment::~BamAlignment(void) { }
108 /*! \fn bool BamAlignment::BuildCharData(void)
109 \brief Populates alignment string fields (read name, bases, qualities, tag data).
111 An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data.
112 Using that method makes parsing much quicker when only positional data is required.
114 However, if you later want to access the character data fields from such an alignment,
115 use this method to populate those fields. Provides ability to do 'lazy evaluation' of
118 \return \c true if character data populated successfully (or was already available to begin with)
120 bool BamAlignment::BuildCharData(void) {
122 // skip if char data already parsed
123 if ( !SupportData.HasCoreOnly )
126 // check system endianness
127 bool IsBigEndian = BamTools::SystemIsBigEndian();
129 // calculate character lengths/offsets
130 const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE;
131 const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations*4);
132 const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2;
133 const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength;
134 const unsigned int tagDataLength = dataLength - tagDataOffset;
136 // check offsets to see what char data exists
137 const bool hasSeqData = ( seqDataOffset < dataLength );
138 const bool hasQualData = ( qualDataOffset < dataLength );
139 const bool hasTagData = ( tagDataOffset < dataLength );
141 // set up char buffers
142 const char* allCharData = SupportData.AllCharData.data();
143 const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 );
144 const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 );
145 char* tagData = ( hasTagData ? (((char*)allCharData) + tagDataOffset) : (char*)0 );
147 // store alignment name (relies on null char in name as terminator)
148 Name.assign((const char*)(allCharData));
150 // save query sequence
153 QueryBases.reserve(SupportData.QuerySequenceLength);
154 for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
155 const char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
156 QueryBases.append(1, singleBase);
160 // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
163 Qualities.reserve(SupportData.QuerySequenceLength);
164 for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
165 const char singleQuality = static_cast<const char>(qualData[i]+33);
166 Qualities.append(1, singleQuality);
170 // clear previous AlignedBases
171 AlignedBases.clear();
173 // if QueryBases has data, build AlignedBases using CIGAR data
174 // otherwise, AlignedBases will remain empty (this case IS allowed)
175 if ( !QueryBases.empty() ) {
177 // resize AlignedBases
178 AlignedBases.reserve(SupportData.QuerySequenceLength);
180 // iterate over CigarOps
182 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
183 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
184 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
185 const CigarOp& op = (*cigarIter);
189 // for 'M', 'I', '=', 'X' - write bases
190 case (Constants::BAM_CIGAR_MATCH_CHAR) :
191 case (Constants::BAM_CIGAR_INS_CHAR) :
192 case (Constants::BAM_CIGAR_SEQMATCH_CHAR) :
193 case (Constants::BAM_CIGAR_MISMATCH_CHAR) :
194 AlignedBases.append(QueryBases.substr(k, op.Length));
197 // for 'S' - soft clip, do not write bases
198 // but increment placeholder 'k'
199 case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) :
203 // for 'D' - write gap character
204 case (Constants::BAM_CIGAR_DEL_CHAR) :
205 AlignedBases.append(op.Length, Constants::BAM_DNA_DEL);
208 // for 'P' - write padding character
209 case (Constants::BAM_CIGAR_PAD_CHAR) :
210 AlignedBases.append( op.Length, Constants::BAM_DNA_PAD );
213 // for 'N' - write N's, skip bases in original query sequence
214 case (Constants::BAM_CIGAR_REFSKIP_CHAR) :
215 AlignedBases.append( op.Length, Constants::BAM_DNA_N );
218 // for 'H' - hard clip, do nothing to AlignedBases, move to next op
219 case (Constants::BAM_CIGAR_HARDCLIP_CHAR) :
222 // invalid CIGAR op-code
224 const string message = string("invalid CIGAR operation type: ") + op.Type;
225 SetErrorString("BamAlignment::BuildCharData", message);
236 while ( i < tagDataLength ) {
238 i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
239 const char type = tagData[i]; // get tag type at position i
240 ++i; // move i past tag type
244 case(Constants::BAM_TAG_TYPE_ASCII) :
245 case(Constants::BAM_TAG_TYPE_INT8) :
246 case(Constants::BAM_TAG_TYPE_UINT8) :
247 // no endian swapping necessary for single-byte data
251 case(Constants::BAM_TAG_TYPE_INT16) :
252 case(Constants::BAM_TAG_TYPE_UINT16) :
253 BamTools::SwapEndian_16p(&tagData[i]);
254 i += sizeof(uint16_t);
257 case(Constants::BAM_TAG_TYPE_FLOAT) :
258 case(Constants::BAM_TAG_TYPE_INT32) :
259 case(Constants::BAM_TAG_TYPE_UINT32) :
260 BamTools::SwapEndian_32p(&tagData[i]);
261 i += sizeof(uint32_t);
264 case(Constants::BAM_TAG_TYPE_HEX) :
265 case(Constants::BAM_TAG_TYPE_STRING) :
266 // no endian swapping necessary for hex-string/string data
269 // increment one more for null terminator
273 case(Constants::BAM_TAG_TYPE_ARRAY) :
277 const char arrayType = tagData[i];
280 // swap endian-ness of number of elements in place, then retrieve for loop
281 BamTools::SwapEndian_32p(&tagData[i]);
282 uint32_t numElements;
283 memcpy(&numElements, &tagData[i], sizeof(uint32_t));
284 i += sizeof(uint32_t);
286 // swap endian-ness of array elements
287 for ( size_t j = 0; j < numElements; ++j ) {
289 case (Constants::BAM_TAG_TYPE_INT8) :
290 case (Constants::BAM_TAG_TYPE_UINT8) :
291 // no endian-swapping necessary
294 case (Constants::BAM_TAG_TYPE_INT16) :
295 case (Constants::BAM_TAG_TYPE_UINT16) :
296 BamTools::SwapEndian_16p(&tagData[i]);
297 i += sizeof(uint16_t);
299 case (Constants::BAM_TAG_TYPE_FLOAT) :
300 case (Constants::BAM_TAG_TYPE_INT32) :
301 case (Constants::BAM_TAG_TYPE_UINT32) :
302 BamTools::SwapEndian_32p(&tagData[i]);
303 i += sizeof(uint32_t);
306 const string message = string("invalid binary array type: ") + arrayType;
307 SetErrorString("BamAlignment::BuildCharData", message);
315 // invalid tag type-code
317 const string message = string("invalid tag type: ") + type;
318 SetErrorString("BamAlignment::BuildCharData", message);
324 // store tagData in alignment
325 TagData.resize(tagDataLength);
326 memcpy((char*)(TagData.data()), tagData, tagDataLength);
329 // clear core-only flag & return success
330 SupportData.HasCoreOnly = false;
334 /*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) const
337 Searches for requested tag in BAM tag data.
339 \param[in] tag requested 2-character tag name
340 \param[in,out] pTagData pointer to current position in BamAlignment::TagData
341 \param[in] tagDataLength length of BamAlignment::TagData
342 \param[in,out] numBytesParsed number of bytes parsed so far
344 \return \c true if found
346 \post If \a tag is found, \a pTagData will point to the byte where the tag data begins.
347 \a numBytesParsed will correspond to the position in the full TagData string.
350 bool BamAlignment::FindTag(const std::string& tag,
352 const unsigned int& tagDataLength,
353 unsigned int& numBytesParsed) const
356 while ( numBytesParsed < tagDataLength ) {
358 const char* pTagType = pTagData;
359 const char* pTagStorageType = pTagData + 2;
363 // check the current tag, return true on match
364 if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
367 // get the storage class and find the next tag
368 if ( *pTagStorageType == '\0' ) {
369 ErrorString = "unexpected null found - 1";
372 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) {
373 ErrorString = "could not skip to next tag";
376 if ( *pTagData == '\0' ) {
377 ErrorString = "unexpected null found - 2";
382 // checked all tags, none match
386 /*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = false) const
387 \brief Calculates alignment end position, based on its starting position and CIGAR data.
389 \warning The position returned now represents a zero-based, HALF-OPEN interval.
390 In previous versions of BamTools (0.x & 1.x) all intervals were treated
391 as zero-based, CLOSED.
393 \param[in] usePadded Allow inserted bases to affect the reported position. Default is
394 false, so that reported position stays synced with reference
396 \param[in] closedInterval Setting this to true will return a 0-based end coordinate. Default is
397 false, so that his value represents a standard, half-open interval.
399 \return alignment end position
401 int BamAlignment::GetEndPosition(bool usePadded, bool closedInterval) const {
403 // initialize alignment end to starting position
404 int alignEnd = Position;
406 // iterate over cigar operations
407 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
408 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
409 for ( ; cigarIter != cigarEnd; ++cigarIter) {
410 const CigarOp& op = (*cigarIter);
414 // increase end position on CIGAR chars [DMXN=]
415 case Constants::BAM_CIGAR_DEL_CHAR :
416 case Constants::BAM_CIGAR_MATCH_CHAR :
417 case Constants::BAM_CIGAR_MISMATCH_CHAR :
418 case Constants::BAM_CIGAR_REFSKIP_CHAR :
419 case Constants::BAM_CIGAR_SEQMATCH_CHAR :
420 alignEnd += op.Length;
423 // increase end position on insertion, only if @usePadded is true
424 case Constants::BAM_CIGAR_INS_CHAR :
426 alignEnd += op.Length;
429 // all other CIGAR chars do not affect end position
435 // adjust for closedInterval, if requested
436 if ( closedInterval )
443 /*! \fn std::string BamAlignment::GetErrorString(void) const
444 \brief Returns a human-readable description of the last error that occurred
446 This method allows elimination of STDERR pollution. Developers of client code
447 may choose how the messages are displayed to the user, if at all.
449 \return error description
451 std::string BamAlignment::GetErrorString(void) const {
455 /*! \fn bool BamAlignment::GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions, std::vector<int>& genomePositions, bool usePadded = false) const
456 \brief Identifies if an alignment has a soft clip. If so, identifies the
457 sizes of the soft clips, as well as their positions in the read and reference.
459 \param[out] clipSizes vector of the sizes of each soft clip in the alignment
460 \param[out] readPositions vector of the 0-based read locations of each soft clip in the alignment.
461 These positions are basically indexes within the read, not genomic positions.
462 \param[out] genomePositions vector of the 0-based genome locations of each soft clip in the alignment
463 \param[in] usePadded inserted bases affect reported position. Default is false, so that
464 reported position stays 'sync-ed' with reference coordinates.
466 \return \c true if any soft clips were found in the alignment
468 bool BamAlignment::GetSoftClips(vector<int>& clipSizes,
469 vector<int>& readPositions,
470 vector<int>& genomePositions,
471 bool usePadded) const
473 // initialize positions & flags
474 int refPosition = Position;
475 int readPosition = 0;
476 bool softClipFound = false;
477 bool firstCigarOp = true;
479 // iterate over cigar operations
480 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
481 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
482 for ( ; cigarIter != cigarEnd; ++cigarIter) {
483 const CigarOp& op = (*cigarIter);
487 // increase both read & genome positions on CIGAR chars [DMXN=]
488 case Constants::BAM_CIGAR_DEL_CHAR :
489 case Constants::BAM_CIGAR_MATCH_CHAR :
490 case Constants::BAM_CIGAR_MISMATCH_CHAR :
491 case Constants::BAM_CIGAR_REFSKIP_CHAR :
492 case Constants::BAM_CIGAR_SEQMATCH_CHAR :
493 refPosition += op.Length;
494 readPosition += op.Length;
497 // increase read position on insertion, genome position only if @usePadded is true
498 case Constants::BAM_CIGAR_INS_CHAR :
499 readPosition += op.Length;
501 refPosition += op.Length;
504 case Constants::BAM_CIGAR_SOFTCLIP_CHAR :
506 softClipFound = true;
508 //////////////////////////////////////////////////////////////////////////////
509 // if we are dealing with the *first* CIGAR operation
510 // for this alignment, we increment the read position so that
511 // the read and genome position of the clip are referring to the same base.
512 // For example, in the alignment below, the ref position would be 4, yet
513 // the read position would be 0. Thus, to "sync" the two,
514 // we need to increment the read position by the length of the
516 // Read: ATCGTTTCGTCCCTGC
517 // Ref: GGGATTTCGTCCCTGC
518 // Cigar: SSSSMMMMMMMMMMMM
520 // NOTE: This only needs to be done if the soft clip is the _first_ CIGAR op.
521 //////////////////////////////////////////////////////////////////////////////
523 readPosition += op.Length;
525 // track the soft clip's size, read position, and genome position
526 clipSizes.push_back(op.Length);
527 readPositions.push_back(readPosition);
528 genomePositions.push_back(refPosition);
530 // any other CIGAR operations have no effect
535 // clear our "first pass" flag
536 firstCigarOp = false;
539 // return whether any soft clips found
540 return softClipFound;
543 /*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const
544 \brief Retrieves the BAM tag type-code associated with requested tag name.
546 \param[in] tag 2-character tag name
547 \param[out] type retrieved (1-character) type-code
549 \return \c true if found
550 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
552 bool BamAlignment::GetTagType(const std::string& tag, char& type) const {
554 // skip if alignment is core-only
555 if ( SupportData.HasCoreOnly ) {
556 // TODO: set error string?
560 // skip if no tags present
561 if ( TagData.empty() ) {
562 // TODO: set error string?
566 // localize the tag data
567 char* pTagData = (char*)TagData.data();
568 const unsigned int tagDataLength = TagData.size();
569 unsigned int numBytesParsed = 0;
571 // if tag not found, return failure
572 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ){
573 // TODO: set error string?
577 // otherwise, retrieve & validate tag type code
578 type = *(pTagData - 1);
580 case (Constants::BAM_TAG_TYPE_ASCII) :
581 case (Constants::BAM_TAG_TYPE_INT8) :
582 case (Constants::BAM_TAG_TYPE_UINT8) :
583 case (Constants::BAM_TAG_TYPE_INT16) :
584 case (Constants::BAM_TAG_TYPE_UINT16) :
585 case (Constants::BAM_TAG_TYPE_INT32) :
586 case (Constants::BAM_TAG_TYPE_UINT32) :
587 case (Constants::BAM_TAG_TYPE_FLOAT) :
588 case (Constants::BAM_TAG_TYPE_STRING) :
589 case (Constants::BAM_TAG_TYPE_HEX) :
590 case (Constants::BAM_TAG_TYPE_ARRAY) :
595 const string message = string("invalid tag type: ") + type;
596 SetErrorString("BamAlignment::GetTagType", message);
601 /*! \fn bool BamAlignment::HasTag(const std::string& tag) const
602 \brief Returns true if alignment has a record for requested tag.
604 \param[in] tag 2-character tag name
605 \return \c true if alignment has a record for tag
607 bool BamAlignment::HasTag(const std::string& tag) const {
609 // return false if no tag data present
610 if ( SupportData.HasCoreOnly || TagData.empty() )
613 // localize the tag data for lookup
614 char* pTagData = (char*)TagData.data();
615 const unsigned int tagDataLength = TagData.size();
616 unsigned int numBytesParsed = 0;
618 // if result of tag lookup
619 return FindTag(tag, pTagData, tagDataLength, numBytesParsed);
622 /*! \fn bool BamAlignment::IsDuplicate(void) const
623 \return \c true if this read is a PCR duplicate
625 bool BamAlignment::IsDuplicate(void) const {
626 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_DUPLICATE) != 0 );
629 /*! \fn bool BamAlignment::IsFailedQC(void) const
630 \return \c true if this read failed quality control
632 bool BamAlignment::IsFailedQC(void) const {
633 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_QC_FAILED) != 0 );
636 /*! \fn bool BamAlignment::IsFirstMate(void) const
637 \return \c true if alignment is first mate on paired-end read
639 bool BamAlignment::IsFirstMate(void) const {
640 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_1) != 0 );
643 /*! \fn bool BamAlignment::IsMapped(void) const
644 \return \c true if alignment is mapped
646 bool BamAlignment::IsMapped(void) const {
647 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_UNMAPPED) == 0 );
650 /*! \fn bool BamAlignment::IsMateMapped(void) const
651 \return \c true if alignment's mate is mapped
653 bool BamAlignment::IsMateMapped(void) const {
654 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_UNMAPPED) == 0 );
657 /*! \fn bool BamAlignment::IsMateReverseStrand(void) const
658 \return \c true if alignment's mate mapped to reverse strand
660 bool BamAlignment::IsMateReverseStrand(void) const {
661 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND) != 0 );
664 /*! \fn bool BamAlignment::IsPaired(void) const
665 \return \c true if alignment part of paired-end read
667 bool BamAlignment::IsPaired(void) const {
668 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PAIRED) != 0 );
671 /*! \fn bool BamAlignment::IsPrimaryAlignment(void) const
672 \return \c true if reported position is primary alignment
674 bool BamAlignment::IsPrimaryAlignment(void) const {
675 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_SECONDARY) == 0 );
678 /*! \fn bool BamAlignment::IsProperPair(void) const
679 \return \c true if alignment is part of read that satisfied paired-end resolution
681 bool BamAlignment::IsProperPair(void) const {
682 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PROPER_PAIR) != 0 );
685 /*! \fn bool BamAlignment::IsReverseStrand(void) const
686 \return \c true if alignment mapped to reverse strand
688 bool BamAlignment::IsReverseStrand(void) const {
689 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_REVERSE_STRAND) != 0 );
692 /*! \fn bool BamAlignment::IsSecondMate(void) const
693 \return \c true if alignment is second mate on read
695 bool BamAlignment::IsSecondMate(void) const {
696 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 );
699 /*! \fn bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const
702 Checks that tag name & type strings are expected sizes.
704 \param tag[in] BAM tag name
705 \param type[in] BAM tag type-code
706 \return \c true if both input strings are valid sizes
708 bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const {
709 return (tag.size() == Constants::BAM_TAG_TAGSIZE) &&
710 (type.size() == Constants::BAM_TAG_TYPESIZE);
713 /*! \fn void BamAlignment::RemoveTag(const std::string& tag)
714 \brief Removes field from BAM tags.
716 \param[in] tag 2-character name of field to remove
718 void BamAlignment::RemoveTag(const std::string& tag) {
720 // if char data not populated, do that first
721 if ( SupportData.HasCoreOnly )
724 // skip if no tags available
725 if ( TagData.empty() )
728 // localize the tag data
729 char* pOriginalTagData = (char*)TagData.data();
730 char* pTagData = pOriginalTagData;
731 const unsigned int originalTagDataLength = TagData.size();
732 unsigned int newTagDataLength = 0;
733 unsigned int numBytesParsed = 0;
735 // skip if tag not found
736 if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
739 // otherwise, remove it
740 RaiiBuffer newTagData(originalTagDataLength);
742 // copy original tag data up til desired tag
745 const unsigned int beginningTagDataLength = numBytesParsed;
746 newTagDataLength += beginningTagDataLength;
747 memcpy(newTagData.Buffer, pOriginalTagData, numBytesParsed);
749 // attemp to skip to next tag
750 const char* pTagStorageType = pTagData + 2;
753 if ( SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) {
755 // squeeze remaining tag data
756 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
757 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
758 memcpy(newTagData.Buffer + beginningTagDataLength, pTagData, endTagDataLength );
760 // save modified tag data in alignment
761 TagData.assign(newTagData.Buffer, beginningTagDataLength + endTagDataLength);
765 /*! \fn void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const
768 Sets a formatted error string for this alignment.
770 \param[in] where class/method where error occurred
771 \param[in] what description of error
773 void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const {
774 static const string SEPARATOR = ": ";
775 ErrorString = where + SEPARATOR + what;
778 /*! \fn void BamAlignment::SetIsDuplicate(bool ok)
779 \brief Sets value of "PCR duplicate" flag to \a ok.
781 void BamAlignment::SetIsDuplicate(bool ok) {
782 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_DUPLICATE;
783 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_DUPLICATE;
786 /*! \fn void BamAlignment::SetIsFailedQC(bool ok)
787 \brief Sets "failed quality control" flag to \a ok.
789 void BamAlignment::SetIsFailedQC(bool ok) {
790 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_QC_FAILED;
791 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_QC_FAILED;
794 /*! \fn void BamAlignment::SetIsFirstMate(bool ok)
795 \brief Sets "alignment is first mate" flag to \a ok.
797 void BamAlignment::SetIsFirstMate(bool ok) {
798 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_1;
799 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_1;
802 /*! \fn void BamAlignment::SetIsMapped(bool ok)
803 \brief Sets "alignment is mapped" flag to \a ok.
805 void BamAlignment::SetIsMapped(bool ok) {
806 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_UNMAPPED;
807 else AlignmentFlag |= Constants::BAM_ALIGNMENT_UNMAPPED;
810 /*! \fn void BamAlignment::SetIsMateMapped(bool ok)
811 \brief Sets "alignment's mate is mapped" flag to \a ok.
813 void BamAlignment::SetIsMateMapped(bool ok) {
814 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
815 else AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
818 /*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok)
819 \brief Sets "alignment's mate mapped to reverse strand" flag to \a ok.
821 void BamAlignment::SetIsMateReverseStrand(bool ok) {
822 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND;
823 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND;
826 /*! \fn void BamAlignment::SetIsPaired(bool ok)
827 \brief Sets "alignment part of paired-end read" flag to \a ok.
829 void BamAlignment::SetIsPaired(bool ok) {
830 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PAIRED;
831 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PAIRED;
834 /*! \fn void BamAlignment::SetIsPrimaryAlignment(bool ok)
835 \brief Sets "position is primary alignment" flag to \a ok.
837 void BamAlignment::SetIsPrimaryAlignment(bool ok) {
838 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_SECONDARY;
839 else AlignmentFlag |= Constants::BAM_ALIGNMENT_SECONDARY;
842 /*! \fn void BamAlignment::SetIsProperPair(bool ok)
843 \brief Sets "alignment is part of read that satisfied paired-end resolution" flag to \a ok.
845 void BamAlignment::SetIsProperPair(bool ok) {
846 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PROPER_PAIR;
847 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PROPER_PAIR;
850 /*! \fn void BamAlignment::SetIsReverseStrand(bool ok)
851 \brief Sets "alignment mapped to reverse strand" flag to \a ok.
853 void BamAlignment::SetIsReverseStrand(bool ok) {
854 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_REVERSE_STRAND;
855 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_REVERSE_STRAND;
858 /*! \fn void BamAlignment::SetIsSecondMate(bool ok)
859 \brief Sets "alignment is second mate on read" flag to \a ok.
861 void BamAlignment::SetIsSecondMate(bool ok) {
862 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_2;
863 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_2;
866 /*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const
869 Moves to next available tag in tag data string
871 \param[in] storageType BAM tag type-code that determines how far to move cursor
872 \param[in,out] pTagData pointer to current position (cursor) in tag string
873 \param[in,out] numBytesParsed report of how many bytes were parsed (cumulatively)
875 \return \c if storageType was a recognized BAM tag type
877 \post \a pTagData will point to the byte where the next tag data begins.
878 \a numBytesParsed will correspond to the cursor's position in the full TagData string.
880 bool BamAlignment::SkipToNextTag(const char storageType,
882 unsigned int& numBytesParsed) const
884 switch (storageType) {
886 case (Constants::BAM_TAG_TYPE_ASCII) :
887 case (Constants::BAM_TAG_TYPE_INT8) :
888 case (Constants::BAM_TAG_TYPE_UINT8) :
893 case (Constants::BAM_TAG_TYPE_INT16) :
894 case (Constants::BAM_TAG_TYPE_UINT16) :
895 numBytesParsed += sizeof(uint16_t);
896 pTagData += sizeof(uint16_t);
899 case (Constants::BAM_TAG_TYPE_FLOAT) :
900 case (Constants::BAM_TAG_TYPE_INT32) :
901 case (Constants::BAM_TAG_TYPE_UINT32) :
902 numBytesParsed += sizeof(uint32_t);
903 pTagData += sizeof(uint32_t);
906 case (Constants::BAM_TAG_TYPE_STRING) :
907 case (Constants::BAM_TAG_TYPE_HEX) :
912 // increment for null-terminator
917 case (Constants::BAM_TAG_TYPE_ARRAY) :
921 const char arrayType = *pTagData;
925 // read number of elements
927 memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped, if needed
928 numBytesParsed += sizeof(uint32_t);
929 pTagData += sizeof(uint32_t);
931 // calculate number of bytes to skip
934 case (Constants::BAM_TAG_TYPE_INT8) :
935 case (Constants::BAM_TAG_TYPE_UINT8) :
936 bytesToSkip = numElements;
938 case (Constants::BAM_TAG_TYPE_INT16) :
939 case (Constants::BAM_TAG_TYPE_UINT16) :
940 bytesToSkip = numElements*sizeof(uint16_t);
942 case (Constants::BAM_TAG_TYPE_FLOAT) :
943 case (Constants::BAM_TAG_TYPE_INT32) :
944 case (Constants::BAM_TAG_TYPE_UINT32) :
945 bytesToSkip = numElements*sizeof(uint32_t);
948 const string message = string("invalid binary array type: ") + arrayType;
949 SetErrorString("BamAlignment::SkipToNextTag", message);
953 // skip binary array contents
954 numBytesParsed += bytesToSkip;
955 pTagData += bytesToSkip;
960 const string message = string("invalid tag type: ") + storageType;
961 SetErrorString("BamAlignment::SkipToNextTag", message);
965 // if we get here, tag skipped OK - return success