1 // ***************************************************************************
2 // BamAlignment.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 4 April 2012 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
10 #include "api/BamAlignment.h"
11 #include "api/BamConstants.h"
12 using namespace BamTools;
15 /*! \class BamTools::BamAlignment
16 \brief The main BAM alignment data structure.
18 Provides methods to query/modify BAM alignment data fields.
20 /*! \var BamAlignment::Name
23 /*! \var BamAlignment::Length
24 \brief length of query sequence
26 /*! \var BamAlignment::QueryBases
27 \brief 'original' sequence (as reported from sequencing machine)
29 \note Setting this field to "*" indicates that the sequence is not to be stored on output.
30 In this case, the contents of the Qualities field should be invalidated as well (cleared or marked as "*").
32 /*! \var BamAlignment::AlignedBases
33 \brief 'aligned' sequence (includes any indels, padding, clipping)
35 This field will be completely empty after reading from BamReader/BamMultiReader when
38 /*! \var BamAlignment::Qualities
39 \brief FASTQ qualities (ASCII characters, not numeric values)
41 \note Setting this field to "*" indicates to BamWriter that the quality scores are not to be stored,
42 but instead will be output as a sequence of '0xFF'. Otherwise, QueryBases must not be a "*" and
43 the length of this field should equal the length of QueryBases.
45 /*! \var BamAlignment::TagData
46 \brief tag data (use the provided methods to query/modify)
48 /*! \var BamAlignment::RefID
49 \brief ID number for reference sequence
51 /*! \var BamAlignment::Position
52 \brief position (0-based) where alignment starts
54 /*! \var BamAlignment::Bin
55 \brief BAM (standard) index bin number for this alignment
57 /*! \var BamAlignment::MapQuality
58 \brief mapping quality score
60 /*! \var BamAlignment::AlignmentFlag
61 \brief alignment bit-flag (use the provided methods to query/modify)
63 /*! \var BamAlignment::CigarData
64 \brief CIGAR operations for this alignment
66 /*! \var BamAlignment::MateRefID
67 \brief ID number for reference sequence where alignment's mate was aligned
69 /*! \var BamAlignment::MatePosition
70 \brief position (0-based) where alignment's mate starts
72 /*! \var BamAlignment::InsertSize
73 \brief mate-pair insert size
75 /*! \var BamAlignment::Filename
76 \brief name of BAM file which this alignment comes from
79 /*! \fn BamAlignment::BamAlignment(void)
82 BamAlignment::BamAlignment(void)
94 /*! \fn BamAlignment::BamAlignment(const BamAlignment& other)
95 \brief copy constructor
97 BamAlignment::BamAlignment(const BamAlignment& other)
99 , Length(other.Length)
100 , QueryBases(other.QueryBases)
101 , AlignedBases(other.AlignedBases)
102 , Qualities(other.Qualities)
103 , TagData(other.TagData)
105 , Position(other.Position)
107 , MapQuality(other.MapQuality)
108 , AlignmentFlag(other.AlignmentFlag)
109 , CigarData(other.CigarData)
110 , MateRefID(other.MateRefID)
111 , MatePosition(other.MatePosition)
112 , InsertSize(other.InsertSize)
113 , Filename(other.Filename)
114 , SupportData(other.SupportData)
117 /*! \fn BamAlignment::~BamAlignment(void)
120 BamAlignment::~BamAlignment(void) { }
122 /*! \fn bool BamAlignment::BuildCharData(void)
123 \brief Populates alignment string fields (read name, bases, qualities, tag data).
125 An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data.
126 Using that method makes parsing much quicker when only positional data is required.
128 However, if you later want to access the character data fields from such an alignment,
129 use this method to populate those fields. Provides ability to do 'lazy evaluation' of
132 \return \c true if character data populated successfully (or was already available to begin with)
134 bool BamAlignment::BuildCharData(void) {
136 // skip if char data already parsed
137 if ( !SupportData.HasCoreOnly )
140 // check system endianness
141 bool IsBigEndian = BamTools::SystemIsBigEndian();
143 // calculate character lengths/offsets
144 const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE;
145 const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations*4);
146 const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2;
147 const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength;
148 const unsigned int tagDataLength = dataLength - tagDataOffset;
150 // check offsets to see what char data exists
151 const bool hasSeqData = ( seqDataOffset < qualDataOffset );
152 const bool hasQualData = ( qualDataOffset < tagDataOffset );
153 const bool hasTagData = ( tagDataOffset < dataLength );
155 // store alignment name (relies on null char in name as terminator)
156 Name.assign(SupportData.AllCharData.data());
158 // save query sequence
161 const char* seqData = SupportData.AllCharData.data() + seqDataOffset;
162 QueryBases.reserve(SupportData.QuerySequenceLength);
163 for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
164 const char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
165 QueryBases.append(1, singleBase);
173 const char* qualData = SupportData.AllCharData.data() + qualDataOffset;
175 // if marked as unstored (sequence of 0xFF) - don't do conversion, just fill with 0xFFs
176 if ( qualData[0] == (char)0xFF )
177 Qualities.resize(SupportData.QuerySequenceLength, (char)0xFF);
179 // otherwise convert from numeric QV to 'FASTQ-style' ASCII character
181 Qualities.reserve(SupportData.QuerySequenceLength);
182 for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i )
183 Qualities.append(1, qualData[i]+33);
187 // clear previous AlignedBases
188 AlignedBases.clear();
190 // if QueryBases has data, build AlignedBases using CIGAR data
191 // otherwise, AlignedBases will remain empty (this case IS allowed)
192 if ( !QueryBases.empty() && QueryBases != "*" ) {
194 // resize AlignedBases
195 AlignedBases.reserve(SupportData.QuerySequenceLength);
197 // iterate over CigarOps
199 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
200 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
201 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
202 const CigarOp& op = (*cigarIter);
206 // for 'M', 'I', '=', 'X' - write bases
207 case (Constants::BAM_CIGAR_MATCH_CHAR) :
208 case (Constants::BAM_CIGAR_INS_CHAR) :
209 case (Constants::BAM_CIGAR_SEQMATCH_CHAR) :
210 case (Constants::BAM_CIGAR_MISMATCH_CHAR) :
211 AlignedBases.append(QueryBases.substr(k, op.Length));
214 // for 'S' - soft clip, do not write bases
215 // but increment placeholder 'k'
216 case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) :
220 // for 'D' - write gap character
221 case (Constants::BAM_CIGAR_DEL_CHAR) :
222 AlignedBases.append(op.Length, Constants::BAM_DNA_DEL);
225 // for 'P' - write padding character
226 case (Constants::BAM_CIGAR_PAD_CHAR) :
227 AlignedBases.append( op.Length, Constants::BAM_DNA_PAD );
230 // for 'N' - write N's, skip bases in original query sequence
231 case (Constants::BAM_CIGAR_REFSKIP_CHAR) :
232 AlignedBases.append( op.Length, Constants::BAM_DNA_N );
235 // for 'H' - hard clip, do nothing to AlignedBases, move to next op
236 case (Constants::BAM_CIGAR_HARDCLIP_CHAR) :
239 // invalid CIGAR op-code
241 const string message = string("invalid CIGAR operation type: ") + op.Type;
242 SetErrorString("BamAlignment::BuildCharData", message);
252 char* tagData = (((char*)SupportData.AllCharData.data()) + tagDataOffset);
256 while ( i < tagDataLength ) {
258 i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
259 const char type = tagData[i]; // get tag type at position i
260 ++i; // move i past tag type
264 case(Constants::BAM_TAG_TYPE_ASCII) :
265 case(Constants::BAM_TAG_TYPE_INT8) :
266 case(Constants::BAM_TAG_TYPE_UINT8) :
267 // no endian swapping necessary for single-byte data
271 case(Constants::BAM_TAG_TYPE_INT16) :
272 case(Constants::BAM_TAG_TYPE_UINT16) :
273 BamTools::SwapEndian_16p(&tagData[i]);
274 i += sizeof(uint16_t);
277 case(Constants::BAM_TAG_TYPE_FLOAT) :
278 case(Constants::BAM_TAG_TYPE_INT32) :
279 case(Constants::BAM_TAG_TYPE_UINT32) :
280 BamTools::SwapEndian_32p(&tagData[i]);
281 i += sizeof(uint32_t);
284 case(Constants::BAM_TAG_TYPE_HEX) :
285 case(Constants::BAM_TAG_TYPE_STRING) :
286 // no endian swapping necessary for hex-string/string data
289 // increment one more for null terminator
293 case(Constants::BAM_TAG_TYPE_ARRAY) :
297 const char arrayType = tagData[i];
300 // swap endian-ness of number of elements in place, then retrieve for loop
301 BamTools::SwapEndian_32p(&tagData[i]);
302 uint32_t numElements;
303 memcpy(&numElements, &tagData[i], sizeof(uint32_t));
304 i += sizeof(uint32_t);
306 // swap endian-ness of array elements
307 for ( size_t j = 0; j < numElements; ++j ) {
309 case (Constants::BAM_TAG_TYPE_INT8) :
310 case (Constants::BAM_TAG_TYPE_UINT8) :
311 // no endian-swapping necessary
314 case (Constants::BAM_TAG_TYPE_INT16) :
315 case (Constants::BAM_TAG_TYPE_UINT16) :
316 BamTools::SwapEndian_16p(&tagData[i]);
317 i += sizeof(uint16_t);
319 case (Constants::BAM_TAG_TYPE_FLOAT) :
320 case (Constants::BAM_TAG_TYPE_INT32) :
321 case (Constants::BAM_TAG_TYPE_UINT32) :
322 BamTools::SwapEndian_32p(&tagData[i]);
323 i += sizeof(uint32_t);
326 const string message = string("invalid binary array type: ") + arrayType;
327 SetErrorString("BamAlignment::BuildCharData", message);
335 // invalid tag type-code
337 const string message = string("invalid tag type: ") + type;
338 SetErrorString("BamAlignment::BuildCharData", message);
344 // store tagData in alignment
345 TagData.resize(tagDataLength);
346 memcpy((char*)(TagData.data()), tagData, tagDataLength);
349 // clear core-only flag & return success
350 SupportData.HasCoreOnly = false;
354 /*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) const
357 Searches for requested tag in BAM tag data.
359 \param[in] tag requested 2-character tag name
360 \param[in,out] pTagData pointer to current position in BamAlignment::TagData
361 \param[in] tagDataLength length of BamAlignment::TagData
362 \param[in,out] numBytesParsed number of bytes parsed so far
364 \return \c true if found
366 \post If \a tag is found, \a pTagData will point to the byte where the tag data begins.
367 \a numBytesParsed will correspond to the position in the full TagData string.
370 bool BamAlignment::FindTag(const std::string& tag,
372 const unsigned int& tagDataLength,
373 unsigned int& numBytesParsed) const
376 while ( numBytesParsed < tagDataLength ) {
378 const char* pTagType = pTagData;
379 const char* pTagStorageType = pTagData + 2;
383 // check the current tag, return true on match
384 if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
387 // get the storage class and find the next tag
388 if ( *pTagStorageType == '\0' ) return false;
389 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
390 if ( *pTagData == '\0' ) return false;
393 // checked all tags, none match
397 /*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = false) const
398 \brief Calculates alignment end position, based on its starting position and CIGAR data.
400 \warning The position returned now represents a zero-based, HALF-OPEN interval.
401 In previous versions of BamTools (0.x & 1.x) all intervals were treated
402 as zero-based, CLOSED.
404 \param[in] usePadded Allow inserted bases to affect the reported position. Default is
405 false, so that reported position stays synced with reference
407 \param[in] closedInterval Setting this to true will return a 0-based end coordinate. Default is
408 false, so that his value represents a standard, half-open interval.
410 \return alignment end position
412 int BamAlignment::GetEndPosition(bool usePadded, bool closedInterval) const {
414 // initialize alignment end to starting position
415 int alignEnd = Position;
417 // iterate over cigar operations
418 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
419 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
420 for ( ; cigarIter != cigarEnd; ++cigarIter) {
421 const CigarOp& op = (*cigarIter);
425 // increase end position on CIGAR chars [DMXN=]
426 case Constants::BAM_CIGAR_DEL_CHAR :
427 case Constants::BAM_CIGAR_MATCH_CHAR :
428 case Constants::BAM_CIGAR_MISMATCH_CHAR :
429 case Constants::BAM_CIGAR_REFSKIP_CHAR :
430 case Constants::BAM_CIGAR_SEQMATCH_CHAR :
431 alignEnd += op.Length;
434 // increase end position on insertion, only if @usePadded is true
435 case Constants::BAM_CIGAR_INS_CHAR :
437 alignEnd += op.Length;
440 // all other CIGAR chars do not affect end position
446 // adjust for closedInterval, if requested
447 if ( closedInterval )
454 /*! \fn std::string BamAlignment::GetErrorString(void) const
455 \brief Returns a human-readable description of the last error that occurred
457 This method allows elimination of STDERR pollution. Developers of client code
458 may choose how the messages are displayed to the user, if at all.
460 \return error description
462 std::string BamAlignment::GetErrorString(void) const {
466 /*! \fn bool BamAlignment::GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions, std::vector<int>& genomePositions, bool usePadded = false) const
467 \brief Identifies if an alignment has a soft clip. If so, identifies the
468 sizes of the soft clips, as well as their positions in the read and reference.
470 \param[out] clipSizes vector of the sizes of each soft clip in the alignment
471 \param[out] readPositions vector of the 0-based read locations of each soft clip in the alignment.
472 These positions are basically indexes within the read, not genomic positions.
473 \param[out] genomePositions vector of the 0-based genome locations of each soft clip in the alignment
474 \param[in] usePadded inserted bases affect reported position. Default is false, so that
475 reported position stays 'sync-ed' with reference coordinates.
477 \return \c true if any soft clips were found in the alignment
479 bool BamAlignment::GetSoftClips(vector<int>& clipSizes,
480 vector<int>& readPositions,
481 vector<int>& genomePositions,
482 bool usePadded) const
484 // initialize positions & flags
485 int refPosition = Position;
486 int readPosition = 0;
487 bool softClipFound = false;
488 bool firstCigarOp = true;
490 // iterate over cigar operations
491 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
492 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
493 for ( ; cigarIter != cigarEnd; ++cigarIter) {
494 const CigarOp& op = (*cigarIter);
498 // increase both read & genome positions on CIGAR chars [DMXN=]
499 case Constants::BAM_CIGAR_DEL_CHAR :
500 case Constants::BAM_CIGAR_MATCH_CHAR :
501 case Constants::BAM_CIGAR_MISMATCH_CHAR :
502 case Constants::BAM_CIGAR_REFSKIP_CHAR :
503 case Constants::BAM_CIGAR_SEQMATCH_CHAR :
504 refPosition += op.Length;
505 readPosition += op.Length;
508 // increase read position on insertion, genome position only if @usePadded is true
509 case Constants::BAM_CIGAR_INS_CHAR :
510 readPosition += op.Length;
512 refPosition += op.Length;
515 case Constants::BAM_CIGAR_SOFTCLIP_CHAR :
517 softClipFound = true;
519 //////////////////////////////////////////////////////////////////////////////
520 // if we are dealing with the *first* CIGAR operation
521 // for this alignment, we increment the read position so that
522 // the read and genome position of the clip are referring to the same base.
523 // For example, in the alignment below, the ref position would be 4, yet
524 // the read position would be 0. Thus, to "sync" the two,
525 // we need to increment the read position by the length of the
527 // Read: ATCGTTTCGTCCCTGC
528 // Ref: GGGATTTCGTCCCTGC
529 // Cigar: SSSSMMMMMMMMMMMM
531 // NOTE: This only needs to be done if the soft clip is the _first_ CIGAR op.
532 //////////////////////////////////////////////////////////////////////////////
534 readPosition += op.Length;
536 // track the soft clip's size, read position, and genome position
537 clipSizes.push_back(op.Length);
538 readPositions.push_back(readPosition);
539 genomePositions.push_back(refPosition);
541 // any other CIGAR operations have no effect
546 // clear our "first pass" flag
547 firstCigarOp = false;
550 // return whether any soft clips found
551 return softClipFound;
554 /*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const
555 \brief Retrieves the BAM tag type-code associated with requested tag name.
557 \param[in] tag 2-character tag name
558 \param[out] type retrieved (1-character) type-code
560 \return \c true if found
561 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
563 bool BamAlignment::GetTagType(const std::string& tag, char& type) const {
565 // skip if alignment is core-only
566 if ( SupportData.HasCoreOnly ) {
567 // TODO: set error string?
571 // skip if no tags present
572 if ( TagData.empty() ) {
573 // TODO: set error string?
577 // localize the tag data
578 char* pTagData = (char*)TagData.data();
579 const unsigned int tagDataLength = TagData.size();
580 unsigned int numBytesParsed = 0;
582 // if tag not found, return failure
583 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ){
584 // TODO: set error string?
588 // otherwise, retrieve & validate tag type code
589 type = *(pTagData - 1);
591 case (Constants::BAM_TAG_TYPE_ASCII) :
592 case (Constants::BAM_TAG_TYPE_INT8) :
593 case (Constants::BAM_TAG_TYPE_UINT8) :
594 case (Constants::BAM_TAG_TYPE_INT16) :
595 case (Constants::BAM_TAG_TYPE_UINT16) :
596 case (Constants::BAM_TAG_TYPE_INT32) :
597 case (Constants::BAM_TAG_TYPE_UINT32) :
598 case (Constants::BAM_TAG_TYPE_FLOAT) :
599 case (Constants::BAM_TAG_TYPE_STRING) :
600 case (Constants::BAM_TAG_TYPE_HEX) :
601 case (Constants::BAM_TAG_TYPE_ARRAY) :
606 const string message = string("invalid tag type: ") + type;
607 SetErrorString("BamAlignment::GetTagType", message);
612 /*! \fn bool BamAlignment::HasTag(const std::string& tag) const
613 \brief Returns true if alignment has a record for requested tag.
615 \param[in] tag 2-character tag name
616 \return \c true if alignment has a record for tag
618 bool BamAlignment::HasTag(const std::string& tag) const {
620 // return false if no tag data present
621 if ( SupportData.HasCoreOnly || TagData.empty() )
624 // localize the tag data for lookup
625 char* pTagData = (char*)TagData.data();
626 const unsigned int tagDataLength = TagData.size();
627 unsigned int numBytesParsed = 0;
629 // if result of tag lookup
630 return FindTag(tag, pTagData, tagDataLength, numBytesParsed);
633 /*! \fn bool BamAlignment::IsDuplicate(void) const
634 \return \c true if this read is a PCR duplicate
636 bool BamAlignment::IsDuplicate(void) const {
637 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_DUPLICATE) != 0 );
640 /*! \fn bool BamAlignment::IsFailedQC(void) const
641 \return \c true if this read failed quality control
643 bool BamAlignment::IsFailedQC(void) const {
644 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_QC_FAILED) != 0 );
647 /*! \fn bool BamAlignment::IsFirstMate(void) const
648 \return \c true if alignment is first mate on paired-end read
650 bool BamAlignment::IsFirstMate(void) const {
651 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_1) != 0 );
654 /*! \fn bool BamAlignment::IsMapped(void) const
655 \return \c true if alignment is mapped
657 bool BamAlignment::IsMapped(void) const {
658 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_UNMAPPED) == 0 );
661 /*! \fn bool BamAlignment::IsMateMapped(void) const
662 \return \c true if alignment's mate is mapped
664 bool BamAlignment::IsMateMapped(void) const {
665 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_UNMAPPED) == 0 );
668 /*! \fn bool BamAlignment::IsMateReverseStrand(void) const
669 \return \c true if alignment's mate mapped to reverse strand
671 bool BamAlignment::IsMateReverseStrand(void) const {
672 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND) != 0 );
675 /*! \fn bool BamAlignment::IsPaired(void) const
676 \return \c true if alignment part of paired-end read
678 bool BamAlignment::IsPaired(void) const {
679 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PAIRED) != 0 );
682 /*! \fn bool BamAlignment::IsPrimaryAlignment(void) const
683 \return \c true if reported position is primary alignment
685 bool BamAlignment::IsPrimaryAlignment(void) const {
686 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_SECONDARY) == 0 );
689 /*! \fn bool BamAlignment::IsProperPair(void) const
690 \return \c true if alignment is part of read that satisfied paired-end resolution
692 bool BamAlignment::IsProperPair(void) const {
693 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PROPER_PAIR) != 0 );
696 /*! \fn bool BamAlignment::IsReverseStrand(void) const
697 \return \c true if alignment mapped to reverse strand
699 bool BamAlignment::IsReverseStrand(void) const {
700 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_REVERSE_STRAND) != 0 );
703 /*! \fn bool BamAlignment::IsSecondMate(void) const
704 \return \c true if alignment is second mate on read
706 bool BamAlignment::IsSecondMate(void) const {
707 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 );
710 /*! \fn bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const
713 Checks that tag name & type strings are expected sizes.
715 \param tag[in] BAM tag name
716 \param type[in] BAM tag type-code
717 \return \c true if both input strings are valid sizes
719 bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const {
720 return (tag.size() == Constants::BAM_TAG_TAGSIZE) &&
721 (type.size() == Constants::BAM_TAG_TYPESIZE);
724 /*! \fn void BamAlignment::RemoveTag(const std::string& tag)
725 \brief Removes field from BAM tags.
727 \param[in] tag 2-character name of field to remove
729 void BamAlignment::RemoveTag(const std::string& tag) {
731 // if char data not populated, do that first
732 if ( SupportData.HasCoreOnly )
735 // skip if no tags available
736 if ( TagData.empty() )
739 // localize the tag data
740 char* pOriginalTagData = (char*)TagData.data();
741 char* pTagData = pOriginalTagData;
742 const unsigned int originalTagDataLength = TagData.size();
743 unsigned int newTagDataLength = 0;
744 unsigned int numBytesParsed = 0;
746 // skip if tag not found
747 if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
750 // otherwise, remove it
751 RaiiBuffer newTagData(originalTagDataLength);
753 // copy original tag data up til desired tag
756 const unsigned int beginningTagDataLength = numBytesParsed;
757 newTagDataLength += beginningTagDataLength;
758 memcpy(newTagData.Buffer, pOriginalTagData, numBytesParsed);
760 // attemp to skip to next tag
761 const char* pTagStorageType = pTagData + 2;
764 if ( SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) {
766 // squeeze remaining tag data
767 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
768 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
769 memcpy(newTagData.Buffer + beginningTagDataLength, pTagData, endTagDataLength );
771 // save modified tag data in alignment
772 TagData.assign(newTagData.Buffer, beginningTagDataLength + endTagDataLength);
776 /*! \fn void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const
779 Sets a formatted error string for this alignment.
781 \param[in] where class/method where error occurred
782 \param[in] what description of error
784 void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const {
785 static const string SEPARATOR = ": ";
786 ErrorString = where + SEPARATOR + what;
789 /*! \fn void BamAlignment::SetIsDuplicate(bool ok)
790 \brief Sets value of "PCR duplicate" flag to \a ok.
792 void BamAlignment::SetIsDuplicate(bool ok) {
793 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_DUPLICATE;
794 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_DUPLICATE;
797 /*! \fn void BamAlignment::SetIsFailedQC(bool ok)
798 \brief Sets "failed quality control" flag to \a ok.
800 void BamAlignment::SetIsFailedQC(bool ok) {
801 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_QC_FAILED;
802 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_QC_FAILED;
805 /*! \fn void BamAlignment::SetIsFirstMate(bool ok)
806 \brief Sets "alignment is first mate" flag to \a ok.
808 void BamAlignment::SetIsFirstMate(bool ok) {
809 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_1;
810 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_1;
813 /*! \fn void BamAlignment::SetIsMapped(bool ok)
814 \brief Sets "alignment is mapped" flag to \a ok.
816 void BamAlignment::SetIsMapped(bool ok) {
817 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_UNMAPPED;
818 else AlignmentFlag |= Constants::BAM_ALIGNMENT_UNMAPPED;
821 /*! \fn void BamAlignment::SetIsMateMapped(bool ok)
822 \brief Sets "alignment's mate is mapped" flag to \a ok.
824 void BamAlignment::SetIsMateMapped(bool ok) {
825 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
826 else AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
829 /*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok)
830 \brief Sets "alignment's mate mapped to reverse strand" flag to \a ok.
832 void BamAlignment::SetIsMateReverseStrand(bool ok) {
833 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND;
834 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND;
837 /*! \fn void BamAlignment::SetIsPaired(bool ok)
838 \brief Sets "alignment part of paired-end read" flag to \a ok.
840 void BamAlignment::SetIsPaired(bool ok) {
841 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PAIRED;
842 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PAIRED;
845 /*! \fn void BamAlignment::SetIsPrimaryAlignment(bool ok)
846 \brief Sets "position is primary alignment" flag to \a ok.
848 void BamAlignment::SetIsPrimaryAlignment(bool ok) {
849 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_SECONDARY;
850 else AlignmentFlag |= Constants::BAM_ALIGNMENT_SECONDARY;
853 /*! \fn void BamAlignment::SetIsProperPair(bool ok)
854 \brief Sets "alignment is part of read that satisfied paired-end resolution" flag to \a ok.
856 void BamAlignment::SetIsProperPair(bool ok) {
857 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PROPER_PAIR;
858 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PROPER_PAIR;
861 /*! \fn void BamAlignment::SetIsReverseStrand(bool ok)
862 \brief Sets "alignment mapped to reverse strand" flag to \a ok.
864 void BamAlignment::SetIsReverseStrand(bool ok) {
865 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_REVERSE_STRAND;
866 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_REVERSE_STRAND;
869 /*! \fn void BamAlignment::SetIsSecondMate(bool ok)
870 \brief Sets "alignment is second mate on read" flag to \a ok.
872 void BamAlignment::SetIsSecondMate(bool ok) {
873 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_2;
874 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_2;
877 /*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const
880 Moves to next available tag in tag data string
882 \param[in] storageType BAM tag type-code that determines how far to move cursor
883 \param[in,out] pTagData pointer to current position (cursor) in tag string
884 \param[in,out] numBytesParsed report of how many bytes were parsed (cumulatively)
886 \return \c if storageType was a recognized BAM tag type
888 \post \a pTagData will point to the byte where the next tag data begins.
889 \a numBytesParsed will correspond to the cursor's position in the full TagData string.
891 bool BamAlignment::SkipToNextTag(const char storageType,
893 unsigned int& numBytesParsed) const
895 switch (storageType) {
897 case (Constants::BAM_TAG_TYPE_ASCII) :
898 case (Constants::BAM_TAG_TYPE_INT8) :
899 case (Constants::BAM_TAG_TYPE_UINT8) :
904 case (Constants::BAM_TAG_TYPE_INT16) :
905 case (Constants::BAM_TAG_TYPE_UINT16) :
906 numBytesParsed += sizeof(uint16_t);
907 pTagData += sizeof(uint16_t);
910 case (Constants::BAM_TAG_TYPE_FLOAT) :
911 case (Constants::BAM_TAG_TYPE_INT32) :
912 case (Constants::BAM_TAG_TYPE_UINT32) :
913 numBytesParsed += sizeof(uint32_t);
914 pTagData += sizeof(uint32_t);
917 case (Constants::BAM_TAG_TYPE_STRING) :
918 case (Constants::BAM_TAG_TYPE_HEX) :
923 // increment for null-terminator
928 case (Constants::BAM_TAG_TYPE_ARRAY) :
932 const char arrayType = *pTagData;
936 // read number of elements
938 memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped, if needed
939 numBytesParsed += sizeof(uint32_t);
940 pTagData += sizeof(uint32_t);
942 // calculate number of bytes to skip
945 case (Constants::BAM_TAG_TYPE_INT8) :
946 case (Constants::BAM_TAG_TYPE_UINT8) :
947 bytesToSkip = numElements;
949 case (Constants::BAM_TAG_TYPE_INT16) :
950 case (Constants::BAM_TAG_TYPE_UINT16) :
951 bytesToSkip = numElements*sizeof(uint16_t);
953 case (Constants::BAM_TAG_TYPE_FLOAT) :
954 case (Constants::BAM_TAG_TYPE_INT32) :
955 case (Constants::BAM_TAG_TYPE_UINT32) :
956 bytesToSkip = numElements*sizeof(uint32_t);
959 const string message = string("invalid binary array type: ") + arrayType;
960 SetErrorString("BamAlignment::SkipToNextTag", message);
964 // skip binary array contents
965 numBytesParsed += bytesToSkip;
966 pTagData += bytesToSkip;
971 const string message = string("invalid tag type: ") + storageType;
972 SetErrorString("BamAlignment::SkipToNextTag", message);
976 // if we get here, tag skipped OK - return success