1 // ***************************************************************************
2 // BamAlignment.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 18 November 2012 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
10 #include "api/BamAlignment.h"
11 #include "api/BamConstants.h"
12 using namespace BamTools;
15 /*! \class BamTools::BamAlignment
16 \brief The main BAM alignment data structure.
18 Provides methods to query/modify BAM alignment data fields.
20 /*! \var BamAlignment::Name
23 /*! \var BamAlignment::Length
24 \brief length of query sequence
26 /*! \var BamAlignment::QueryBases
27 \brief 'original' sequence (as reported from sequencing machine)
29 \note Setting this field to "*" indicates that the sequence is not to be stored on output.
30 In this case, the contents of the Qualities field should be invalidated as well (cleared or marked as "*").
32 /*! \var BamAlignment::AlignedBases
33 \brief 'aligned' sequence (includes any indels, padding, clipping)
35 This field will be completely empty after reading from BamReader/BamMultiReader when
38 /*! \var BamAlignment::Qualities
39 \brief FASTQ qualities (ASCII characters, not numeric values)
41 \note Setting this field to "*" indicates to BamWriter that the quality scores are not to be stored,
42 but instead will be output as a sequence of '0xFF'. Otherwise, QueryBases must not be a "*" and
43 the length of this field should equal the length of QueryBases.
45 /*! \var BamAlignment::TagData
46 \brief tag data (use the provided methods to query/modify)
48 /*! \var BamAlignment::RefID
49 \brief ID number for reference sequence
51 /*! \var BamAlignment::Position
52 \brief position (0-based) where alignment starts
54 /*! \var BamAlignment::Bin
55 \brief BAM (standard) index bin number for this alignment
57 /*! \var BamAlignment::MapQuality
58 \brief mapping quality score
60 /*! \var BamAlignment::AlignmentFlag
61 \brief alignment bit-flag (use the provided methods to query/modify)
63 /*! \var BamAlignment::CigarData
64 \brief CIGAR operations for this alignment
66 /*! \var BamAlignment::MateRefID
67 \brief ID number for reference sequence where alignment's mate was aligned
69 /*! \var BamAlignment::MatePosition
70 \brief position (0-based) where alignment's mate starts
72 /*! \var BamAlignment::InsertSize
73 \brief mate-pair insert size
75 /*! \var BamAlignment::Filename
76 \brief name of BAM file which this alignment comes from
79 /*! \fn BamAlignment::BamAlignment(void)
82 BamAlignment::BamAlignment(void)
94 /*! \fn BamAlignment::BamAlignment(const BamAlignment& other)
95 \brief copy constructor
97 BamAlignment::BamAlignment(const BamAlignment& other)
99 , Length(other.Length)
100 , QueryBases(other.QueryBases)
101 , AlignedBases(other.AlignedBases)
102 , Qualities(other.Qualities)
103 , TagData(other.TagData)
105 , Position(other.Position)
107 , MapQuality(other.MapQuality)
108 , AlignmentFlag(other.AlignmentFlag)
109 , CigarData(other.CigarData)
110 , MateRefID(other.MateRefID)
111 , MatePosition(other.MatePosition)
112 , InsertSize(other.InsertSize)
113 , Filename(other.Filename)
114 , SupportData(other.SupportData)
117 /*! \fn BamAlignment::~BamAlignment(void)
120 BamAlignment::~BamAlignment(void) { }
122 /*! \fn bool BamAlignment::BuildCharData(void)
123 \brief Populates alignment string fields (read name, bases, qualities, tag data).
125 An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data.
126 Using that method makes parsing much quicker when only positional data is required.
128 However, if you later want to access the character data fields from such an alignment,
129 use this method to populate those fields. Provides ability to do 'lazy evaluation' of
132 \return \c true if character data populated successfully (or was already available to begin with)
134 bool BamAlignment::BuildCharData(void) {
136 // skip if char data already parsed
137 if ( !SupportData.HasCoreOnly )
140 // check system endianness
141 bool IsBigEndian = BamTools::SystemIsBigEndian();
143 // calculate character lengths/offsets
144 const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE;
145 const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations*4);
146 const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2;
147 const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength;
148 const unsigned int tagDataLength = dataLength - tagDataOffset;
150 // check offsets to see what char data exists
151 const bool hasSeqData = ( seqDataOffset < qualDataOffset );
152 const bool hasQualData = ( qualDataOffset < tagDataOffset );
153 const bool hasTagData = ( tagDataOffset < dataLength );
155 // store alignment name (relies on null char in name as terminator)
156 Name.assign(SupportData.AllCharData.data());
158 // save query sequence
161 const char* seqData = SupportData.AllCharData.data() + seqDataOffset;
162 QueryBases.reserve(SupportData.QuerySequenceLength);
163 for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
164 const char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
165 QueryBases.append(1, singleBase);
173 const char* qualData = SupportData.AllCharData.data() + qualDataOffset;
175 // if marked as unstored (sequence of 0xFF) - don't do conversion, just fill with 0xFFs
176 if ( qualData[0] == (char)0xFF )
177 Qualities.resize(SupportData.QuerySequenceLength, (char)0xFF);
179 // otherwise convert from numeric QV to 'FASTQ-style' ASCII character
181 Qualities.reserve(SupportData.QuerySequenceLength);
182 for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i )
183 Qualities.append(1, qualData[i]+33);
187 // clear previous AlignedBases
188 AlignedBases.clear();
190 // if QueryBases has data, build AlignedBases using CIGAR data
191 // otherwise, AlignedBases will remain empty (this case IS allowed)
192 if ( !QueryBases.empty() && QueryBases != "*" ) {
194 // resize AlignedBases
195 AlignedBases.reserve(SupportData.QuerySequenceLength);
197 // iterate over CigarOps
199 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
200 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
201 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
202 const CigarOp& op = (*cigarIter);
206 // for 'M', 'I', '=', 'X' - write bases
207 case (Constants::BAM_CIGAR_MATCH_CHAR) :
208 case (Constants::BAM_CIGAR_INS_CHAR) :
209 case (Constants::BAM_CIGAR_SEQMATCH_CHAR) :
210 case (Constants::BAM_CIGAR_MISMATCH_CHAR) :
211 AlignedBases.append(QueryBases.substr(k, op.Length));
214 // for 'S' - soft clip, do not write bases
215 // but increment placeholder 'k'
216 case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) :
220 // for 'D' - write gap character
221 case (Constants::BAM_CIGAR_DEL_CHAR) :
222 AlignedBases.append(op.Length, Constants::BAM_DNA_DEL);
225 // for 'P' - write padding character
226 case (Constants::BAM_CIGAR_PAD_CHAR) :
227 AlignedBases.append( op.Length, Constants::BAM_DNA_PAD );
230 // for 'N' - write N's, skip bases in original query sequence
231 case (Constants::BAM_CIGAR_REFSKIP_CHAR) :
232 AlignedBases.append( op.Length, Constants::BAM_DNA_N );
235 // for 'H' - hard clip, do nothing to AlignedBases, move to next op
236 case (Constants::BAM_CIGAR_HARDCLIP_CHAR) :
239 // invalid CIGAR op-code
241 const string message = string("invalid CIGAR operation type: ") + op.Type;
242 SetErrorString("BamAlignment::BuildCharData", message);
252 char* tagData = (((char*)SupportData.AllCharData.data()) + tagDataOffset);
256 while ( i < tagDataLength ) {
258 i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
259 const char type = tagData[i]; // get tag type at position i
260 ++i; // move i past tag type
264 case(Constants::BAM_TAG_TYPE_ASCII) :
265 case(Constants::BAM_TAG_TYPE_INT8) :
266 case(Constants::BAM_TAG_TYPE_UINT8) :
267 // no endian swapping necessary for single-byte data
271 case(Constants::BAM_TAG_TYPE_INT16) :
272 case(Constants::BAM_TAG_TYPE_UINT16) :
273 BamTools::SwapEndian_16p(&tagData[i]);
274 i += sizeof(uint16_t);
277 case(Constants::BAM_TAG_TYPE_FLOAT) :
278 case(Constants::BAM_TAG_TYPE_INT32) :
279 case(Constants::BAM_TAG_TYPE_UINT32) :
280 BamTools::SwapEndian_32p(&tagData[i]);
281 i += sizeof(uint32_t);
284 case(Constants::BAM_TAG_TYPE_HEX) :
285 case(Constants::BAM_TAG_TYPE_STRING) :
286 // no endian swapping necessary for hex-string/string data
289 // increment one more for null terminator
293 case(Constants::BAM_TAG_TYPE_ARRAY) :
297 const char arrayType = tagData[i];
300 // swap endian-ness of number of elements in place, then retrieve for loop
301 BamTools::SwapEndian_32p(&tagData[i]);
302 uint32_t numElements;
303 memcpy(&numElements, &tagData[i], sizeof(uint32_t));
304 i += sizeof(uint32_t);
306 // swap endian-ness of array elements
307 for ( size_t j = 0; j < numElements; ++j ) {
309 case (Constants::BAM_TAG_TYPE_INT8) :
310 case (Constants::BAM_TAG_TYPE_UINT8) :
311 // no endian-swapping necessary
314 case (Constants::BAM_TAG_TYPE_INT16) :
315 case (Constants::BAM_TAG_TYPE_UINT16) :
316 BamTools::SwapEndian_16p(&tagData[i]);
317 i += sizeof(uint16_t);
319 case (Constants::BAM_TAG_TYPE_FLOAT) :
320 case (Constants::BAM_TAG_TYPE_INT32) :
321 case (Constants::BAM_TAG_TYPE_UINT32) :
322 BamTools::SwapEndian_32p(&tagData[i]);
323 i += sizeof(uint32_t);
326 const string message = string("invalid binary array type: ") + arrayType;
327 SetErrorString("BamAlignment::BuildCharData", message);
335 // invalid tag type-code
337 const string message = string("invalid tag type: ") + type;
338 SetErrorString("BamAlignment::BuildCharData", message);
344 // store tagData in alignment
345 TagData.resize(tagDataLength);
346 memcpy((char*)(TagData.data()), tagData, tagDataLength);
349 // clear core-only flag & return success
350 SupportData.HasCoreOnly = false;
354 /*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) const
357 Searches for requested tag in BAM tag data.
359 \param[in] tag requested 2-character tag name
360 \param[in,out] pTagData pointer to current position in BamAlignment::TagData
361 \param[in] tagDataLength length of BamAlignment::TagData
362 \param[in,out] numBytesParsed number of bytes parsed so far
364 \return \c true if found
366 \post If \a tag is found, \a pTagData will point to the byte where the tag data begins.
367 \a numBytesParsed will correspond to the position in the full TagData string.
370 bool BamAlignment::FindTag(const std::string& tag,
372 const unsigned int& tagDataLength,
373 unsigned int& numBytesParsed) const
376 while ( numBytesParsed < tagDataLength ) {
378 const char* pTagType = pTagData;
379 const char* pTagStorageType = pTagData + 2;
383 // check the current tag, return true on match
384 if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
387 // get the storage class and find the next tag
388 if ( *pTagStorageType == '\0' ) return false;
389 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
390 if ( *pTagData == '\0' ) return false;
393 // checked all tags, none match
397 /*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = false) const
398 \brief Calculates alignment end position, based on its starting position and CIGAR data.
400 \warning The position returned now represents a zero-based, HALF-OPEN interval.
401 In previous versions of BamTools (0.x & 1.x) all intervals were treated
402 as zero-based, CLOSED.
404 \param[in] usePadded Allow inserted bases to affect the reported position. Default is
405 false, so that reported position stays synced with reference
407 \param[in] closedInterval Setting this to true will return a 0-based end coordinate. Default is
408 false, so that his value represents a standard, half-open interval.
410 \return alignment end position
412 int BamAlignment::GetEndPosition(bool usePadded, bool closedInterval) const {
414 // initialize alignment end to starting position
415 int alignEnd = Position;
417 // iterate over cigar operations
418 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
419 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
420 for ( ; cigarIter != cigarEnd; ++cigarIter) {
421 const CigarOp& op = (*cigarIter);
425 // increase end position on CIGAR chars [DMXN=]
426 case Constants::BAM_CIGAR_DEL_CHAR :
427 case Constants::BAM_CIGAR_MATCH_CHAR :
428 case Constants::BAM_CIGAR_MISMATCH_CHAR :
429 case Constants::BAM_CIGAR_REFSKIP_CHAR :
430 case Constants::BAM_CIGAR_SEQMATCH_CHAR :
431 alignEnd += op.Length;
434 // increase end position on insertion, only if @usePadded is true
435 case Constants::BAM_CIGAR_INS_CHAR :
437 alignEnd += op.Length;
440 // all other CIGAR chars do not affect end position
446 // adjust for closedInterval, if requested
447 if ( closedInterval )
454 /*! \fn std::string BamAlignment::GetErrorString(void) const
455 \brief Returns a human-readable description of the last error that occurred
457 This method allows elimination of STDERR pollution. Developers of client code
458 may choose how the messages are displayed to the user, if at all.
460 \return error description
462 std::string BamAlignment::GetErrorString(void) const {
466 /*! \fn bool BamAlignment::GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions, std::vector<int>& genomePositions, bool usePadded = false) const
467 \brief Identifies if an alignment has a soft clip. If so, identifies the
468 sizes of the soft clips, as well as their positions in the read and reference.
470 \param[out] clipSizes vector of the sizes of each soft clip in the alignment
471 \param[out] readPositions vector of the 0-based read locations of each soft clip in the alignment.
472 These positions are basically indexes within the read, not genomic positions.
473 \param[out] genomePositions vector of the 0-based genome locations of each soft clip in the alignment
474 \param[in] usePadded inserted bases affect reported position. Default is false, so that
475 reported position stays 'sync-ed' with reference coordinates.
477 \return \c true if any soft clips were found in the alignment
479 bool BamAlignment::GetSoftClips(vector<int>& clipSizes,
480 vector<int>& readPositions,
481 vector<int>& genomePositions,
482 bool usePadded) const
484 // initialize positions & flags
485 int refPosition = Position;
486 int readPosition = 0;
487 bool softClipFound = false;
488 bool firstCigarOp = true;
490 // iterate over cigar operations
491 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
492 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
493 for ( ; cigarIter != cigarEnd; ++cigarIter) {
494 const CigarOp& op = (*cigarIter);
498 // increase both read & genome positions on CIGAR chars [DMXN=]
499 case Constants::BAM_CIGAR_DEL_CHAR :
500 case Constants::BAM_CIGAR_MATCH_CHAR :
501 case Constants::BAM_CIGAR_MISMATCH_CHAR :
502 case Constants::BAM_CIGAR_REFSKIP_CHAR :
503 case Constants::BAM_CIGAR_SEQMATCH_CHAR :
504 refPosition += op.Length;
505 readPosition += op.Length;
508 // increase read position on insertion, genome position only if @usePadded is true
509 case Constants::BAM_CIGAR_INS_CHAR :
510 readPosition += op.Length;
512 refPosition += op.Length;
515 case Constants::BAM_CIGAR_SOFTCLIP_CHAR :
517 softClipFound = true;
519 //////////////////////////////////////////////////////////////////////////////
520 // if we are dealing with the *first* CIGAR operation
521 // for this alignment, we increment the read position so that
522 // the read and genome position of the clip are referring to the same base.
523 // For example, in the alignment below, the ref position would be 4, yet
524 // the read position would be 0. Thus, to "sync" the two,
525 // we need to increment the read position by the length of the
527 // Read: ATCGTTTCGTCCCTGC
528 // Ref: GGGATTTCGTCCCTGC
529 // Cigar: SSSSMMMMMMMMMMMM
531 // NOTE: This only needs to be done if the soft clip is the _first_ CIGAR op.
532 //////////////////////////////////////////////////////////////////////////////
534 readPosition += op.Length;
536 // track the soft clip's size, read position, and genome position
537 clipSizes.push_back(op.Length);
538 readPositions.push_back(readPosition);
539 genomePositions.push_back(refPosition);
541 // any other CIGAR operations have no effect
546 // clear our "first pass" flag
547 firstCigarOp = false;
550 // return whether any soft clips found
551 return softClipFound;
554 /*! \fn std::vector<std::string> BamAlignment::GetTagNames(void) const
555 \brief Retrieves the BAM tag names.
557 When paired with GetTagType() and GetTag(), this method allows you
558 to iterate over an alignment's tag data without knowing the names (or types)
561 \return \c vector containing all tag names found (empty if none available)
562 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
564 std::vector<std::string> BamAlignment::GetTagNames(void) const {
566 std::vector<std::string> result;
567 if ( SupportData.HasCoreOnly || TagData.empty() )
570 char* pTagData = (char*)TagData.data();
571 const unsigned int tagDataLength = TagData.size();
572 unsigned int numBytesParsed = 0;
573 while ( numBytesParsed < tagDataLength ) {
575 // get current tag name & type
576 const char* pTagName = pTagData;
577 const char* pTagType = pTagData + 2;
582 result.push_back( std::string(pTagName, 2) );
585 if ( *pTagType == '\0' ) break;
586 if ( !SkipToNextTag(*pTagType, pTagData, numBytesParsed) ) break;
587 if ( *pTagData == '\0' ) break;
593 /*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const
594 \brief Retrieves the BAM tag type-code associated with requested tag name.
596 \param[in] tag 2-character tag name
597 \param[out] type retrieved (1-character) type-code
599 \return \c true if found
600 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
602 bool BamAlignment::GetTagType(const std::string& tag, char& type) const {
604 // skip if alignment is core-only
605 if ( SupportData.HasCoreOnly ) {
606 // TODO: set error string?
610 // skip if no tags present
611 if ( TagData.empty() ) {
612 // TODO: set error string?
616 // localize the tag data
617 char* pTagData = (char*)TagData.data();
618 const unsigned int tagDataLength = TagData.size();
619 unsigned int numBytesParsed = 0;
621 // if tag not found, return failure
622 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ){
623 // TODO: set error string?
627 // otherwise, retrieve & validate tag type code
628 type = *(pTagData - 1);
630 case (Constants::BAM_TAG_TYPE_ASCII) :
631 case (Constants::BAM_TAG_TYPE_INT8) :
632 case (Constants::BAM_TAG_TYPE_UINT8) :
633 case (Constants::BAM_TAG_TYPE_INT16) :
634 case (Constants::BAM_TAG_TYPE_UINT16) :
635 case (Constants::BAM_TAG_TYPE_INT32) :
636 case (Constants::BAM_TAG_TYPE_UINT32) :
637 case (Constants::BAM_TAG_TYPE_FLOAT) :
638 case (Constants::BAM_TAG_TYPE_STRING) :
639 case (Constants::BAM_TAG_TYPE_HEX) :
640 case (Constants::BAM_TAG_TYPE_ARRAY) :
645 const string message = string("invalid tag type: ") + type;
646 SetErrorString("BamAlignment::GetTagType", message);
651 /*! \fn bool BamAlignment::HasTag(const std::string& tag) const
652 \brief Returns true if alignment has a record for requested tag.
654 \param[in] tag 2-character tag name
655 \return \c true if alignment has a record for tag
657 bool BamAlignment::HasTag(const std::string& tag) const {
659 // return false if no tag data present
660 if ( SupportData.HasCoreOnly || TagData.empty() )
663 // localize the tag data for lookup
664 char* pTagData = (char*)TagData.data();
665 const unsigned int tagDataLength = TagData.size();
666 unsigned int numBytesParsed = 0;
668 // if result of tag lookup
669 return FindTag(tag, pTagData, tagDataLength, numBytesParsed);
672 /*! \fn bool BamAlignment::IsDuplicate(void) const
673 \return \c true if this read is a PCR duplicate
675 bool BamAlignment::IsDuplicate(void) const {
676 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_DUPLICATE) != 0 );
679 /*! \fn bool BamAlignment::IsFailedQC(void) const
680 \return \c true if this read failed quality control
682 bool BamAlignment::IsFailedQC(void) const {
683 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_QC_FAILED) != 0 );
686 /*! \fn bool BamAlignment::IsFirstMate(void) const
687 \return \c true if alignment is first mate on paired-end read
689 bool BamAlignment::IsFirstMate(void) const {
690 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_1) != 0 );
693 /*! \fn bool BamAlignment::IsMapped(void) const
694 \return \c true if alignment is mapped
696 bool BamAlignment::IsMapped(void) const {
697 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_UNMAPPED) == 0 );
700 /*! \fn bool BamAlignment::IsMateMapped(void) const
701 \return \c true if alignment's mate is mapped
703 bool BamAlignment::IsMateMapped(void) const {
704 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_UNMAPPED) == 0 );
707 /*! \fn bool BamAlignment::IsMateReverseStrand(void) const
708 \return \c true if alignment's mate mapped to reverse strand
710 bool BamAlignment::IsMateReverseStrand(void) const {
711 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND) != 0 );
714 /*! \fn bool BamAlignment::IsPaired(void) const
715 \return \c true if alignment part of paired-end read
717 bool BamAlignment::IsPaired(void) const {
718 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PAIRED) != 0 );
721 /*! \fn bool BamAlignment::IsPrimaryAlignment(void) const
722 \return \c true if reported position is primary alignment
724 bool BamAlignment::IsPrimaryAlignment(void) const {
725 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_SECONDARY) == 0 );
728 /*! \fn bool BamAlignment::IsProperPair(void) const
729 \return \c true if alignment is part of read that satisfied paired-end resolution
731 bool BamAlignment::IsProperPair(void) const {
732 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PROPER_PAIR) != 0 );
735 /*! \fn bool BamAlignment::IsReverseStrand(void) const
736 \return \c true if alignment mapped to reverse strand
738 bool BamAlignment::IsReverseStrand(void) const {
739 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_REVERSE_STRAND) != 0 );
742 /*! \fn bool BamAlignment::IsSecondMate(void) const
743 \return \c true if alignment is second mate on read
745 bool BamAlignment::IsSecondMate(void) const {
746 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 );
749 /*! \fn bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const
752 Checks that tag name & type strings are expected sizes.
754 \param tag[in] BAM tag name
755 \param type[in] BAM tag type-code
756 \return \c true if both input strings are valid sizes
758 bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const {
759 return (tag.size() == Constants::BAM_TAG_TAGSIZE) &&
760 (type.size() == Constants::BAM_TAG_TYPESIZE);
763 /*! \fn void BamAlignment::RemoveTag(const std::string& tag)
764 \brief Removes field from BAM tags.
766 \param[in] tag 2-character name of field to remove
768 void BamAlignment::RemoveTag(const std::string& tag) {
770 // if char data not populated, do that first
771 if ( SupportData.HasCoreOnly )
774 // skip if no tags available
775 if ( TagData.empty() )
778 // localize the tag data
779 char* pOriginalTagData = (char*)TagData.data();
780 char* pTagData = pOriginalTagData;
781 const unsigned int originalTagDataLength = TagData.size();
782 unsigned int newTagDataLength = 0;
783 unsigned int numBytesParsed = 0;
785 // skip if tag not found
786 if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
789 // otherwise, remove it
790 RaiiBuffer newTagData(originalTagDataLength);
792 // copy original tag data up til desired tag
795 const unsigned int beginningTagDataLength = numBytesParsed;
796 newTagDataLength += beginningTagDataLength;
797 memcpy(newTagData.Buffer, pOriginalTagData, numBytesParsed);
799 // attemp to skip to next tag
800 const char* pTagStorageType = pTagData + 2;
803 if ( SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) {
805 // squeeze remaining tag data
806 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
807 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
808 memcpy(newTagData.Buffer + beginningTagDataLength, pTagData, endTagDataLength );
810 // save modified tag data in alignment
811 TagData.assign(newTagData.Buffer, beginningTagDataLength + endTagDataLength);
815 /*! \fn void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const
818 Sets a formatted error string for this alignment.
820 \param[in] where class/method where error occurred
821 \param[in] what description of error
823 void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const {
824 static const string SEPARATOR = ": ";
825 ErrorString = where + SEPARATOR + what;
828 /*! \fn void BamAlignment::SetIsDuplicate(bool ok)
829 \brief Sets value of "PCR duplicate" flag to \a ok.
831 void BamAlignment::SetIsDuplicate(bool ok) {
832 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_DUPLICATE;
833 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_DUPLICATE;
836 /*! \fn void BamAlignment::SetIsFailedQC(bool ok)
837 \brief Sets "failed quality control" flag to \a ok.
839 void BamAlignment::SetIsFailedQC(bool ok) {
840 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_QC_FAILED;
841 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_QC_FAILED;
844 /*! \fn void BamAlignment::SetIsFirstMate(bool ok)
845 \brief Sets "alignment is first mate" flag to \a ok.
847 void BamAlignment::SetIsFirstMate(bool ok) {
848 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_1;
849 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_1;
852 /*! \fn void BamAlignment::SetIsMapped(bool ok)
853 \brief Sets "alignment is mapped" flag to \a ok.
855 void BamAlignment::SetIsMapped(bool ok) {
856 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_UNMAPPED;
857 else AlignmentFlag |= Constants::BAM_ALIGNMENT_UNMAPPED;
860 /*! \fn void BamAlignment::SetIsMateMapped(bool ok)
861 \brief Sets "alignment's mate is mapped" flag to \a ok.
863 void BamAlignment::SetIsMateMapped(bool ok) {
864 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
865 else AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
868 /*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok)
869 \brief Sets "alignment's mate mapped to reverse strand" flag to \a ok.
871 void BamAlignment::SetIsMateReverseStrand(bool ok) {
872 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND;
873 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND;
876 /*! \fn void BamAlignment::SetIsPaired(bool ok)
877 \brief Sets "alignment part of paired-end read" flag to \a ok.
879 void BamAlignment::SetIsPaired(bool ok) {
880 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PAIRED;
881 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PAIRED;
884 /*! \fn void BamAlignment::SetIsPrimaryAlignment(bool ok)
885 \brief Sets "position is primary alignment" flag to \a ok.
887 void BamAlignment::SetIsPrimaryAlignment(bool ok) {
888 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_SECONDARY;
889 else AlignmentFlag |= Constants::BAM_ALIGNMENT_SECONDARY;
892 /*! \fn void BamAlignment::SetIsProperPair(bool ok)
893 \brief Sets "alignment is part of read that satisfied paired-end resolution" flag to \a ok.
895 void BamAlignment::SetIsProperPair(bool ok) {
896 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PROPER_PAIR;
897 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PROPER_PAIR;
900 /*! \fn void BamAlignment::SetIsReverseStrand(bool ok)
901 \brief Sets "alignment mapped to reverse strand" flag to \a ok.
903 void BamAlignment::SetIsReverseStrand(bool ok) {
904 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_REVERSE_STRAND;
905 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_REVERSE_STRAND;
908 /*! \fn void BamAlignment::SetIsSecondMate(bool ok)
909 \brief Sets "alignment is second mate on read" flag to \a ok.
911 void BamAlignment::SetIsSecondMate(bool ok) {
912 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_2;
913 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_2;
916 /*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const
919 Moves to next available tag in tag data string
921 \param[in] storageType BAM tag type-code that determines how far to move cursor
922 \param[in,out] pTagData pointer to current position (cursor) in tag string
923 \param[in,out] numBytesParsed report of how many bytes were parsed (cumulatively)
925 \return \c if storageType was a recognized BAM tag type
927 \post \a pTagData will point to the byte where the next tag data begins.
928 \a numBytesParsed will correspond to the cursor's position in the full TagData string.
930 bool BamAlignment::SkipToNextTag(const char storageType,
932 unsigned int& numBytesParsed) const
934 switch (storageType) {
936 case (Constants::BAM_TAG_TYPE_ASCII) :
937 case (Constants::BAM_TAG_TYPE_INT8) :
938 case (Constants::BAM_TAG_TYPE_UINT8) :
943 case (Constants::BAM_TAG_TYPE_INT16) :
944 case (Constants::BAM_TAG_TYPE_UINT16) :
945 numBytesParsed += sizeof(uint16_t);
946 pTagData += sizeof(uint16_t);
949 case (Constants::BAM_TAG_TYPE_FLOAT) :
950 case (Constants::BAM_TAG_TYPE_INT32) :
951 case (Constants::BAM_TAG_TYPE_UINT32) :
952 numBytesParsed += sizeof(uint32_t);
953 pTagData += sizeof(uint32_t);
956 case (Constants::BAM_TAG_TYPE_STRING) :
957 case (Constants::BAM_TAG_TYPE_HEX) :
962 // increment for null-terminator
967 case (Constants::BAM_TAG_TYPE_ARRAY) :
971 const char arrayType = *pTagData;
975 // read number of elements
977 memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped, if needed
978 numBytesParsed += sizeof(uint32_t);
979 pTagData += sizeof(uint32_t);
981 // calculate number of bytes to skip
984 case (Constants::BAM_TAG_TYPE_INT8) :
985 case (Constants::BAM_TAG_TYPE_UINT8) :
986 bytesToSkip = numElements;
988 case (Constants::BAM_TAG_TYPE_INT16) :
989 case (Constants::BAM_TAG_TYPE_UINT16) :
990 bytesToSkip = numElements*sizeof(uint16_t);
992 case (Constants::BAM_TAG_TYPE_FLOAT) :
993 case (Constants::BAM_TAG_TYPE_INT32) :
994 case (Constants::BAM_TAG_TYPE_UINT32) :
995 bytesToSkip = numElements*sizeof(uint32_t);
998 const string message = string("invalid binary array type: ") + arrayType;
999 SetErrorString("BamAlignment::SkipToNextTag", message);
1003 // skip binary array contents
1004 numBytesParsed += bytesToSkip;
1005 pTagData += bytesToSkip;
1010 const string message = string("invalid tag type: ") + storageType;
1011 SetErrorString("BamAlignment::SkipToNextTag", message);
1015 // if we get here, tag skipped OK - return success