1 // ***************************************************************************
2 // BamAlignment.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 4 December 2012 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
10 #include "api/BamAlignment.h"
11 #include "api/BamConstants.h"
12 using namespace BamTools;
15 /*! \class BamTools::BamAlignment
16 \brief The main BAM alignment data structure.
18 Provides methods to query/modify BAM alignment data fields.
20 /*! \var BamAlignment::Name
23 /*! \var BamAlignment::Length
24 \brief length of query sequence
26 /*! \var BamAlignment::QueryBases
27 \brief 'original' sequence (as reported from sequencing machine)
29 \note Setting this field to "*" indicates that the sequence is not to be stored on output.
30 In this case, the contents of the Qualities field should be invalidated as well (cleared or marked as "*").
32 /*! \var BamAlignment::AlignedBases
33 \brief 'aligned' sequence (includes any indels, padding, clipping)
35 This field will be completely empty after reading from BamReader/BamMultiReader when
38 /*! \var BamAlignment::Qualities
39 \brief FASTQ qualities (ASCII characters, not numeric values)
41 \note Setting this field to "*" indicates to BamWriter that the quality scores are not to be stored,
42 but instead will be output as a sequence of '0xFF'. Otherwise, QueryBases must not be a "*" and
43 the length of this field should equal the length of QueryBases.
45 /*! \var BamAlignment::TagData
46 \brief tag data (use the provided methods to query/modify)
48 /*! \var BamAlignment::RefID
49 \brief ID number for reference sequence
51 /*! \var BamAlignment::Position
52 \brief position (0-based) where alignment starts
54 /*! \var BamAlignment::Bin
55 \brief BAM (standard) index bin number for this alignment
57 /*! \var BamAlignment::MapQuality
58 \brief mapping quality score
60 /*! \var BamAlignment::AlignmentFlag
61 \brief alignment bit-flag (use the provided methods to query/modify)
63 /*! \var BamAlignment::CigarData
64 \brief CIGAR operations for this alignment
66 /*! \var BamAlignment::MateRefID
67 \brief ID number for reference sequence where alignment's mate was aligned
69 /*! \var BamAlignment::MatePosition
70 \brief position (0-based) where alignment's mate starts
72 /*! \var BamAlignment::InsertSize
73 \brief mate-pair insert size
75 /*! \var BamAlignment::Filename
76 \brief name of BAM file which this alignment comes from
79 /*! \fn BamAlignment::BamAlignment(void)
82 BamAlignment::BamAlignment(void)
94 /*! \fn BamAlignment::BamAlignment(const BamAlignment& other)
95 \brief copy constructor
97 BamAlignment::BamAlignment(const BamAlignment& other)
99 , Length(other.Length)
100 , QueryBases(other.QueryBases)
101 , AlignedBases(other.AlignedBases)
102 , Qualities(other.Qualities)
103 , TagData(other.TagData)
105 , Position(other.Position)
107 , MapQuality(other.MapQuality)
108 , AlignmentFlag(other.AlignmentFlag)
109 , CigarData(other.CigarData)
110 , MateRefID(other.MateRefID)
111 , MatePosition(other.MatePosition)
112 , InsertSize(other.InsertSize)
113 , Filename(other.Filename)
114 , SupportData(other.SupportData)
117 /*! \fn BamAlignment::~BamAlignment(void)
120 BamAlignment::~BamAlignment(void) { }
122 /*! \fn bool BamAlignment::BuildCharData(void)
123 \brief Populates alignment string fields (read name, bases, qualities, tag data).
125 An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data.
126 Using that method makes parsing much quicker when only positional data is required.
128 However, if you later want to access the character data fields from such an alignment,
129 use this method to populate those fields. Provides ability to do 'lazy evaluation' of
132 \return \c true if character data populated successfully (or was already available to begin with)
134 bool BamAlignment::BuildCharData(void) {
136 // skip if char data already parsed
137 if ( !SupportData.HasCoreOnly )
140 // check system endianness
141 bool IsBigEndian = BamTools::SystemIsBigEndian();
143 // calculate character lengths/offsets
144 const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE;
145 const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations*4);
146 const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2;
147 const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength;
148 const unsigned int tagDataLength = dataLength - tagDataOffset;
150 // check offsets to see what char data exists
151 const bool hasSeqData = ( seqDataOffset < qualDataOffset );
152 const bool hasQualData = ( qualDataOffset < tagDataOffset );
153 const bool hasTagData = ( tagDataOffset < dataLength );
155 // store alignment name (relies on null char in name as terminator)
156 Name.assign(SupportData.AllCharData.data());
158 // save query sequence
161 const char* seqData = SupportData.AllCharData.data() + seqDataOffset;
162 QueryBases.reserve(SupportData.QuerySequenceLength);
163 for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
164 const char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
165 QueryBases.append(1, singleBase);
173 const char* qualData = SupportData.AllCharData.data() + qualDataOffset;
175 // if marked as unstored (sequence of 0xFF) - don't do conversion, just fill with 0xFFs
176 if ( qualData[0] == (char)0xFF )
177 Qualities.resize(SupportData.QuerySequenceLength, (char)0xFF);
179 // otherwise convert from numeric QV to 'FASTQ-style' ASCII character
181 Qualities.reserve(SupportData.QuerySequenceLength);
182 for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i )
183 Qualities.append(1, qualData[i]+33);
187 // clear previous AlignedBases
188 AlignedBases.clear();
190 // if QueryBases has data, build AlignedBases using CIGAR data
191 // otherwise, AlignedBases will remain empty (this case IS allowed)
192 if ( !QueryBases.empty() && QueryBases != "*" ) {
194 // resize AlignedBases
195 AlignedBases.reserve(SupportData.QuerySequenceLength);
197 // iterate over CigarOps
199 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
200 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
201 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
202 const CigarOp& op = (*cigarIter);
206 // for 'M', 'I', '=', 'X' - write bases
207 case (Constants::BAM_CIGAR_MATCH_CHAR) :
208 case (Constants::BAM_CIGAR_INS_CHAR) :
209 case (Constants::BAM_CIGAR_SEQMATCH_CHAR) :
210 case (Constants::BAM_CIGAR_MISMATCH_CHAR) :
211 AlignedBases.append(QueryBases.substr(k, op.Length));
214 // for 'S' - soft clip, do not write bases
215 // but increment placeholder 'k'
216 case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) :
220 // for 'D' - write gap character
221 case (Constants::BAM_CIGAR_DEL_CHAR) :
222 AlignedBases.append(op.Length, Constants::BAM_DNA_DEL);
225 // for 'P' - write padding character
226 case (Constants::BAM_CIGAR_PAD_CHAR) :
227 AlignedBases.append( op.Length, Constants::BAM_DNA_PAD );
230 // for 'N' - write N's, skip bases in original query sequence
231 case (Constants::BAM_CIGAR_REFSKIP_CHAR) :
232 AlignedBases.append( op.Length, Constants::BAM_DNA_N );
235 // for 'H' - hard clip, do nothing to AlignedBases, move to next op
236 case (Constants::BAM_CIGAR_HARDCLIP_CHAR) :
239 // invalid CIGAR op-code
241 const string message = string("invalid CIGAR operation type: ") + op.Type;
242 SetErrorString("BamAlignment::BuildCharData", message);
252 char* tagData = (((char*)SupportData.AllCharData.data()) + tagDataOffset);
256 while ( i < tagDataLength ) {
258 i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
259 const char type = tagData[i]; // get tag type at position i
260 ++i; // move i past tag type
264 case(Constants::BAM_TAG_TYPE_ASCII) :
265 case(Constants::BAM_TAG_TYPE_INT8) :
266 case(Constants::BAM_TAG_TYPE_UINT8) :
267 // no endian swapping necessary for single-byte data
271 case(Constants::BAM_TAG_TYPE_INT16) :
272 case(Constants::BAM_TAG_TYPE_UINT16) :
273 BamTools::SwapEndian_16p(&tagData[i]);
274 i += sizeof(uint16_t);
277 case(Constants::BAM_TAG_TYPE_FLOAT) :
278 case(Constants::BAM_TAG_TYPE_INT32) :
279 case(Constants::BAM_TAG_TYPE_UINT32) :
280 BamTools::SwapEndian_32p(&tagData[i]);
281 i += sizeof(uint32_t);
284 case(Constants::BAM_TAG_TYPE_HEX) :
285 case(Constants::BAM_TAG_TYPE_STRING) :
286 // no endian swapping necessary for hex-string/string data
289 // increment one more for null terminator
293 case(Constants::BAM_TAG_TYPE_ARRAY) :
297 const char arrayType = tagData[i];
300 // swap endian-ness of number of elements in place, then retrieve for loop
301 BamTools::SwapEndian_32p(&tagData[i]);
302 uint32_t numElements;
303 memcpy(&numElements, &tagData[i], sizeof(uint32_t));
304 i += sizeof(uint32_t);
306 // swap endian-ness of array elements
307 for ( size_t j = 0; j < numElements; ++j ) {
309 case (Constants::BAM_TAG_TYPE_INT8) :
310 case (Constants::BAM_TAG_TYPE_UINT8) :
311 // no endian-swapping necessary
314 case (Constants::BAM_TAG_TYPE_INT16) :
315 case (Constants::BAM_TAG_TYPE_UINT16) :
316 BamTools::SwapEndian_16p(&tagData[i]);
317 i += sizeof(uint16_t);
319 case (Constants::BAM_TAG_TYPE_FLOAT) :
320 case (Constants::BAM_TAG_TYPE_INT32) :
321 case (Constants::BAM_TAG_TYPE_UINT32) :
322 BamTools::SwapEndian_32p(&tagData[i]);
323 i += sizeof(uint32_t);
326 const string message = string("invalid binary array type: ") + arrayType;
327 SetErrorString("BamAlignment::BuildCharData", message);
335 // invalid tag type-code
337 const string message = string("invalid tag type: ") + type;
338 SetErrorString("BamAlignment::BuildCharData", message);
344 // store tagData in alignment
345 TagData.resize(tagDataLength);
346 memcpy((char*)(TagData.data()), tagData, tagDataLength);
349 // clear core-only flag & return success
350 SupportData.HasCoreOnly = false;
354 /*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) const
357 Searches for requested tag in BAM tag data.
359 \param[in] tag requested 2-character tag name
360 \param[in,out] pTagData pointer to current position in BamAlignment::TagData
361 \param[in] tagDataLength length of BamAlignment::TagData
362 \param[in,out] numBytesParsed number of bytes parsed so far
364 \return \c true if found
366 \post If \a tag is found, \a pTagData will point to the byte where the tag data begins.
367 \a numBytesParsed will correspond to the position in the full TagData string.
370 bool BamAlignment::FindTag(const std::string& tag,
372 const unsigned int& tagDataLength,
373 unsigned int& numBytesParsed) const
376 while ( numBytesParsed < tagDataLength ) {
378 const char* pTagType = pTagData;
379 const char* pTagStorageType = pTagData + 2;
383 // check the current tag, return true on match
384 if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
387 // get the storage class and find the next tag
388 if ( *pTagStorageType == '\0' ) return false;
389 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
390 if ( *pTagData == '\0' ) return false;
393 // checked all tags, none match
397 /*! \fn bool BamAlignment::GetArrayTagType(const std::string& tag, char& type) const
398 \brief Retrieves the BAM tag type-code for the array elements associated with requested tag name.
400 \param[in] tag 2-character tag name
401 \param[out] type retrieved (1-character) type-code
403 \return \c true if found. False if not found, or if tag is not an array type.
404 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
406 bool BamAlignment::GetArrayTagType(const std::string& tag, char& type) const {
408 // skip if alignment is core-only
409 if ( SupportData.HasCoreOnly ) {
410 // TODO: set error string?
414 // skip if no tags present
415 if ( TagData.empty() ) {
416 // TODO: set error string?
420 // localize the tag data
421 char* pTagData = (char*)TagData.data();
422 const unsigned int tagDataLength = TagData.size();
423 unsigned int numBytesParsed = 0;
425 // if tag not found, return failure
426 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ){
427 // TODO: set error string?
431 // check that tag type code is array
432 type = *(pTagData - 1);
433 if ( type != Constants::BAM_TAG_TYPE_ARRAY ) {
434 // TODO: set error string
438 // fetch element type
439 const char elementType = *pTagData;
440 switch ( elementType ) {
443 case (Constants::BAM_TAG_TYPE_INT8) :
444 case (Constants::BAM_TAG_TYPE_UINT8) :
445 case (Constants::BAM_TAG_TYPE_INT16) :
446 case (Constants::BAM_TAG_TYPE_UINT16) :
447 case (Constants::BAM_TAG_TYPE_INT32) :
448 case (Constants::BAM_TAG_TYPE_UINT32) :
449 case (Constants::BAM_TAG_TYPE_FLOAT) :
454 //TODO: set error string
458 // if we get here, return success
463 /*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = false) const
464 \brief Calculates alignment end position, based on its starting position and CIGAR data.
466 \warning The position returned now represents a zero-based, HALF-OPEN interval.
467 In previous versions of BamTools (0.x & 1.x) all intervals were treated
468 as zero-based, CLOSED.
470 \param[in] usePadded Allow inserted bases to affect the reported position. Default is
471 false, so that reported position stays synced with reference
473 \param[in] closedInterval Setting this to true will return a 0-based end coordinate. Default is
474 false, so that his value represents a standard, half-open interval.
476 \return alignment end position
478 int BamAlignment::GetEndPosition(bool usePadded, bool closedInterval) const {
480 // initialize alignment end to starting position
481 int alignEnd = Position;
483 // iterate over cigar operations
484 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
485 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
486 for ( ; cigarIter != cigarEnd; ++cigarIter) {
487 const CigarOp& op = (*cigarIter);
491 // increase end position on CIGAR chars [DMXN=]
492 case Constants::BAM_CIGAR_DEL_CHAR :
493 case Constants::BAM_CIGAR_MATCH_CHAR :
494 case Constants::BAM_CIGAR_MISMATCH_CHAR :
495 case Constants::BAM_CIGAR_REFSKIP_CHAR :
496 case Constants::BAM_CIGAR_SEQMATCH_CHAR :
497 alignEnd += op.Length;
500 // increase end position on insertion, only if @usePadded is true
501 case Constants::BAM_CIGAR_INS_CHAR :
503 alignEnd += op.Length;
506 // all other CIGAR chars do not affect end position
512 // adjust for closedInterval, if requested
513 if ( closedInterval )
520 /*! \fn std::string BamAlignment::GetErrorString(void) const
521 \brief Returns a human-readable description of the last error that occurred
523 This method allows elimination of STDERR pollution. Developers of client code
524 may choose how the messages are displayed to the user, if at all.
526 \return error description
528 std::string BamAlignment::GetErrorString(void) const {
532 /*! \fn bool BamAlignment::GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions, std::vector<int>& genomePositions, bool usePadded = false) const
533 \brief Identifies if an alignment has a soft clip. If so, identifies the
534 sizes of the soft clips, as well as their positions in the read and reference.
536 \param[out] clipSizes vector of the sizes of each soft clip in the alignment
537 \param[out] readPositions vector of the 0-based read locations of each soft clip in the alignment.
538 These positions are basically indexes within the read, not genomic positions.
539 \param[out] genomePositions vector of the 0-based genome locations of each soft clip in the alignment
540 \param[in] usePadded inserted bases affect reported position. Default is false, so that
541 reported position stays 'sync-ed' with reference coordinates.
543 \return \c true if any soft clips were found in the alignment
545 bool BamAlignment::GetSoftClips(vector<int>& clipSizes,
546 vector<int>& readPositions,
547 vector<int>& genomePositions,
548 bool usePadded) const
550 // initialize positions & flags
551 int refPosition = Position;
552 int readPosition = 0;
553 bool softClipFound = false;
554 bool firstCigarOp = true;
556 // iterate over cigar operations
557 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
558 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
559 for ( ; cigarIter != cigarEnd; ++cigarIter) {
560 const CigarOp& op = (*cigarIter);
564 // increase both read & genome positions on CIGAR chars [DMXN=]
565 case Constants::BAM_CIGAR_DEL_CHAR :
566 case Constants::BAM_CIGAR_MATCH_CHAR :
567 case Constants::BAM_CIGAR_MISMATCH_CHAR :
568 case Constants::BAM_CIGAR_REFSKIP_CHAR :
569 case Constants::BAM_CIGAR_SEQMATCH_CHAR :
570 refPosition += op.Length;
571 readPosition += op.Length;
574 // increase read position on insertion, genome position only if @usePadded is true
575 case Constants::BAM_CIGAR_INS_CHAR :
576 readPosition += op.Length;
578 refPosition += op.Length;
581 case Constants::BAM_CIGAR_SOFTCLIP_CHAR :
583 softClipFound = true;
585 //////////////////////////////////////////////////////////////////////////////
586 // if we are dealing with the *first* CIGAR operation
587 // for this alignment, we increment the read position so that
588 // the read and genome position of the clip are referring to the same base.
589 // For example, in the alignment below, the ref position would be 4, yet
590 // the read position would be 0. Thus, to "sync" the two,
591 // we need to increment the read position by the length of the
593 // Read: ATCGTTTCGTCCCTGC
594 // Ref: GGGATTTCGTCCCTGC
595 // Cigar: SSSSMMMMMMMMMMMM
597 // NOTE: This only needs to be done if the soft clip is the _first_ CIGAR op.
598 //////////////////////////////////////////////////////////////////////////////
600 readPosition += op.Length;
602 // track the soft clip's size, read position, and genome position
603 clipSizes.push_back(op.Length);
604 readPositions.push_back(readPosition);
605 genomePositions.push_back(refPosition);
607 // any other CIGAR operations have no effect
612 // clear our "first pass" flag
613 firstCigarOp = false;
616 // return whether any soft clips found
617 return softClipFound;
620 /*! \fn std::vector<std::string> BamAlignment::GetTagNames(void) const
621 \brief Retrieves the BAM tag names.
623 When paired with GetTagType() and GetTag(), this method allows you
624 to iterate over an alignment's tag data without knowing the names (or types)
627 \return \c vector containing all tag names found (empty if none available)
628 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
630 std::vector<std::string> BamAlignment::GetTagNames(void) const {
632 std::vector<std::string> result;
633 if ( SupportData.HasCoreOnly || TagData.empty() )
636 char* pTagData = (char*)TagData.data();
637 const unsigned int tagDataLength = TagData.size();
638 unsigned int numBytesParsed = 0;
639 while ( numBytesParsed < tagDataLength ) {
641 // get current tag name & type
642 const char* pTagName = pTagData;
643 const char* pTagType = pTagData + 2;
648 result.push_back( std::string(pTagName, 2) );
651 if ( *pTagType == '\0' ) break;
652 if ( !SkipToNextTag(*pTagType, pTagData, numBytesParsed) ) break;
653 if ( *pTagData == '\0' ) break;
659 /*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const
660 \brief Retrieves the BAM tag type-code associated with requested tag name.
662 \param[in] tag 2-character tag name
663 \param[out] type retrieved (1-character) type-code
665 \return \c true if found
666 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
668 bool BamAlignment::GetTagType(const std::string& tag, char& type) const {
670 // skip if alignment is core-only
671 if ( SupportData.HasCoreOnly ) {
672 // TODO: set error string?
676 // skip if no tags present
677 if ( TagData.empty() ) {
678 // TODO: set error string?
682 // localize the tag data
683 char* pTagData = (char*)TagData.data();
684 const unsigned int tagDataLength = TagData.size();
685 unsigned int numBytesParsed = 0;
687 // if tag not found, return failure
688 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ){
689 // TODO: set error string?
693 // otherwise, retrieve & validate tag type code
694 type = *(pTagData - 1);
696 case (Constants::BAM_TAG_TYPE_ASCII) :
697 case (Constants::BAM_TAG_TYPE_INT8) :
698 case (Constants::BAM_TAG_TYPE_UINT8) :
699 case (Constants::BAM_TAG_TYPE_INT16) :
700 case (Constants::BAM_TAG_TYPE_UINT16) :
701 case (Constants::BAM_TAG_TYPE_INT32) :
702 case (Constants::BAM_TAG_TYPE_UINT32) :
703 case (Constants::BAM_TAG_TYPE_FLOAT) :
704 case (Constants::BAM_TAG_TYPE_STRING) :
705 case (Constants::BAM_TAG_TYPE_HEX) :
706 case (Constants::BAM_TAG_TYPE_ARRAY) :
711 const string message = string("invalid tag type: ") + type;
712 SetErrorString("BamAlignment::GetTagType", message);
717 /*! \fn bool BamAlignment::HasTag(const std::string& tag) const
718 \brief Returns true if alignment has a record for requested tag.
720 \param[in] tag 2-character tag name
721 \return \c true if alignment has a record for tag
723 bool BamAlignment::HasTag(const std::string& tag) const {
725 // return false if no tag data present
726 if ( SupportData.HasCoreOnly || TagData.empty() )
729 // localize the tag data for lookup
730 char* pTagData = (char*)TagData.data();
731 const unsigned int tagDataLength = TagData.size();
732 unsigned int numBytesParsed = 0;
734 // if result of tag lookup
735 return FindTag(tag, pTagData, tagDataLength, numBytesParsed);
738 /*! \fn bool BamAlignment::IsDuplicate(void) const
739 \return \c true if this read is a PCR duplicate
741 bool BamAlignment::IsDuplicate(void) const {
742 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_DUPLICATE) != 0 );
745 /*! \fn bool BamAlignment::IsFailedQC(void) const
746 \return \c true if this read failed quality control
748 bool BamAlignment::IsFailedQC(void) const {
749 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_QC_FAILED) != 0 );
752 /*! \fn bool BamAlignment::IsFirstMate(void) const
753 \return \c true if alignment is first mate on paired-end read
755 bool BamAlignment::IsFirstMate(void) const {
756 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_1) != 0 );
759 /*! \fn bool BamAlignment::IsMapped(void) const
760 \return \c true if alignment is mapped
762 bool BamAlignment::IsMapped(void) const {
763 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_UNMAPPED) == 0 );
766 /*! \fn bool BamAlignment::IsMateMapped(void) const
767 \return \c true if alignment's mate is mapped
769 bool BamAlignment::IsMateMapped(void) const {
770 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_UNMAPPED) == 0 );
773 /*! \fn bool BamAlignment::IsMateReverseStrand(void) const
774 \return \c true if alignment's mate mapped to reverse strand
776 bool BamAlignment::IsMateReverseStrand(void) const {
777 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND) != 0 );
780 /*! \fn bool BamAlignment::IsPaired(void) const
781 \return \c true if alignment part of paired-end read
783 bool BamAlignment::IsPaired(void) const {
784 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PAIRED) != 0 );
787 /*! \fn bool BamAlignment::IsPrimaryAlignment(void) const
788 \return \c true if reported position is primary alignment
790 bool BamAlignment::IsPrimaryAlignment(void) const {
791 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_SECONDARY) == 0 );
794 /*! \fn bool BamAlignment::IsProperPair(void) const
795 \return \c true if alignment is part of read that satisfied paired-end resolution
797 bool BamAlignment::IsProperPair(void) const {
798 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PROPER_PAIR) != 0 );
801 /*! \fn bool BamAlignment::IsReverseStrand(void) const
802 \return \c true if alignment mapped to reverse strand
804 bool BamAlignment::IsReverseStrand(void) const {
805 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_REVERSE_STRAND) != 0 );
808 /*! \fn bool BamAlignment::IsSecondMate(void) const
809 \return \c true if alignment is second mate on read
811 bool BamAlignment::IsSecondMate(void) const {
812 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 );
815 /*! \fn bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const
818 Checks that tag name & type strings are expected sizes.
820 \param tag[in] BAM tag name
821 \param type[in] BAM tag type-code
822 \return \c true if both input strings are valid sizes
824 bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const {
825 return (tag.size() == Constants::BAM_TAG_TAGSIZE) &&
826 (type.size() == Constants::BAM_TAG_TYPESIZE);
829 /*! \fn void BamAlignment::RemoveTag(const std::string& tag)
830 \brief Removes field from BAM tags.
832 \param[in] tag 2-character name of field to remove
834 void BamAlignment::RemoveTag(const std::string& tag) {
836 // if char data not populated, do that first
837 if ( SupportData.HasCoreOnly )
840 // skip if no tags available
841 if ( TagData.empty() )
844 // localize the tag data
845 char* pOriginalTagData = (char*)TagData.data();
846 char* pTagData = pOriginalTagData;
847 const unsigned int originalTagDataLength = TagData.size();
848 unsigned int newTagDataLength = 0;
849 unsigned int numBytesParsed = 0;
851 // skip if tag not found
852 if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
855 // otherwise, remove it
856 RaiiBuffer newTagData(originalTagDataLength);
858 // copy original tag data up til desired tag
861 const unsigned int beginningTagDataLength = numBytesParsed;
862 newTagDataLength += beginningTagDataLength;
863 memcpy(newTagData.Buffer, pOriginalTagData, numBytesParsed);
865 // attemp to skip to next tag
866 const char* pTagStorageType = pTagData + 2;
869 if ( SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) {
871 // squeeze remaining tag data
872 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
873 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
874 memcpy(newTagData.Buffer + beginningTagDataLength, pTagData, endTagDataLength );
876 // save modified tag data in alignment
877 TagData.assign(newTagData.Buffer, beginningTagDataLength + endTagDataLength);
881 /*! \fn void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const
884 Sets a formatted error string for this alignment.
886 \param[in] where class/method where error occurred
887 \param[in] what description of error
889 void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const {
890 static const string SEPARATOR = ": ";
891 ErrorString = where + SEPARATOR + what;
894 /*! \fn void BamAlignment::SetIsDuplicate(bool ok)
895 \brief Sets value of "PCR duplicate" flag to \a ok.
897 void BamAlignment::SetIsDuplicate(bool ok) {
898 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_DUPLICATE;
899 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_DUPLICATE;
902 /*! \fn void BamAlignment::SetIsFailedQC(bool ok)
903 \brief Sets "failed quality control" flag to \a ok.
905 void BamAlignment::SetIsFailedQC(bool ok) {
906 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_QC_FAILED;
907 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_QC_FAILED;
910 /*! \fn void BamAlignment::SetIsFirstMate(bool ok)
911 \brief Sets "alignment is first mate" flag to \a ok.
913 void BamAlignment::SetIsFirstMate(bool ok) {
914 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_1;
915 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_1;
918 /*! \fn void BamAlignment::SetIsMapped(bool ok)
919 \brief Sets "alignment is mapped" flag to \a ok.
921 void BamAlignment::SetIsMapped(bool ok) {
922 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_UNMAPPED;
923 else AlignmentFlag |= Constants::BAM_ALIGNMENT_UNMAPPED;
926 /*! \fn void BamAlignment::SetIsMateMapped(bool ok)
927 \brief Sets "alignment's mate is mapped" flag to \a ok.
929 void BamAlignment::SetIsMateMapped(bool ok) {
930 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
931 else AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
934 /*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok)
935 \brief Sets "alignment's mate mapped to reverse strand" flag to \a ok.
937 void BamAlignment::SetIsMateReverseStrand(bool ok) {
938 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND;
939 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND;
942 /*! \fn void BamAlignment::SetIsPaired(bool ok)
943 \brief Sets "alignment part of paired-end read" flag to \a ok.
945 void BamAlignment::SetIsPaired(bool ok) {
946 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PAIRED;
947 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PAIRED;
950 /*! \fn void BamAlignment::SetIsPrimaryAlignment(bool ok)
951 \brief Sets "position is primary alignment" flag to \a ok.
953 void BamAlignment::SetIsPrimaryAlignment(bool ok) {
954 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_SECONDARY;
955 else AlignmentFlag |= Constants::BAM_ALIGNMENT_SECONDARY;
958 /*! \fn void BamAlignment::SetIsProperPair(bool ok)
959 \brief Sets "alignment is part of read that satisfied paired-end resolution" flag to \a ok.
961 void BamAlignment::SetIsProperPair(bool ok) {
962 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PROPER_PAIR;
963 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PROPER_PAIR;
966 /*! \fn void BamAlignment::SetIsReverseStrand(bool ok)
967 \brief Sets "alignment mapped to reverse strand" flag to \a ok.
969 void BamAlignment::SetIsReverseStrand(bool ok) {
970 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_REVERSE_STRAND;
971 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_REVERSE_STRAND;
974 /*! \fn void BamAlignment::SetIsSecondMate(bool ok)
975 \brief Sets "alignment is second mate on read" flag to \a ok.
977 void BamAlignment::SetIsSecondMate(bool ok) {
978 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_2;
979 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_2;
982 /*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const
985 Moves to next available tag in tag data string
987 \param[in] storageType BAM tag type-code that determines how far to move cursor
988 \param[in,out] pTagData pointer to current position (cursor) in tag string
989 \param[in,out] numBytesParsed report of how many bytes were parsed (cumulatively)
991 \return \c if storageType was a recognized BAM tag type
993 \post \a pTagData will point to the byte where the next tag data begins.
994 \a numBytesParsed will correspond to the cursor's position in the full TagData string.
996 bool BamAlignment::SkipToNextTag(const char storageType,
998 unsigned int& numBytesParsed) const
1000 switch (storageType) {
1002 case (Constants::BAM_TAG_TYPE_ASCII) :
1003 case (Constants::BAM_TAG_TYPE_INT8) :
1004 case (Constants::BAM_TAG_TYPE_UINT8) :
1009 case (Constants::BAM_TAG_TYPE_INT16) :
1010 case (Constants::BAM_TAG_TYPE_UINT16) :
1011 numBytesParsed += sizeof(uint16_t);
1012 pTagData += sizeof(uint16_t);
1015 case (Constants::BAM_TAG_TYPE_FLOAT) :
1016 case (Constants::BAM_TAG_TYPE_INT32) :
1017 case (Constants::BAM_TAG_TYPE_UINT32) :
1018 numBytesParsed += sizeof(uint32_t);
1019 pTagData += sizeof(uint32_t);
1022 case (Constants::BAM_TAG_TYPE_STRING) :
1023 case (Constants::BAM_TAG_TYPE_HEX) :
1024 while( *pTagData ) {
1028 // increment for null-terminator
1033 case (Constants::BAM_TAG_TYPE_ARRAY) :
1037 const char arrayType = *pTagData;
1041 // read number of elements
1042 int32_t numElements;
1043 memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped, if needed
1044 numBytesParsed += sizeof(uint32_t);
1045 pTagData += sizeof(uint32_t);
1047 // calculate number of bytes to skip
1048 int bytesToSkip = 0;
1049 switch (arrayType) {
1050 case (Constants::BAM_TAG_TYPE_INT8) :
1051 case (Constants::BAM_TAG_TYPE_UINT8) :
1052 bytesToSkip = numElements;
1054 case (Constants::BAM_TAG_TYPE_INT16) :
1055 case (Constants::BAM_TAG_TYPE_UINT16) :
1056 bytesToSkip = numElements*sizeof(uint16_t);
1058 case (Constants::BAM_TAG_TYPE_FLOAT) :
1059 case (Constants::BAM_TAG_TYPE_INT32) :
1060 case (Constants::BAM_TAG_TYPE_UINT32) :
1061 bytesToSkip = numElements*sizeof(uint32_t);
1064 const string message = string("invalid binary array type: ") + arrayType;
1065 SetErrorString("BamAlignment::SkipToNextTag", message);
1069 // skip binary array contents
1070 numBytesParsed += bytesToSkip;
1071 pTagData += bytesToSkip;
1076 const string message = string("invalid tag type: ") + storageType;
1077 SetErrorString("BamAlignment::SkipToNextTag", message);
1081 // if we get here, tag skipped OK - return success