1 // ***************************************************************************
2 // BamAlignment.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 17 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
10 #include "api/BamAlignment.h"
11 #include "api/BamConstants.h"
12 using namespace BamTools;
15 /*! \class BamTools::BamAlignment
16 \brief The main BAM alignment data structure.
18 Provides methods to query/modify BAM alignment data fields.
20 /*! \var BamAlignment::Name
23 /*! \var BamAlignment::Length
24 \brief length of query sequence
26 /*! \var BamAlignment::QueryBases
27 \brief 'original' sequence (as reported from sequencing machine)
29 /*! \var BamAlignment::AlignedBases
30 \brief 'aligned' sequence (includes any indels, padding, clipping)
32 /*! \var BamAlignment::Qualities
33 \brief FASTQ qualities (ASCII characters, not numeric values)
35 /*! \var BamAlignment::TagData
36 \brief tag data (use the provided methods to query/modify)
38 /*! \var BamAlignment::RefID
39 \brief ID number for reference sequence
41 /*! \var BamAlignment::Position
42 \brief position (0-based) where alignment starts
44 /*! \var BamAlignment::Bin
45 \brief BAM (standard) index bin number for this alignment
47 /*! \var BamAlignment::MapQuality
48 \brief mapping quality score
50 /*! \var BamAlignment::AlignmentFlag
51 \brief alignment bit-flag (use the provided methods to query/modify)
53 /*! \var BamAlignment::CigarData
54 \brief CIGAR operations for this alignment
56 /*! \var BamAlignment::MateRefID
57 \brief ID number for reference sequence where alignment's mate was aligned
59 /*! \var BamAlignment::MatePosition
60 \brief position (0-based) where alignment's mate starts
62 /*! \var BamAlignment::InsertSize
63 \brief mate-pair insert size
65 /*! \var BamAlignment::Filename
66 \brief name of BAM file which this alignment comes from
69 /*! \fn BamAlignment::BamAlignment(void)
72 BamAlignment::BamAlignment(void)
80 /*! \fn BamAlignment::BamAlignment(const BamAlignment& other)
81 \brief copy constructor
83 BamAlignment::BamAlignment(const BamAlignment& other)
85 , Length(other.Length)
86 , QueryBases(other.QueryBases)
87 , AlignedBases(other.AlignedBases)
88 , Qualities(other.Qualities)
89 , TagData(other.TagData)
91 , Position(other.Position)
93 , MapQuality(other.MapQuality)
94 , AlignmentFlag(other.AlignmentFlag)
95 , CigarData(other.CigarData)
96 , MateRefID(other.MateRefID)
97 , MatePosition(other.MatePosition)
98 , InsertSize(other.InsertSize)
99 , Filename(other.Filename)
100 , SupportData(other.SupportData)
103 /*! \fn BamAlignment::~BamAlignment(void)
106 BamAlignment::~BamAlignment(void) { }
108 /*! \fn bool BamAlignment::BuildCharData(void)
109 \brief Populates alignment string fields (read name, bases, qualities, tag data).
111 An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data.
112 Using that method makes parsing much quicker when only positional data is required.
114 However, if you later want to access the character data fields from such an alignment,
115 use this method to populate those fields. Provides ability to do 'lazy evaluation' of
118 \return \c true if character data populated successfully (or was already available to begin with)
120 bool BamAlignment::BuildCharData(void) {
122 // skip if char data already parsed
123 if ( !SupportData.HasCoreOnly )
126 // check system endianness
127 bool IsBigEndian = BamTools::SystemIsBigEndian();
129 // calculate character lengths/offsets
130 const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE;
131 const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations*4);
132 const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2;
133 const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength;
134 const unsigned int tagDataLength = dataLength - tagDataOffset;
136 // check offsets to see what char data exists
137 const bool hasSeqData = ( seqDataOffset < dataLength );
138 const bool hasQualData = ( qualDataOffset < dataLength );
139 const bool hasTagData = ( tagDataOffset < dataLength );
141 // set up char buffers
142 const char* allCharData = SupportData.AllCharData.data();
143 const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 );
144 const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 );
145 char* tagData = ( hasTagData ? (((char*)allCharData) + tagDataOffset) : (char*)0 );
147 // store alignment name (relies on null char in name as terminator)
148 Name.assign((const char*)(allCharData));
150 // save query sequence
153 QueryBases.reserve(SupportData.QuerySequenceLength);
154 for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
155 const char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
156 QueryBases.append(1, singleBase);
160 // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
163 Qualities.reserve(SupportData.QuerySequenceLength);
164 for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
165 const char singleQuality = static_cast<const char>(qualData[i]+33);
166 Qualities.append(1, singleQuality);
170 // clear previous AlignedBases
171 AlignedBases.clear();
173 // if QueryBases has data, build AlignedBases using CIGAR data
174 // otherwise, AlignedBases will remain empty (this case IS allowed)
175 if ( !QueryBases.empty() ) {
177 // resize AlignedBases
178 AlignedBases.reserve(SupportData.QuerySequenceLength);
180 // iterate over CigarOps
182 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
183 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
184 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
185 const CigarOp& op = (*cigarIter);
189 // for 'M', 'I', '=', 'X' - write bases
190 case (Constants::BAM_CIGAR_MATCH_CHAR) :
191 case (Constants::BAM_CIGAR_INS_CHAR) :
192 case (Constants::BAM_CIGAR_SEQMATCH_CHAR) :
193 case (Constants::BAM_CIGAR_MISMATCH_CHAR) :
194 AlignedBases.append(QueryBases.substr(k, op.Length));
197 // for 'S' - soft clip, do not write bases
198 // but increment placeholder 'k'
199 case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) :
203 // for 'D' - write gap character
204 case (Constants::BAM_CIGAR_DEL_CHAR) :
205 AlignedBases.append(op.Length, Constants::BAM_DNA_DEL);
208 // for 'P' - write padding character
209 case (Constants::BAM_CIGAR_PAD_CHAR) :
210 AlignedBases.append( op.Length, Constants::BAM_DNA_PAD );
213 // for 'N' - write N's, skip bases in original query sequence
214 case (Constants::BAM_CIGAR_REFSKIP_CHAR) :
215 AlignedBases.append( op.Length, Constants::BAM_DNA_N );
218 // for 'H' - hard clip, do nothing to AlignedBases, move to next op
219 case (Constants::BAM_CIGAR_HARDCLIP_CHAR) :
222 // invalid CIGAR op-code
224 const string message = string("invalid CIGAR operation type: ") + op.Type;
225 SetErrorString("BamAlignment::BuildCharData", message);
236 while ( i < tagDataLength ) {
238 i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
239 const char type = tagData[i]; // get tag type at position i
240 ++i; // move i past tag type
244 case(Constants::BAM_TAG_TYPE_ASCII) :
245 case(Constants::BAM_TAG_TYPE_INT8) :
246 case(Constants::BAM_TAG_TYPE_UINT8) :
247 // no endian swapping necessary for single-byte data
251 case(Constants::BAM_TAG_TYPE_INT16) :
252 case(Constants::BAM_TAG_TYPE_UINT16) :
253 BamTools::SwapEndian_16p(&tagData[i]);
254 i += sizeof(uint16_t);
257 case(Constants::BAM_TAG_TYPE_FLOAT) :
258 case(Constants::BAM_TAG_TYPE_INT32) :
259 case(Constants::BAM_TAG_TYPE_UINT32) :
260 BamTools::SwapEndian_32p(&tagData[i]);
261 i += sizeof(uint32_t);
264 case(Constants::BAM_TAG_TYPE_HEX) :
265 case(Constants::BAM_TAG_TYPE_STRING) :
266 // no endian swapping necessary for hex-string/string data
269 // increment one more for null terminator
273 case(Constants::BAM_TAG_TYPE_ARRAY) :
277 const char arrayType = tagData[i];
280 // swap endian-ness of number of elements in place, then retrieve for loop
281 BamTools::SwapEndian_32p(&tagData[i]);
282 uint32_t numElements;
283 memcpy(&numElements, &tagData[i], sizeof(uint32_t));
284 i += sizeof(uint32_t);
286 // swap endian-ness of array elements
287 for ( size_t j = 0; j < numElements; ++j ) {
289 case (Constants::BAM_TAG_TYPE_INT8) :
290 case (Constants::BAM_TAG_TYPE_UINT8) :
291 // no endian-swapping necessary
294 case (Constants::BAM_TAG_TYPE_INT16) :
295 case (Constants::BAM_TAG_TYPE_UINT16) :
296 BamTools::SwapEndian_16p(&tagData[i]);
297 i += sizeof(uint16_t);
299 case (Constants::BAM_TAG_TYPE_FLOAT) :
300 case (Constants::BAM_TAG_TYPE_INT32) :
301 case (Constants::BAM_TAG_TYPE_UINT32) :
302 BamTools::SwapEndian_32p(&tagData[i]);
303 i += sizeof(uint32_t);
306 const string message = string("invalid binary array type: ") + arrayType;
307 SetErrorString("BamAlignment::BuildCharData", message);
315 // invalid tag type-code
317 const string message = string("invalid tag type: ") + type;
318 SetErrorString("BamAlignment::BuildCharData", message);
324 // store tagData in alignment
325 TagData.resize(tagDataLength);
326 memcpy((char*)(TagData.data()), tagData, tagDataLength);
329 // clear core-only flag & return success
330 SupportData.HasCoreOnly = false;
334 /*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) const
337 Searches for requested tag in BAM tag data.
339 \param[in] tag requested 2-character tag name
340 \param[in,out] pTagData pointer to current position in BamAlignment::TagData
341 \param[in] tagDataLength length of BamAlignment::TagData
342 \param[in,out] numBytesParsed number of bytes parsed so far
344 \return \c true if found
346 \post If \a tag is found, \a pTagData will point to the byte where the tag data begins.
347 \a numBytesParsed will correspond to the position in the full TagData string.
350 bool BamAlignment::FindTag(const std::string& tag,
352 const unsigned int& tagDataLength,
353 unsigned int& numBytesParsed) const
356 while ( numBytesParsed < tagDataLength ) {
358 const char* pTagType = pTagData;
359 const char* pTagStorageType = pTagData + 2;
363 // check the current tag, return true on match
364 if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
367 // get the storage class and find the next tag
368 if ( *pTagStorageType == '\0' ) return false;
369 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
370 if ( *pTagData == '\0' ) return false;
373 // checked all tags, none match
377 /*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = false) const
378 \brief Calculates alignment end position, based on its starting position and CIGAR data.
380 \warning The position returned now represents a zero-based, HALF-OPEN interval.
381 In previous versions of BamTools (0.x & 1.x) all intervals were treated
382 as zero-based, CLOSED.
384 \param[in] usePadded Allow inserted bases to affect the reported position. Default is
385 false, so that reported position stays synced with reference
387 \param[in] closedInterval Setting this to true will return a 0-based end coordinate. Default is
388 false, so that his value represents a standard, half-open interval.
390 \return alignment end position
392 int BamAlignment::GetEndPosition(bool usePadded, bool closedInterval) const {
394 // initialize alignment end to starting position
395 int alignEnd = Position;
397 // iterate over cigar operations
398 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
399 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
400 for ( ; cigarIter != cigarEnd; ++cigarIter) {
401 const CigarOp& op = (*cigarIter);
405 // increase end position on CIGAR chars [DMXN=]
406 case Constants::BAM_CIGAR_DEL_CHAR :
407 case Constants::BAM_CIGAR_MATCH_CHAR :
408 case Constants::BAM_CIGAR_MISMATCH_CHAR :
409 case Constants::BAM_CIGAR_REFSKIP_CHAR :
410 case Constants::BAM_CIGAR_SEQMATCH_CHAR :
411 alignEnd += op.Length;
414 // increase end position on insertion, only if @usePadded is true
415 case Constants::BAM_CIGAR_INS_CHAR :
417 alignEnd += op.Length;
420 // all other CIGAR chars do not affect end position
426 // adjust for closedInterval, if requested
427 if ( closedInterval )
434 /*! \fn std::string BamAlignment::GetErrorString(void) const
435 \brief Returns a human-readable description of the last error that occurred
437 This method allows elimination of STDERR pollution. Developers of client code
438 may choose how the messages are displayed to the user, if at all.
440 \return error description
442 std::string BamAlignment::GetErrorString(void) const {
446 /*! \fn bool BamAlignment::GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions, std::vector<int>& genomePositions, bool usePadded = false) const
447 \brief Identifies if an alignment has a soft clip. If so, identifies the
448 sizes of the soft clips, as well as their positions in the read and reference.
450 \param[out] clipSizes vector of the sizes of each soft clip in the alignment
451 \param[out] readPositions vector of the 0-based read locations of each soft clip in the alignment.
452 These positions are basically indexes within the read, not genomic positions.
453 \param[out] genomePositions vector of the 0-based genome locations of each soft clip in the alignment
454 \param[in] usePadded inserted bases affect reported position. Default is false, so that
455 reported position stays 'sync-ed' with reference coordinates.
457 \return \c true if any soft clips were found in the alignment
459 bool BamAlignment::GetSoftClips(vector<int>& clipSizes,
460 vector<int>& readPositions,
461 vector<int>& genomePositions,
462 bool usePadded) const
464 // initialize positions & flags
465 int refPosition = Position;
466 int readPosition = 0;
467 bool softClipFound = false;
468 bool firstCigarOp = true;
470 // iterate over cigar operations
471 vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
472 vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
473 for ( ; cigarIter != cigarEnd; ++cigarIter) {
474 const CigarOp& op = (*cigarIter);
478 // increase both read & genome positions on CIGAR chars [DMXN=]
479 case Constants::BAM_CIGAR_DEL_CHAR :
480 case Constants::BAM_CIGAR_MATCH_CHAR :
481 case Constants::BAM_CIGAR_MISMATCH_CHAR :
482 case Constants::BAM_CIGAR_REFSKIP_CHAR :
483 case Constants::BAM_CIGAR_SEQMATCH_CHAR :
484 refPosition += op.Length;
485 readPosition += op.Length;
488 // increase read position on insertion, genome position only if @usePadded is true
489 case Constants::BAM_CIGAR_INS_CHAR :
490 readPosition += op.Length;
492 refPosition += op.Length;
495 case Constants::BAM_CIGAR_SOFTCLIP_CHAR :
497 softClipFound = true;
499 //////////////////////////////////////////////////////////////////////////////
500 // if we are dealing with the *first* CIGAR operation
501 // for this alignment, we increment the read position so that
502 // the read and genome position of the clip are referring to the same base.
503 // For example, in the alignment below, the ref position would be 4, yet
504 // the read position would be 0. Thus, to "sync" the two,
505 // we need to increment the read position by the length of the
507 // Read: ATCGTTTCGTCCCTGC
508 // Ref: GGGATTTCGTCCCTGC
509 // Cigar: SSSSMMMMMMMMMMMM
511 // NOTE: This only needs to be done if the soft clip is the _first_ CIGAR op.
512 //////////////////////////////////////////////////////////////////////////////
514 readPosition += op.Length;
516 // track the soft clip's size, read position, and genome position
517 clipSizes.push_back(op.Length);
518 readPositions.push_back(readPosition);
519 genomePositions.push_back(refPosition);
521 // any other CIGAR operations have no effect
526 // clear our "first pass" flag
527 firstCigarOp = false;
530 // return whether any soft clips found
531 return softClipFound;
534 /*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const
535 \brief Retrieves the BAM tag type-code associated with requested tag name.
537 \param[in] tag 2-character tag name
538 \param[out] type retrieved (1-character) type-code
540 \return \c true if found
541 \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
543 bool BamAlignment::GetTagType(const std::string& tag, char& type) const {
545 // skip if alignment is core-only
546 if ( SupportData.HasCoreOnly ) {
547 // TODO: set error string?
551 // skip if no tags present
552 if ( TagData.empty() ) {
553 // TODO: set error string?
557 // localize the tag data
558 char* pTagData = (char*)TagData.data();
559 const unsigned int tagDataLength = TagData.size();
560 unsigned int numBytesParsed = 0;
562 // if tag not found, return failure
563 if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ){
564 // TODO: set error string?
568 // otherwise, retrieve & validate tag type code
569 type = *(pTagData - 1);
571 case (Constants::BAM_TAG_TYPE_ASCII) :
572 case (Constants::BAM_TAG_TYPE_INT8) :
573 case (Constants::BAM_TAG_TYPE_UINT8) :
574 case (Constants::BAM_TAG_TYPE_INT16) :
575 case (Constants::BAM_TAG_TYPE_UINT16) :
576 case (Constants::BAM_TAG_TYPE_INT32) :
577 case (Constants::BAM_TAG_TYPE_UINT32) :
578 case (Constants::BAM_TAG_TYPE_FLOAT) :
579 case (Constants::BAM_TAG_TYPE_STRING) :
580 case (Constants::BAM_TAG_TYPE_HEX) :
581 case (Constants::BAM_TAG_TYPE_ARRAY) :
586 const string message = string("invalid tag type: ") + type;
587 SetErrorString("BamAlignment::GetTagType", message);
592 /*! \fn bool BamAlignment::HasTag(const std::string& tag) const
593 \brief Returns true if alignment has a record for requested tag.
595 \param[in] tag 2-character tag name
596 \return \c true if alignment has a record for tag
598 bool BamAlignment::HasTag(const std::string& tag) const {
600 // return false if no tag data present
601 if ( SupportData.HasCoreOnly || TagData.empty() )
604 // localize the tag data for lookup
605 char* pTagData = (char*)TagData.data();
606 const unsigned int tagDataLength = TagData.size();
607 unsigned int numBytesParsed = 0;
609 // if result of tag lookup
610 return FindTag(tag, pTagData, tagDataLength, numBytesParsed);
613 /*! \fn bool BamAlignment::IsDuplicate(void) const
614 \return \c true if this read is a PCR duplicate
616 bool BamAlignment::IsDuplicate(void) const {
617 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_DUPLICATE) != 0 );
620 /*! \fn bool BamAlignment::IsFailedQC(void) const
621 \return \c true if this read failed quality control
623 bool BamAlignment::IsFailedQC(void) const {
624 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_QC_FAILED) != 0 );
627 /*! \fn bool BamAlignment::IsFirstMate(void) const
628 \return \c true if alignment is first mate on paired-end read
630 bool BamAlignment::IsFirstMate(void) const {
631 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_1) != 0 );
634 /*! \fn bool BamAlignment::IsMapped(void) const
635 \return \c true if alignment is mapped
637 bool BamAlignment::IsMapped(void) const {
638 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_UNMAPPED) == 0 );
641 /*! \fn bool BamAlignment::IsMateMapped(void) const
642 \return \c true if alignment's mate is mapped
644 bool BamAlignment::IsMateMapped(void) const {
645 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_UNMAPPED) == 0 );
648 /*! \fn bool BamAlignment::IsMateReverseStrand(void) const
649 \return \c true if alignment's mate mapped to reverse strand
651 bool BamAlignment::IsMateReverseStrand(void) const {
652 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND) != 0 );
655 /*! \fn bool BamAlignment::IsPaired(void) const
656 \return \c true if alignment part of paired-end read
658 bool BamAlignment::IsPaired(void) const {
659 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PAIRED) != 0 );
662 /*! \fn bool BamAlignment::IsPrimaryAlignment(void) const
663 \return \c true if reported position is primary alignment
665 bool BamAlignment::IsPrimaryAlignment(void) const {
666 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_SECONDARY) == 0 );
669 /*! \fn bool BamAlignment::IsProperPair(void) const
670 \return \c true if alignment is part of read that satisfied paired-end resolution
672 bool BamAlignment::IsProperPair(void) const {
673 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PROPER_PAIR) != 0 );
676 /*! \fn bool BamAlignment::IsReverseStrand(void) const
677 \return \c true if alignment mapped to reverse strand
679 bool BamAlignment::IsReverseStrand(void) const {
680 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_REVERSE_STRAND) != 0 );
683 /*! \fn bool BamAlignment::IsSecondMate(void) const
684 \return \c true if alignment is second mate on read
686 bool BamAlignment::IsSecondMate(void) const {
687 return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 );
690 /*! \fn bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const
693 Checks that tag name & type strings are expected sizes.
695 \param tag[in] BAM tag name
696 \param type[in] BAM tag type-code
697 \return \c true if both input strings are valid sizes
699 bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const {
700 return (tag.size() == Constants::BAM_TAG_TAGSIZE) &&
701 (type.size() == Constants::BAM_TAG_TYPESIZE);
704 /*! \fn void BamAlignment::RemoveTag(const std::string& tag)
705 \brief Removes field from BAM tags.
707 \param[in] tag 2-character name of field to remove
709 void BamAlignment::RemoveTag(const std::string& tag) {
711 // if char data not populated, do that first
712 if ( SupportData.HasCoreOnly )
715 // skip if no tags available
716 if ( TagData.empty() )
719 // localize the tag data
720 char* pOriginalTagData = (char*)TagData.data();
721 char* pTagData = pOriginalTagData;
722 const unsigned int originalTagDataLength = TagData.size();
723 unsigned int newTagDataLength = 0;
724 unsigned int numBytesParsed = 0;
726 // skip if tag not found
727 if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
730 // otherwise, remove it
731 RaiiBuffer newTagData(originalTagDataLength);
733 // copy original tag data up til desired tag
736 const unsigned int beginningTagDataLength = numBytesParsed;
737 newTagDataLength += beginningTagDataLength;
738 memcpy(newTagData.Buffer, pOriginalTagData, numBytesParsed);
740 // attemp to skip to next tag
741 const char* pTagStorageType = pTagData + 2;
744 if ( SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) {
746 // squeeze remaining tag data
747 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
748 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
749 memcpy(newTagData.Buffer + beginningTagDataLength, pTagData, endTagDataLength );
751 // save modified tag data in alignment
752 TagData.assign(newTagData.Buffer, beginningTagDataLength + endTagDataLength);
756 /*! \fn void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const
759 Sets a formatted error string for this alignment.
761 \param[in] where class/method where error occurred
762 \param[in] what description of error
764 void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const {
765 static const string SEPARATOR = ": ";
766 ErrorString = where + SEPARATOR + what;
769 /*! \fn void BamAlignment::SetIsDuplicate(bool ok)
770 \brief Sets value of "PCR duplicate" flag to \a ok.
772 void BamAlignment::SetIsDuplicate(bool ok) {
773 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_DUPLICATE;
774 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_DUPLICATE;
777 /*! \fn void BamAlignment::SetIsFailedQC(bool ok)
778 \brief Sets "failed quality control" flag to \a ok.
780 void BamAlignment::SetIsFailedQC(bool ok) {
781 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_QC_FAILED;
782 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_QC_FAILED;
785 /*! \fn void BamAlignment::SetIsFirstMate(bool ok)
786 \brief Sets "alignment is first mate" flag to \a ok.
788 void BamAlignment::SetIsFirstMate(bool ok) {
789 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_1;
790 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_1;
793 /*! \fn void BamAlignment::SetIsMapped(bool ok)
794 \brief Sets "alignment is mapped" flag to \a ok.
796 void BamAlignment::SetIsMapped(bool ok) {
797 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_UNMAPPED;
798 else AlignmentFlag |= Constants::BAM_ALIGNMENT_UNMAPPED;
801 /*! \fn void BamAlignment::SetIsMateMapped(bool ok)
802 \brief Sets "alignment's mate is mapped" flag to \a ok.
804 void BamAlignment::SetIsMateMapped(bool ok) {
805 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
806 else AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
809 /*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok)
810 \brief Sets "alignment's mate mapped to reverse strand" flag to \a ok.
812 void BamAlignment::SetIsMateReverseStrand(bool ok) {
813 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND;
814 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND;
817 /*! \fn void BamAlignment::SetIsPaired(bool ok)
818 \brief Sets "alignment part of paired-end read" flag to \a ok.
820 void BamAlignment::SetIsPaired(bool ok) {
821 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PAIRED;
822 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PAIRED;
825 /*! \fn void BamAlignment::SetIsPrimaryAlignment(bool ok)
826 \brief Sets "position is primary alignment" flag to \a ok.
828 void BamAlignment::SetIsPrimaryAlignment(bool ok) {
829 if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_SECONDARY;
830 else AlignmentFlag |= Constants::BAM_ALIGNMENT_SECONDARY;
833 /*! \fn void BamAlignment::SetIsProperPair(bool ok)
834 \brief Sets "alignment is part of read that satisfied paired-end resolution" flag to \a ok.
836 void BamAlignment::SetIsProperPair(bool ok) {
837 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PROPER_PAIR;
838 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PROPER_PAIR;
841 /*! \fn void BamAlignment::SetIsReverseStrand(bool ok)
842 \brief Sets "alignment mapped to reverse strand" flag to \a ok.
844 void BamAlignment::SetIsReverseStrand(bool ok) {
845 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_REVERSE_STRAND;
846 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_REVERSE_STRAND;
849 /*! \fn void BamAlignment::SetIsSecondMate(bool ok)
850 \brief Sets "alignment is second mate on read" flag to \a ok.
852 void BamAlignment::SetIsSecondMate(bool ok) {
853 if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_2;
854 else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_2;
857 /*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const
860 Moves to next available tag in tag data string
862 \param[in] storageType BAM tag type-code that determines how far to move cursor
863 \param[in,out] pTagData pointer to current position (cursor) in tag string
864 \param[in,out] numBytesParsed report of how many bytes were parsed (cumulatively)
866 \return \c if storageType was a recognized BAM tag type
868 \post \a pTagData will point to the byte where the next tag data begins.
869 \a numBytesParsed will correspond to the cursor's position in the full TagData string.
871 bool BamAlignment::SkipToNextTag(const char storageType,
873 unsigned int& numBytesParsed) const
875 switch (storageType) {
877 case (Constants::BAM_TAG_TYPE_ASCII) :
878 case (Constants::BAM_TAG_TYPE_INT8) :
879 case (Constants::BAM_TAG_TYPE_UINT8) :
884 case (Constants::BAM_TAG_TYPE_INT16) :
885 case (Constants::BAM_TAG_TYPE_UINT16) :
886 numBytesParsed += sizeof(uint16_t);
887 pTagData += sizeof(uint16_t);
890 case (Constants::BAM_TAG_TYPE_FLOAT) :
891 case (Constants::BAM_TAG_TYPE_INT32) :
892 case (Constants::BAM_TAG_TYPE_UINT32) :
893 numBytesParsed += sizeof(uint32_t);
894 pTagData += sizeof(uint32_t);
897 case (Constants::BAM_TAG_TYPE_STRING) :
898 case (Constants::BAM_TAG_TYPE_HEX) :
903 // increment for null-terminator
908 case (Constants::BAM_TAG_TYPE_ARRAY) :
912 const char arrayType = *pTagData;
916 // read number of elements
918 memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped, if needed
919 numBytesParsed += sizeof(uint32_t);
920 pTagData += sizeof(uint32_t);
922 // calculate number of bytes to skip
925 case (Constants::BAM_TAG_TYPE_INT8) :
926 case (Constants::BAM_TAG_TYPE_UINT8) :
927 bytesToSkip = numElements;
929 case (Constants::BAM_TAG_TYPE_INT16) :
930 case (Constants::BAM_TAG_TYPE_UINT16) :
931 bytesToSkip = numElements*sizeof(uint16_t);
933 case (Constants::BAM_TAG_TYPE_FLOAT) :
934 case (Constants::BAM_TAG_TYPE_INT32) :
935 case (Constants::BAM_TAG_TYPE_UINT32) :
936 bytesToSkip = numElements*sizeof(uint32_t);
939 const string message = string("invalid binary array type: ") + arrayType;
940 SetErrorString("BamAlignment::SkipToNextTag", message);
944 // skip binary array contents
945 numBytesParsed += bytesToSkip;
946 pTagData += bytesToSkip;
951 const string message = string("invalid tag type: ") + storageType;
952 SetErrorString("BamAlignment::SkipToNextTag", message);
956 // if we get here, tag skipped OK - return success