X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Fapi%2FBamAlignment.cpp;h=251c5e003e7aa4bf2effcef17fd6919b33f20910;hb=6bb3f902d3f8087112afd4dbf8a783b73b40db8d;hp=0eff5c72e46fba52c137c063989a66dd90e1d52c;hpb=2e049ed7f28881bce09653e60f5aea54bfd7afbf;p=bamtools.git diff --git a/src/api/BamAlignment.cpp b/src/api/BamAlignment.cpp index 0eff5c7..251c5e0 100644 --- a/src/api/BamAlignment.cpp +++ b/src/api/BamAlignment.cpp @@ -2,13 +2,13 @@ // BamAlignment.cpp (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 7 October 2011 (DB) +// Last modified: 4 April 2012 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** -#include -#include +#include "api/BamAlignment.h" +#include "api/BamConstants.h" using namespace BamTools; using namespace std; @@ -25,12 +25,22 @@ using namespace std; */ /*! \var BamAlignment::QueryBases \brief 'original' sequence (as reported from sequencing machine) + + \note Setting this field to "*" indicates that the sequence is not to be stored on output. + In this case, the contents of the Qualities field should be invalidated as well (cleared or marked as "*"). */ /*! \var BamAlignment::AlignedBases \brief 'aligned' sequence (includes any indels, padding, clipping) + + This field will be completely empty after reading from BamReader/BamMultiReader when + QueryBases is empty. */ /*! \var BamAlignment::Qualities \brief FASTQ qualities (ASCII characters, not numeric values) + + \note Setting this field to "*" indicates to BamWriter that the quality scores are not to be stored, + but instead will be output as a sequence of '0xFF'. Otherwise, QueryBases must not be a "*" and + the length of this field should equal the length of QueryBases. */ /*! 
\var BamAlignment::TagData \brief tag data (use the provided methods to query/modify) @@ -70,8 +80,12 @@ using namespace std; \brief constructor */ BamAlignment::BamAlignment(void) - : RefID(-1) + : Length(0) + , RefID(-1) , Position(-1) + , Bin(0) + , MapQuality(0) + , AlignmentFlag(0) , MateRefID(-1) , MatePosition(-1) , InsertSize(0) @@ -105,31 +119,6 @@ BamAlignment::BamAlignment(const BamAlignment& other) */ BamAlignment::~BamAlignment(void) { } -///*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) -// \brief Adds a field with string data to the BAM tags. - -// Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - -// \param[in] tag 2-character tag name -// \param[in] type 1-character tag type (must be "Z" or "H") -// \param[in] value string data to store -// \return \c true if the \b new tag was added successfully -// \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -//*/ - - -///*! \fn bool AddTag(const std::string& tag, const std::vector& values); -// \brief Adds a numeric array field to the BAM tags. - -// Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - -// \param tag 2-character tag name -// \param values vector of uint8_t values to store - -// \return \c true if the \b new tag was added successfully -// \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -//*/ - /*! \fn bool BamAlignment::BuildCharData(void) \brief Populates alignment string fields (read name, bases, qualities, tag data). 
@@ -159,22 +148,17 @@ bool BamAlignment::BuildCharData(void) { const unsigned int tagDataLength = dataLength - tagDataOffset; // check offsets to see what char data exists - const bool hasSeqData = ( seqDataOffset < dataLength ); - const bool hasQualData = ( qualDataOffset < dataLength ); + const bool hasSeqData = ( seqDataOffset < qualDataOffset ); + const bool hasQualData = ( qualDataOffset < tagDataOffset ); const bool hasTagData = ( tagDataOffset < dataLength ); - // set up char buffers - const char* allCharData = SupportData.AllCharData.data(); - const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 ); - const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 ); - char* tagData = ( hasTagData ? (((char*)allCharData) + tagDataOffset) : (char*)0 ); - // store alignment name (relies on null char in name as terminator) - Name.assign((const char*)(allCharData)); + Name.assign(SupportData.AllCharData.data()); // save query sequence QueryBases.clear(); if ( hasSeqData ) { + const char* seqData = SupportData.AllCharData.data() + seqDataOffset; QueryBases.reserve(SupportData.QuerySequenceLength); for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) { const char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; @@ -182,13 +166,21 @@ bool BamAlignment::BuildCharData(void) { } } - // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character + // save qualities + Qualities.clear(); if ( hasQualData ) { - Qualities.reserve(SupportData.QuerySequenceLength); - for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) { - const char singleQuality = static_cast(qualData[i]+33); - Qualities.append(1, singleQuality); + const char* qualData = SupportData.AllCharData.data() + qualDataOffset; + + // if marked as unstored (sequence of 0xFF) - don't do conversion, just fill with 0xFFs + if ( qualData[0] == (char)0xFF ) + 
Qualities.resize(SupportData.QuerySequenceLength, (char)0xFF); + + // otherwise convert from numeric QV to 'FASTQ-style' ASCII character + else { + Qualities.reserve(SupportData.QuerySequenceLength); + for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) + Qualities.append(1, qualData[i]+33); } } @@ -197,7 +189,7 @@ bool BamAlignment::BuildCharData(void) { // if QueryBases has data, build AlignedBases using CIGAR data // otherwise, AlignedBases will remain empty (this case IS allowed) - if ( !QueryBases.empty() ) { + if ( !QueryBases.empty() && QueryBases != "*" ) { // resize AlignedBases AlignedBases.reserve(SupportData.QuerySequenceLength); @@ -256,6 +248,9 @@ bool BamAlignment::BuildCharData(void) { // save tag data TagData.clear(); if ( hasTagData ) { + + char* tagData = (((char*)SupportData.AllCharData.data()) + tagDataOffset); + if ( IsBigEndian ) { size_t i = 0; while ( i < tagDataLength ) { @@ -351,49 +346,20 @@ bool BamAlignment::BuildCharData(void) { memcpy((char*)(TagData.data()), tagData, tagDataLength); } - // clear the core-only flag + // clear core-only flag & return success SupportData.HasCoreOnly = false; - - // return success return true; } -///*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) -// \brief Edits a BAM tag field containing string data. - -// If \a tag does not exist, a new entry is created. - -// \param tag 2-character tag name -// \param type 1-character tag type (must be "Z" or "H") -// \param value string data to store - -// \return \c true if the tag was modified/created successfully - -// \sa BamAlignment::RemoveTag() -// \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -//*/ - -///*! \fn bool EditTag(const std::string& tag, const std::vector& values); -// \brief Edits a BAM tag field containing a numeric array. - -// If \a tag does not exist, a new entry is created. 
- -// \param tag 2-character tag name -// \param value vector of uint8_t values to store - -// \return \c true if the tag was modified/created successfully -// \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -//*/ - -/*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) +/*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) const \internal Searches for requested tag in BAM tag data. - \param tag requested 2-character tag name - \param pTagData pointer to current position in BamAlignment::TagData - \param tagDataLength length of BamAlignment::TagData - \param numBytesParsed number of bytes parsed so far + \param[in] tag requested 2-character tag name + \param[in,out] pTagData pointer to current position in BamAlignment::TagData + \param[in] tagDataLength length of BamAlignment::TagData + \param[in,out] numBytesParsed number of bytes parsed so far \return \c true if found @@ -428,37 +394,22 @@ bool BamAlignment::FindTag(const std::string& tag, return false; } -/*! \fn bool BamAlignment::GetEditDistance(uint32_t& editDistance) const - \brief Retrieves value of edit distance tag ("NM"). - - \deprecated Instead use BamAlignment::GetTag() - \code - BamAlignment::GetTag("NM", editDistance); - \endcode - - \param editDistance destination for retrieved value - - \return \c true if found -*/ - -// TODO : REMOVE THIS METHOD -bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { - return GetTag("NM", (uint32_t&)editDistance); -} +/*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = false) const + \brief Calculates alignment end position, based on its starting position and CIGAR data. -/*! 
\fn int BamAlignment::GetEndPosition(bool usePadded = false, bool zeroBased = true) const - \brief Calculates alignment end position, based on starting position and CIGAR data. + \warning The position returned now represents a zero-based, HALF-OPEN interval. + In previous versions of BamTools (0.x & 1.x) all intervals were treated + as zero-based, CLOSED. - \param usePadded Inserted bases affect reported position. Default is false, so that reported - position stays 'sync-ed' with reference coordinates. - \param zeroBased Return (BAM standard) 0-based coordinate. Setting this to false can be useful - when using BAM data with half-open formats (e.g. BED). + \param[in] usePadded Allow inserted bases to affect the reported position. Default is + false, so that reported position stays synced with reference + coordinates. + \param[in] closedInterval Setting this to true will return a 0-based end coordinate. Default is + false, so that this value represents a standard, half-open interval. \return alignment end position */ -int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { - - // TODO: Come back to this for coordinate issues !!! 
+int BamAlignment::GetEndPosition(bool usePadded, bool closedInterval) const { // initialize alignment end to starting position int alignEnd = Position; @@ -467,77 +418,144 @@ int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { vector::const_iterator cigarIter = CigarData.begin(); vector::const_iterator cigarEnd = CigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter) { - const char cigarType = (*cigarIter).Type; - const uint32_t& cigarLength = (*cigarIter).Length; - - if ( cigarType == Constants::BAM_CIGAR_MATCH_CHAR || - cigarType == Constants::BAM_CIGAR_DEL_CHAR || - cigarType == Constants::BAM_CIGAR_REFSKIP_CHAR ) - alignEnd += cigarLength; - else if ( usePadded && cigarType == Constants::BAM_CIGAR_INS_CHAR ) - alignEnd += cigarLength; + const CigarOp& op = (*cigarIter); + + switch ( op.Type ) { + + // increase end position on CIGAR chars [DMXN=] + case Constants::BAM_CIGAR_DEL_CHAR : + case Constants::BAM_CIGAR_MATCH_CHAR : + case Constants::BAM_CIGAR_MISMATCH_CHAR : + case Constants::BAM_CIGAR_REFSKIP_CHAR : + case Constants::BAM_CIGAR_SEQMATCH_CHAR : + alignEnd += op.Length; + break; + + // increase end position on insertion, only if @usePadded is true + case Constants::BAM_CIGAR_INS_CHAR : + if ( usePadded ) + alignEnd += op.Length; + break; + + // all other CIGAR chars do not affect end position + default : + break; + } } - // adjust for zero-based coordinates, if requested - if ( zeroBased ) alignEnd -= 1; + // adjust for closedInterval, if requested + if ( closedInterval ) + alignEnd -= 1; // return result return alignEnd; } /*! \fn std::string BamAlignment::GetErrorString(void) const - \brief Returns a description of the last error that occurred + \brief Returns a human-readable description of the last error that occurred - This method allows elimnation of STDERR pollution. Developers of client code + This method allows elimination of STDERR pollution. 
Developers of client code may choose how the messages are displayed to the user, if at all. - \return description of last error that occurred + \return error description */ std::string BamAlignment::GetErrorString(void) const { return ErrorString; } -/*! \fn bool BamAlignment::GetReadGroup(std::string& readGroup) const - \brief Retrieves value of read group tag ("RG"). - - \deprecated Instead use BamAlignment::GetTag() - \code - BamAlignment::GetTag("RG", readGroup); - \endcode +/*! \fn bool BamAlignment::GetSoftClips(std::vector& clipSizes, std::vector& readPositions, std::vector& genomePositions, bool usePadded = false) const + \brief Identifies if an alignment has a soft clip. If so, identifies the + sizes of the soft clips, as well as their positions in the read and reference. - \param readGroup destination for retrieved value + \param[out] clipSizes vector of the sizes of each soft clip in the alignment + \param[out] readPositions vector of the 0-based read locations of each soft clip in the alignment. + These positions are basically indexes within the read, not genomic positions. + \param[out] genomePositions vector of the 0-based genome locations of each soft clip in the alignment + \param[in] usePadded inserted bases affect reported position. Default is false, so that + reported position stays 'sync-ed' with reference coordinates. - \return \c true if found + \return \c true if any soft clips were found in the alignment */ +bool BamAlignment::GetSoftClips(vector& clipSizes, + vector& readPositions, + vector& genomePositions, + bool usePadded) const +{ + // initialize positions & flags + int refPosition = Position; + int readPosition = 0; + bool softClipFound = false; + bool firstCigarOp = true; -// TODO : REMOVE THIS METHOD -bool BamAlignment::GetReadGroup(std::string& readGroup) const { - return GetTag("RG", readGroup); -} - -///*! 
\fn bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const -// \brief Retrieves the string value associated with a BAM tag. - -// \param tag 2-character tag name -// \param destination destination for retrieved value - -// \return \c true if found -//*/ - -///*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const -// \brief Retrieves the numeric array data associated with a BAM tag + // iterate over cigar operations + vector::const_iterator cigarIter = CigarData.begin(); + vector::const_iterator cigarEnd = CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + const CigarOp& op = (*cigarIter); + + switch ( op.Type ) { + + // increase both read & genome positions on CIGAR chars [DMXN=] + case Constants::BAM_CIGAR_DEL_CHAR : + case Constants::BAM_CIGAR_MATCH_CHAR : + case Constants::BAM_CIGAR_MISMATCH_CHAR : + case Constants::BAM_CIGAR_REFSKIP_CHAR : + case Constants::BAM_CIGAR_SEQMATCH_CHAR : + refPosition += op.Length; + readPosition += op.Length; + break; + + // increase read position on insertion, genome position only if @usePadded is true + case Constants::BAM_CIGAR_INS_CHAR : + readPosition += op.Length; + if ( usePadded ) + refPosition += op.Length; + break; + + case Constants::BAM_CIGAR_SOFTCLIP_CHAR : + + softClipFound = true; + + ////////////////////////////////////////////////////////////////////////////// + // if we are dealing with the *first* CIGAR operation + // for this alignment, we increment the read position so that + // the read and genome position of the clip are referring to the same base. + // For example, in the alignment below, the ref position would be 4, yet + // the read position would be 0. Thus, to "sync" the two, + // we need to increment the read position by the length of the + // soft clip. + // Read: ATCGTTTCGTCCCTGC + // Ref: GGGATTTCGTCCCTGC + // Cigar: SSSSMMMMMMMMMMMM + // + // NOTE: This only needs to be done if the soft clip is the _first_ CIGAR op. 
+ ////////////////////////////////////////////////////////////////////////////// + if ( firstCigarOp ) + readPosition += op.Length; + + // track the soft clip's size, read position, and genome position + clipSizes.push_back(op.Length); + readPositions.push_back(readPosition); + genomePositions.push_back(refPosition); + + // any other CIGAR operations have no effect + default : + break; + } -// \param tag 2-character tag name -// \param destination destination for retrieved data + // clear our "first pass" flag + firstCigarOp = false; + } -// \return \c true if found -//*/ + // return whether any soft clips found + return softClipFound; +} /*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const \brief Retrieves the BAM tag type-code associated with requested tag name. - \param tag 2-character tag name - \param type destination for the retrieved (1-character) tag type + \param[in] tag 2-character tag name + \param[out] type retrieved (1-character) type-code \return \c true if found \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. @@ -593,7 +611,8 @@ bool BamAlignment::GetTagType(const std::string& tag, char& type) const { /*! \fn bool BamAlignment::HasTag(const std::string& tag) const \brief Returns true if alignment has a record for requested tag. - \param tag 2-character tag name + + \param[in] tag 2-character tag name \return \c true if alignment has a record for tag */ bool BamAlignment::HasTag(const std::string& tag) const { @@ -688,17 +707,14 @@ bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 ); } -/*! \fn bool BamAlignment::IsValidSize(const string& tag, const string& type) const +/*! \fn bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const \internal Checks that tag name & type strings are expected sizes. 
- \a tag should have length - \a type should have length 1 - - \param tag BAM tag name - \param type BAM tag type-code - \return \c true if both \a tag and \a type are correct sizes + \param[in] tag BAM tag name + \param[in] type BAM tag type-code + \return \c true if both input strings are valid sizes */ bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const { return (tag.size() == Constants::BAM_TAG_TAGSIZE) && @@ -707,6 +723,8 @@ bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) /*! \fn void BamAlignment::RemoveTag(const std::string& tag) \brief Removes field from BAM tags. + + \param[in] tag 2-character name of field to remove */ void BamAlignment::RemoveTag(const std::string& tag) { @@ -759,6 +777,9 @@ void BamAlignment::RemoveTag(const std::string& tag) { \internal Sets a formatted error string for this alignment. + + \param[in] where class/method where error occurred + \param[in] what description of error */ void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const { static const string SEPARATOR = ": "; @@ -805,15 +826,6 @@ void BamAlignment::SetIsMateMapped(bool ok) { else AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_UNMAPPED; } -/*! \fn void BamAlignment::SetIsMateUnmapped(bool ok) - \brief Complement of using SetIsMateMapped(). - \deprecated For sake of symmetry with the query methods - \sa IsMateMapped(), SetIsMateMapped() -*/ -void BamAlignment::SetIsMateUnmapped(bool ok) { - SetIsMateMapped(!ok); -} - /*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok) \brief Sets "alignment's mate mapped to reverse strand" flag to \a ok. */ @@ -854,15 +866,6 @@ void BamAlignment::SetIsReverseStrand(bool ok) { else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_REVERSE_STRAND; } -/*! \fn void BamAlignment::SetIsSecondaryAlignment(bool ok) - \brief Complement of using SetIsPrimaryAlignment(). 
- \deprecated For sake of symmetry with the query methods - \sa IsPrimaryAlignment(), SetIsPrimaryAlignment() -*/ -void BamAlignment::SetIsSecondaryAlignment(bool ok) { - SetIsPrimaryAlignment(!ok); -} - /*! \fn void BamAlignment::SetIsSecondMate(bool ok) \brief Sets "alignment is second mate on read" flag to \a ok. */ @@ -871,26 +874,18 @@ void BamAlignment::SetIsSecondMate(bool ok) { else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_2; } -/*! \fn void BamAlignment::SetIsUnmapped(bool ok) - \brief Complement of using SetIsMapped(). - \deprecated For sake of symmetry with the query methods - \sa IsMapped(), SetIsMapped() -*/ -void BamAlignment::SetIsUnmapped(bool ok) { - SetIsMapped(!ok); -} - -/*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) +/*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const \internal Moves to next available tag in tag data string - \param storageType BAM tag type-code that determines how far to move cursor - \param pTagData pointer to current position (cursor) in tag string - \param numBytesParsed report of how many bytes were parsed (cumulatively) + \param[in] storageType BAM tag type-code that determines how far to move cursor + \param[in,out] pTagData pointer to current position (cursor) in tag string + \param[in,out] numBytesParsed report of how many bytes were parsed (cumulatively) \return \c if storageType was a recognized BAM tag type - \post \a pTagData will point to the byte where the next tag data begins. + + \post \a pTagData will point to the byte where the next tag data begins. \a numBytesParsed will correspond to the cursor's position in the full TagData string. */ bool BamAlignment::SkipToNextTag(const char storageType,