X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Fapi%2FBamAlignment.cpp;h=78d7d6b22b8801e374d168a4684ce06cca83d86b;hb=9f1ce8c47aeadb6dc1320b52ee671c3341b97935;hp=7cff4b0da8db0728d7a9fff14bed7d518384e16c;hpb=cdf4bbcb19025398d429035fe672661a8c8d1a80;p=bamtools.git diff --git a/src/api/BamAlignment.cpp b/src/api/BamAlignment.cpp index 7cff4b0..78d7d6b 100644 --- a/src/api/BamAlignment.cpp +++ b/src/api/BamAlignment.cpp @@ -1,194 +1,17 @@ // *************************************************************************** // BamAlignment.cpp (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College -// All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 April 2011 (DB) +// Last modified: 10 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** -#include -#include +#include "api/BamAlignment.h" +#include "api/BamConstants.h" using namespace BamTools; - -#include -#include -#include -#include -#include -#include -#include -#include using namespace std; -// internal utility methods -namespace BamTools { -namespace Internal { - -/*! \fn bool IsValidSize(const string& tag, const string& type) - \internal - - Checks that tag name & type strings are expected sizes. - \a tag should have length - \a type should have length 1 - - \param tag BAM tag name - \param type BAM tag type-code - - \return \c true if both \a tag and \a type are correct sizes -*/ -bool IsValidSize(const string& tag, const string& type) { - return (tag.size() == Constants::BAM_TAG_TAGSIZE) && - (type.size() == Constants::BAM_TAG_TYPESIZE); -} - -/*! \fn bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) - \internal - - Moves to next available tag in tag data string - - \param storageType BAM tag type-code that determines how far to move cursor - \param pTagData pointer to current position (cursor) in tag string - \param numBytesParsed report of how many bytes were parsed (cumulatively) - - \return \c if storageType was a recognized BAM tag type - \post \a pTagData will point to the byte where the next tag data begins. - \a numBytesParsed will correspond to the cursor's position in the full TagData string. -*/ -bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { - - switch (storageType) { - - case (Constants::BAM_TAG_TYPE_ASCII) : - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - ++numBytesParsed; - ++pTagData; - break; - - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - numBytesParsed += sizeof(uint16_t); - pTagData += sizeof(uint16_t); - break; - - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - numBytesParsed += sizeof(uint32_t); - pTagData += sizeof(uint32_t); - break; - - case (Constants::BAM_TAG_TYPE_STRING) : - case (Constants::BAM_TAG_TYPE_HEX) : - while( *pTagData ) { - ++numBytesParsed; - ++pTagData; - } - // increment for null-terminator - ++numBytesParsed; - ++pTagData; - break; - - case (Constants::BAM_TAG_TYPE_ARRAY) : - - { - // read array type - const char arrayType = *pTagData; - ++numBytesParsed; - ++pTagData; - - // read number of elements - int32_t numElements; - memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped if necessary - numBytesParsed += sizeof(uint32_t); - pTagData += sizeof(uint32_t); - - // calculate number of bytes to skip - int bytesToSkip = 0; - switch (arrayType) { - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - bytesToSkip = numElements; - break; - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - bytesToSkip = numElements*sizeof(uint16_t); - break; - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - bytesToSkip = numElements*sizeof(uint32_t); - break; - default: - cerr << "BamAlignment ERROR: unknown binary array type encountered: " - << arrayType << endl; - return false; - } - - // skip binary array contents - numBytesParsed += bytesToSkip; - pTagData += bytesToSkip; - break; - } - - default: - cerr << "BamAlignment ERROR: unknown tag type encountered" - << storageType << endl; - return false; - } - - // return success - return true; -} - -/*! \fn bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) - \internal - - Searches for requested tag in BAM tag data. - - \param tag requested 2-character tag name - \param pTagData pointer to current position in BamAlignment::TagData - \param tagDataLength length of BamAlignment::TagData - \param numBytesParsed number of bytes parsed so far - - \return \c true if found - - \post If \a tag is found, \a pTagData will point to the byte where the tag data begins. - \a numBytesParsed will correspond to the position in the full TagData string. - -*/ -bool FindTag(const std::string& tag, - char* &pTagData, - const unsigned int& tagDataLength, - unsigned int& numBytesParsed) -{ - - while ( numBytesParsed < tagDataLength ) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - - // check the current tag, return true on match - if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) - return true; - - // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; - if ( *pTagData == '\0' ) return false; - } - - // checked all tags, none match - return false; -} - -} // namespace Internal -} // namespace BamTools - /*! \class BamTools::BamAlignment \brief The main BAM alignment data structure. @@ -282,729 +105,86 @@ BamAlignment::BamAlignment(const BamAlignment& other) */ BamAlignment::~BamAlignment(void) { } -/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) - \brief Adds a field with string data to the BAM tags. - - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - - \param tag 2-character tag name - \param type 1-character tag type (must be "Z" or "H") - \param value string data to store - - \return \c true if the \b new tag was added successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // validate tag/type size & that type is OK for string value - if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING && - type.at(0) != Constants::BAM_TAG_TYPE_HEX - ) - { - return false; - } - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // otherwise, copy tag data to temp buffer - string newTag = tag + type + value; - const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) - \brief Adds a field with unsigned integer data to the BAM tags. +/*! \fn bool BamAlignment::BuildCharData(void) + \brief Populates alignment string fields (read name, bases, qualities, tag data). - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data. + Using that method makes parsing much quicker when only positional data is required. - \param tag 2-character tag name - \param type 1-character tag type (must NOT be "f", "Z", "H", or "B") - \param value unsigned int data to store + However, if you later want to access the character data fields from such an alignment, + use this method to populate those fields. Provides ability to do 'lazy evaluation' of + alignment parsing. - \return \c true if the \b new tag was added successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. + \return \c true if character data populated successfully (or was already available to begin with) */ -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // validate tag/type size & that type is OK for uint32_t value - if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT || - type.at(0) == Constants::BAM_TAG_TYPE_STRING || - type.at(0) == Constants::BAM_TAG_TYPE_HEX || - type.at(0) == Constants::BAM_TAG_TYPE_ARRAY - ) - { - return false; - } - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // otherwise, convert value to string - union { uint32_t value; char valueBuffer[sizeof(uint32_t)]; } un; - un.value = value; - - // copy original tag data to temp buffer - string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(uint32_t)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) - \brief Adds a field with signed integer data to the BAM tags. +bool BamAlignment::BuildCharData(void) { - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + // skip if char data already parsed + if ( !SupportData.HasCoreOnly ) + return true; - \param tag 2-character tag name - \param type 1-character tag type (must NOT be "f", "Z", "H", or "B") - \param value signed int data to store + // check system endianness + bool IsBigEndian = BamTools::SystemIsBigEndian(); - \return \c true if the \b new tag was added successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) { - return AddTag(tag, type, (const uint32_t&)value); -} + // calculate character lengths/offsets + const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE; + const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations*4); + const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2; + const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength; + const unsigned int tagDataLength = dataLength - tagDataOffset; -/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) - \brief Adds a field with floating-point data to the BAM tags. + // check offsets to see what char data exists + const bool hasSeqData = ( seqDataOffset < dataLength ); + const bool hasQualData = ( qualDataOffset < dataLength ); + const bool hasTagData = ( tagDataOffset < dataLength ); - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + // set up char buffers + const char* allCharData = SupportData.AllCharData.data(); + const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 ); + const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 ); + char* tagData = ( hasTagData ? (((char*)allCharData) + tagDataOffset) : (char*)0 ); - \param tag 2-character tag name - \param type 1-character tag type (must NOT be "Z", "H", or "B") - \param value float data to store + // store alignment name (relies on null char in name as terminator) + Name.assign((const char*)(allCharData)); - \return \c true if the \b new tag was added successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // validate tag/type size & that type is OK for float value - if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING || - type.at(0) == Constants::BAM_TAG_TYPE_HEX || - type.at(0) == Constants::BAM_TAG_TYPE_ARRAY - ) - { - return false; + // save query sequence + QueryBases.clear(); + if ( hasSeqData ) { + QueryBases.reserve(SupportData.QuerySequenceLength); + for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) { + const char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; + QueryBases.append(1, singleBase); + } } - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // otherwise, convert value to string - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - - // copy original tag data to temp buffer - string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -/*! \fn bool AddTag(const std::string& tag, const std::vector& values); - \brief Adds a numeric array field to the BAM tags. + // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character + Qualities.clear(); + if ( hasQualData ) { + Qualities.reserve(SupportData.QuerySequenceLength); + for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) { + const char singleQuality = static_cast(qualData[i]+33); + Qualities.append(1, singleQuality); + } + } - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + // clear previous AlignedBases + AlignedBases.clear(); - \param tag 2-character tag name - \param values vector of uint8_t values to store + // if QueryBases has data, build AlignedBases using CIGAR data + // otherwise, AlignedBases will remain empty (this case IS allowed) + if ( !QueryBases.empty() ) { - \return \c true if the \b new tag was added successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { + // resize AlignedBases + AlignedBases.reserve(SupportData.QuerySequenceLength); - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; + // iterate over CigarOps + int k = 0; + vector::const_iterator cigarIter = CigarData.begin(); + vector::const_iterator cigarEnd = CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter ) { + const CigarOp& op = (*cigarIter); - // check for valid tag length - if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // build new tag's base information - char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; - memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); - newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; - newTagBase[3] = Constants::BAM_TAG_TYPE_UINT8; - - // add number of array elements to newTagBase - const int32_t numElements = values.size(); - memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); - - // copy current TagData string to temp buffer, leaving room for new tag's contents - const int newTagDataLength = tagDataLength + - Constants::BAM_TAG_ARRAYBASE_SIZE + - numElements*sizeof(uint8_t); - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term - - // write newTagBase (removes old null term) - strcat(originalTagData + tagDataLength, (const char*)newTagBase); - - // add vector elements to tag - int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; - for ( int i = 0 ; i < numElements; ++i ) { - const uint8_t value = values.at(i); - memcpy(originalTagData + elementsBeginOffset + i*sizeof(uint8_t), - &value, sizeof(uint8_t)); - } - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -/*! \fn bool AddTag(const std::string& tag, const std::vector& values); - \brief Adds a numeric array field to the BAM tags. - - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - - \param tag 2-character tag name - \param values vector of int8_t values to store - - \return \c true if the \b new tag was added successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // check for valid tag length - if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // build new tag's base information - char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; - memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); - newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; - newTagBase[3] = Constants::BAM_TAG_TYPE_INT8; - - // add number of array elements to newTagBase - const int32_t numElements = values.size(); - memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); - - // copy current TagData string to temp buffer, leaving room for new tag's contents - const int newTagDataLength = tagDataLength + - Constants::BAM_TAG_ARRAYBASE_SIZE + - numElements*sizeof(int8_t); - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term - - // write newTagBase (removes old null term) - strcat(originalTagData + tagDataLength, (const char*)newTagBase); - - // add vector elements to tag - int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; - for ( int i = 0 ; i < numElements; ++i ) { - const int8_t value = values.at(i); - memcpy(originalTagData + elementsBeginOffset + i*sizeof(int8_t), - &value, sizeof(int8_t)); - } - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -/*! \fn bool AddTag(const std::string& tag, const std::vector& values); - \brief Adds a numeric array field to the BAM tags. - - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - - \param tag 2-character tag name - \param values vector of uint16_t values to store - - \return \c true if the \b new tag was added successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // check for valid tag length - if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // build new tag's base information - char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; - memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); - newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; - newTagBase[3] = Constants::BAM_TAG_TYPE_UINT16; - - // add number of array elements to newTagBase - const int32_t numElements = values.size(); - memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); - - // copy current TagData string to temp buffer, leaving room for new tag's contents - const int newTagDataLength = tagDataLength + - Constants::BAM_TAG_ARRAYBASE_SIZE + - numElements*sizeof(uint16_t); - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term - - // write newTagBase (removes old null term) - strcat(originalTagData + tagDataLength, (const char*)newTagBase); - - // add vector elements to tag - int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; - for ( int i = 0 ; i < numElements; ++i ) { - const uint16_t value = values.at(i); - memcpy(originalTagData + elementsBeginOffset + i*sizeof(uint16_t), - &value, sizeof(uint16_t)); - } - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -/*! \fn bool AddTag(const std::string& tag, const std::vector& values); - \brief Adds a numeric array field to the BAM tags. - - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - - \param tag 2-character tag name - \param values vector of int16_t values to store - - \return \c true if the \b new tag was added successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // check for valid tag length - if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // build new tag's base information - char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; - memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); - newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; - newTagBase[3] = Constants::BAM_TAG_TYPE_INT16; - - // add number of array elements to newTagBase - const int32_t numElements = values.size(); - memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); - - // copy current TagData string to temp buffer, leaving room for new tag's contents - const int newTagDataLength = tagDataLength + - Constants::BAM_TAG_ARRAYBASE_SIZE + - numElements*sizeof(int16_t); - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term - - // write newTagBase (removes old null term) - strcat(originalTagData + tagDataLength, (const char*)newTagBase); - - // add vector elements to tag - int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; - for ( int i = 0 ; i < numElements; ++i ) { - const int16_t value = values.at(i); - memcpy(originalTagData + elementsBeginOffset + i*sizeof(int16_t), - &value, sizeof(int16_t)); - } - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -/*! \fn bool AddTag(const std::string& tag, const std::vector& values); - \brief Adds a numeric array field to the BAM tags. - - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - - \param tag 2-character tag name - \param values vector of uint32_t values to store - - \return \c true if the \b new tag was added successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // check for valid tag length - if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // build new tag's base information - char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; - memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); - newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; - newTagBase[3] = Constants::BAM_TAG_TYPE_UINT32; - - // add number of array elements to newTagBase - const int32_t numElements = values.size(); - memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); - - // copy current TagData string to temp buffer, leaving room for new tag's contents - const int newTagDataLength = tagDataLength + - Constants::BAM_TAG_ARRAYBASE_SIZE + - numElements*sizeof(uint32_t); - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term - - // write newTagBase (removes old null term) - strcat(originalTagData + tagDataLength, (const char*)newTagBase); - - // add vector elements to tag - int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; - for ( int i = 0 ; i < numElements; ++i ) { - const uint32_t value = values.at(i); - memcpy(originalTagData + elementsBeginOffset + i*sizeof(uint32_t), - &value, sizeof(uint32_t)); - } - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -/*! \fn bool AddTag(const std::string& tag, const std::vector& values); - \brief Adds a numeric array field to the BAM tags. - - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - - \param tag 2-character tag name - \param values vector of int32_t values to store - - \return \c true if the \b new tag was added successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // check for valid tag length - if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // build new tag's base information - char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; - memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); - newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; - newTagBase[3] = Constants::BAM_TAG_TYPE_INT32; - - // add number of array elements to newTagBase - const int32_t numElements = values.size(); - memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); - - // copy current TagData string to temp buffer, leaving room for new tag's contents - const int newTagDataLength = tagDataLength + - Constants::BAM_TAG_ARRAYBASE_SIZE + - numElements*sizeof(int32_t); - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term - - // write newTagBase (removes old null term) - strcat(originalTagData + tagDataLength, (const char*)newTagBase); - - // add vector elements to tag - int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; - for ( int i = 0 ; i < numElements; ++i ) { - const int32_t value = values.at(i); - memcpy(originalTagData + elementsBeginOffset + i*sizeof(int32_t), - &value, sizeof(int32_t)); - } - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -/*! \fn bool AddTag(const std::string& tag, const std::vector& values); - \brief Adds a numeric array field to the BAM tags. - - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - - \param tag 2-character tag name - \param values vector of float values to store - - \return \c true if the \b new tag was added successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // check for valid tag length - if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // build new tag's base information - char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; - memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); - newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; - newTagBase[3] = Constants::BAM_TAG_TYPE_FLOAT; - - // add number of array elements to newTagBase - const int32_t numElements = values.size(); - memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); - - // copy current TagData string to temp buffer, leaving room for new tag's contents - const int newTagDataLength = tagDataLength + - Constants::BAM_TAG_ARRAYBASE_SIZE + - numElements*sizeof(float); - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term - - // write newTagBase (removes old null term) - strcat(originalTagData + tagDataLength, (const char*)newTagBase); - - // add vector elements to tag - int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; - for ( int i = 0 ; i < numElements; ++i ) { - const float value = values.at(i); - memcpy(originalTagData + elementsBeginOffset + i*sizeof(float), - &value, sizeof(float)); - } - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -/*! \fn bool BamAlignment::BuildCharData(void) - \brief Populates alignment string fields (read name, bases, qualities, tag data). - - An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data. - Using that method makes parsing much quicker when only positional data is required. - - However, if you later want to access the character data fields from such an alignment, - use this method to populate those fields. Provides ability to do 'lazy evaluation' of - alignment parsing. - - \return \c true if character data populated successfully (or was already available to begin with) -*/ -bool BamAlignment::BuildCharData(void) { - - // skip if char data already parsed - if ( !SupportData.HasCoreOnly ) - return true; - - // check system endianness - bool IsBigEndian = BamTools::SystemIsBigEndian(); - - // calculate character lengths/offsets - const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE; - const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations * 4); - const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2; - const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength; - const unsigned int tagDataLength = dataLength - tagDataOffset; - - // check offsets to see what char data exists - const bool hasSeqData = ( seqDataOffset < dataLength ); - const bool hasQualData = ( qualDataOffset < dataLength ); - const bool hasTagData = ( tagDataOffset < dataLength ); - - // set up char buffers - const char* allCharData = SupportData.AllCharData.data(); - const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 ); - const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 ); - char* tagData = ( hasTagData ? (((char*)allCharData) + tagDataOffset) : (char*)0 ); - - // store alignment name (relies on null char in name as terminator) - Name.assign((const char*)(allCharData)); - - // save query sequence - QueryBases.clear(); - if ( hasSeqData ) { - QueryBases.reserve(SupportData.QuerySequenceLength); - for (unsigned int i = 0; i < SupportData.QuerySequenceLength; ++i) { - char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; - QueryBases.append(1, singleBase); - } - } - - // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character - Qualities.clear(); - if ( hasQualData ) { - Qualities.reserve(SupportData.QuerySequenceLength); - for (unsigned int i = 0; i < SupportData.QuerySequenceLength; ++i) { - char singleQuality = (char)(qualData[i]+33); - Qualities.append(1, singleQuality); - } - } - - // clear previous AlignedBases - AlignedBases.clear(); - - // if QueryBases has data, build AlignedBases using CIGAR data - // otherwise, AlignedBases will remain empty (this case IS allowed) - if ( !QueryBases.empty() ) { - - // resize AlignedBases - AlignedBases.reserve(SupportData.QuerySequenceLength); - - // iterate over CigarOps - int k = 0; - vector::const_iterator cigarIter = CigarData.begin(); - vector::const_iterator cigarEnd = CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter ) { - const CigarOp& op = (*cigarIter); - - switch (op.Type) { + switch ( op.Type ) { // for 'M', 'I', '=', 'X' - write bases case (Constants::BAM_CIGAR_MATCH_CHAR) : @@ -1039,11 +219,11 @@ bool BamAlignment::BuildCharData(void) { case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : break; - // shouldn't get here + // invalid CIGAR op-code default: - cerr << "BamAlignment ERROR: invalid CIGAR operation type: " - << op.Type << endl; - exit(1); + const string message = string("invalid CIGAR operation type: ") + op.Type; + SetErrorString("BamAlignment::BuildCharData", message); + return false; } } } @@ -1052,8 +232,8 @@ bool BamAlignment::BuildCharData(void) { TagData.clear(); if ( hasTagData ) { if ( IsBigEndian ) { - int i = 0; - while ( (unsigned int)i < tagDataLength ) { + size_t i = 0; + while ( i < tagDataLength ) { i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) const char type = tagData[i]; // get tag type at position i @@ -1099,12 +279,12 @@ bool BamAlignment::BuildCharData(void) { // swap endian-ness of number of elements in place, then retrieve for loop BamTools::SwapEndian_32p(&tagData[i]); - int32_t numElements; + uint32_t numElements; memcpy(&numElements, &tagData[i], sizeof(uint32_t)); i += sizeof(uint32_t); // swap endian-ness of array elements - for ( int j = 0; j < numElements; ++j ) { + for ( size_t j = 0; j < numElements; ++j ) { switch (arrayType) { case (Constants::BAM_TAG_TYPE_INT8) : case (Constants::BAM_TAG_TYPE_UINT8) : @@ -1123,9 +303,8 @@ bool BamAlignment::BuildCharData(void) { i += sizeof(uint32_t); break; default: - // error case - cerr << "BamAlignment ERROR: unknown binary array type encountered: " - << arrayType << endl; + const string message = string("invalid binary array type: ") + arrayType; + SetErrorString("BamAlignment::BuildCharData", message); return false; } } @@ -1133,1019 +312,199 @@ bool BamAlignment::BuildCharData(void) { break; } - // shouldn't get here + // invalid tag type-code default : - cerr << "BamAlignment ERROR: invalid tag value type: " - << type << endl; - exit(1); + const string message = string("invalid tag type: ") + type; + SetErrorString("BamAlignment::BuildCharData", message); + return false; } } } // store tagData in alignment TagData.resize(tagDataLength); - memcpy((char*)TagData.data(), tagData, tagDataLength); + memcpy((char*)(TagData.data()), tagData, tagDataLength); } - // clear the core-only flag + // clear core-only flag & return success SupportData.HasCoreOnly = false; - - // return success return true; } -/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) - \brief Edits a BAM tag field containing string data. - - If \a tag does not exist, a new entry is created. - - \param tag 2-character tag name - \param type 1-character tag type (must be "Z" or "H") - \param value string data to store - - \return \c true if the tag was modified/created successfully - - \sa BamAlignment::RemoveTag() - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // validate tag/type size & that type is OK for string value - if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING && - type.at(0) != Constants::BAM_TAG_TYPE_HEX ) - return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found - if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + value.size()]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new @value in place of current tag data - const unsigned int dataLength = strlen(value.c_str()); - memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 ); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) - return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1; - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) - \brief Edits a BAM tag field containing unsigned integer data. - - If \a tag does not exist, a new entry is created. - - \param tag 2-character tag name - \param type 1-character tag type (must NOT be "f", "Z", "H", or "B") - \param value unsigned integer data to store - - \return \c true if the tag was modified/created successfully - - \sa BamAlignment::RemoveTag() - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // validate tag/type size & that type is OK for uint32_t value - if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT || - type.at(0) == Constants::BAM_TAG_TYPE_STRING || - type.at(0) == Constants::BAM_TAG_TYPE_HEX || - type.at(0) == Constants::BAM_TAG_TYPE_ARRAY - ) - { - return false; - } - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found - if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new @value in place of current tag data - union { uint32_t value; char valueBuffer[sizeof(uint32_t)]; } un; - un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(uint32_t)); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) - return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(uint32_t); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) - \brief Edits a BAM tag field containing signed integer data. - - If \a tag does not exist, a new entry is created. - - \param tag 2-character tag name - \param type 1-character tag type (must NOT be "f", "Z", "H", or "B") - \param value signed integer data to store - - \return \c true if the tag was modified/created successfully - - \sa BamAlignment::RemoveTag() - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) { - return EditTag(tag, type, (const uint32_t&)value); -} - -/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) - \brief Edits a BAM tag field containing floating-point data. - - If \a tag does not exist, a new entry is created. - - \param tag 2-character tag name - \param type 1-character tag type (must NOT be "Z", "H", or "B") - \param value float data to store - - \return \c true if the tag was modified/created successfully - - \sa BamAlignment::RemoveTag() - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // validate tag/type size & that type is OK for float value - if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING || - type.at(0) == Constants::BAM_TAG_TYPE_HEX || - type.at(0) == Constants::BAM_TAG_TYPE_ARRAY - ) - { - return false; - } - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found - if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new @value in place of current tag data - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) - return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(float); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -/*! \fn bool EditTag(const std::string& tag, const std::vector& values); - \brief Edits a BAM tag field containing a numeric array. - - If \a tag does not exist, a new entry is created. - - \param tag 2-character tag name - \param value vector of uint8_t values to store - - \return \c true if the tag was modified/created successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { - - // can't do anything if TagData not parsed - if ( SupportData.HasCoreOnly ) - return false; - - // remove existing tag if present - if ( HasTag(tag) ) - RemoveTag(tag); - - // add tag record with new values - return AddTag(tag, values); -} - -/*! \fn bool EditTag(const std::string& tag, const std::vector& values); - \brief Edits a BAM tag field containing a numeric array. - - If \a tag does not exist, a new entry is created. - - \param tag 2-character tag name - \param value vector of int8_t values to store - - \return \c true if the tag was modified/created successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { - - // can't do anything if TagData not parsed - if ( SupportData.HasCoreOnly ) - return false; - - // remove existing tag if present - if ( HasTag(tag) ) - RemoveTag(tag); - - // add tag record with new values - return AddTag(tag, values); -} - -/*! \fn bool EditTag(const std::string& tag, const std::vector& values); - \brief Edits a BAM tag field containing a numeric array. - - If \a tag does not exist, a new entry is created. - - \param tag 2-character tag name - \param value vector of uint16_t values to store - - \return \c true if the tag was modified/created successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { - - // can't do anything if TagData not parsed - if ( SupportData.HasCoreOnly ) - return false; - - // remove existing tag if present - if ( HasTag(tag) ) - RemoveTag(tag); - - // add tag record with new values - return AddTag(tag, values); -} - -/*! \fn bool EditTag(const std::string& tag, const std::vector& values); - \brief Edits a BAM tag field containing a numeric array. - - If \a tag does not exist, a new entry is created. - - \param tag 2-character tag name - \param value vector of int16_t values to store - - \return \c true if the tag was modified/created successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { - - // can't do anything if TagData not parsed - if ( SupportData.HasCoreOnly ) - return false; - - // remove existing tag if present - if ( HasTag(tag) ) - RemoveTag(tag); - - // add tag record with new values - return AddTag(tag, values); -} - -/*! \fn bool EditTag(const std::string& tag, const std::vector& values); - \brief Edits a BAM tag field containing a numeric array. - - If \a tag does not exist, a new entry is created. - - \param tag 2-character tag name - \param value vector of uint32_t values to store - - \return \c true if the tag was modified/created successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { - - // can't do anything if TagData not parsed - if ( SupportData.HasCoreOnly ) - return false; - - // remove existing tag if present - if ( HasTag(tag) ) - RemoveTag(tag); - - // add tag record with new values - return AddTag(tag, values); -} - -/*! \fn bool EditTag(const std::string& tag, const std::vector& values); - \brief Edits a BAM tag field containing a numeric array. - - If \a tag does not exist, a new entry is created. - - \param tag 2-character tag name - \param value vector of int32_t values to store - - \return \c true if the tag was modified/created successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { - - // can't do anything if TagData not parsed - if ( SupportData.HasCoreOnly ) - return false; - - // remove existing tag if present - if ( HasTag(tag) ) - RemoveTag(tag); - - // add tag record with new values - return AddTag(tag, values); -} - -/*! \fn bool EditTag(const std::string& tag, const std::vector& values); - \brief Edits a BAM tag field containing a numeric array. - - If \a tag does not exist, a new entry is created. - - \param tag 2-character tag name - \param value vector of float values to store - - \return \c true if the tag was modified/created successfully - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { - - // can't do anything if TagData not parsed - if ( SupportData.HasCoreOnly ) - return false; - - // remove existing tag if present - if ( HasTag(tag) ) - RemoveTag(tag); - - // add tag record with new values - return AddTag(tag, values); -} - -/*! \fn bool BamAlignment::GetEditDistance(uint32_t& editDistance) const - \brief Retrieves value of edit distance tag ("NM"). - - \deprecated Instead use BamAlignment::GetTag() - \code - BamAlignment::GetTag("NM", editDistance); - \endcode - - \param editDistance destination for retrieved value - - \return \c true if found -*/ -bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { - return GetTag("NM", (uint32_t&)editDistance); -} - -/*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool zeroBased = true) const - \brief Calculates alignment end position, based on starting position and CIGAR data. - - \param usePadded Inserted bases affect reported position. Default is false, so that reported - position stays 'sync-ed' with reference coordinates. - \param zeroBased Return (BAM standard) 0-based coordinate. Setting this to false can be useful - when using BAM data with half-open formats (e.g. BED). - - \return alignment end position -*/ -int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { - - // initialize alignment end to starting position - int alignEnd = Position; - - // iterate over cigar operations - vector::const_iterator cigarIter = CigarData.begin(); - vector::const_iterator cigarEnd = CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter) { - const char cigarType = (*cigarIter).Type; - const uint32_t& cigarLength = (*cigarIter).Length; - - if ( cigarType == Constants::BAM_CIGAR_MATCH_CHAR || - cigarType == Constants::BAM_CIGAR_DEL_CHAR || - cigarType == Constants::BAM_CIGAR_REFSKIP_CHAR ) - alignEnd += cigarLength; - else if ( usePadded && cigarType == Constants::BAM_CIGAR_INS_CHAR ) - alignEnd += cigarLength; - } - - // adjust for zero-based coordinates, if requested - if ( zeroBased ) alignEnd -= 1; - - // return result - return alignEnd; -} - -/*! \fn bool BamAlignment::GetReadGroup(std::string& readGroup) const - \brief Retrieves value of read group tag ("RG"). - - \deprecated Instead use BamAlignment::GetTag() - \code - BamAlignment::GetTag("RG", readGroup); - \endcode - - \param readGroup destination for retrieved value - - \return \c true if found -*/ -bool BamAlignment::GetReadGroup(std::string& readGroup) const { - return GetTag("RG", readGroup); -} - -/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const - \brief Retrieves the string value associated with a BAM tag. - - \param tag 2-character tag name - \param destination destination for retrieved value - - \return \c true if found -*/ -bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - const unsigned int dataLength = strlen(pTagData); - destination.clear(); - destination.resize(dataLength); - memcpy( (char*)destination.data(), pTagData, dataLength ); - return true; - } - - // tag not found, return failure - return false; -} - -/*! \fn bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const - \brief Retrieves the unsigned integer value associated with a BAM tag. - - \param tag 2-character tag name - \param destination destination for retrieved value - - \return \c true if found -*/ -bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - - // determine data byte-length - const char type = *(pTagData - 1); - int destinationLength = 0; - switch (type) { - - // 1 byte data - case (Constants::BAM_TAG_TYPE_ASCII) : - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - destinationLength = 1; - break; - - // 2 byte data - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - destinationLength = 2; - break; - - // 4 byte data - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - destinationLength = 4; - break; - - // unsupported type for integer destination (float or var-length strings) - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_STRING) : - case (Constants::BAM_TAG_TYPE_HEX) : - case (Constants::BAM_TAG_TYPE_ARRAY) : - cerr << "BamAlignment ERROR: cannot store tag of type " << type - << " in integer destination" << endl; - return false; - - // unknown tag type - default: - cerr << "BamAlignment ERROR: unknown tag type encountered: " - << type << endl; - return false; - } - - // store in destination - destination = 0; - memcpy(&destination, pTagData, destinationLength); - return true; - } - - // tag not found, return failure - return false; -} - -/*! \fn bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const - \brief Retrieves the signed integer value associated with a BAM tag. - - \param tag 2-character tag name - \param destination destination for retrieved value - - \return \c true if found -*/ -bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const { - return GetTag(tag, (uint32_t&)destination); -} - -/*! \fn bool BamAlignment::GetTag(const std::string& tag, float& destination) const - \brief Retrieves the floating-point value associated with a BAM tag. - - \param tag 2-character tag name - \param destination destination for retrieved value - - \return \c true if found -*/ -bool BamAlignment::GetTag(const std::string& tag, float& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - - // determine data byte-length - const char type = *(pTagData - 1); - int destinationLength = 0; - switch (type) { - - // 1 byte data - case (Constants::BAM_TAG_TYPE_ASCII) : - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - destinationLength = 1; - break; - - // 2 byte data - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - destinationLength = 2; - break; - - // 4 byte data - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - destinationLength = 4; - break; - - // unsupported type (var-length strings) - case (Constants::BAM_TAG_TYPE_STRING) : - case (Constants::BAM_TAG_TYPE_HEX) : - case (Constants::BAM_TAG_TYPE_ARRAY) : - cerr << "BamAlignment ERROR: cannot store tag of type " << type - << " in float destination" << endl; - return false; - - // unknown tag type - default: - cerr << "BamAlignment ERROR: unknown tag type encountered: " - << type << endl; - return false; - } - - // store in destination - destination = 0.0; - memcpy(&destination, pTagData, destinationLength); - return true; - } - - // tag not found, return failure - return false; -} +/*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) const + \internal -/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const - \brief Retrieves the numeric array data associated with a BAM tag + Searches for requested tag in BAM tag data. - \param tag 2-character tag name - \param destination destination for retrieved data + \param[in] tag requested 2-character tag name + \param[in,out] pTagData pointer to current position in BamAlignment::TagData + \param[in] tagDataLength length of BamAlignment::TagData + \param[in,out] numBytesParsed number of bytes parsed so far \return \c true if found -*/ -bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // return false if tag not found - if ( !Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // check that tag is array type - const char tagType = *(pTagData - 1); - if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) { - cerr << "BamAlignment ERROR: Cannot store non-array data from tag: " - << tag << " in array destination" << endl; - return false; - } - - // calculate length of each element in tag's array - const char elementType = *pTagData; - ++pTagData; - int elementLength = 0; - switch ( elementType ) { - case (Constants::BAM_TAG_TYPE_ASCII) : - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - elementLength = sizeof(uint8_t); - break; + \post If \a tag is found, \a pTagData will point to the byte where the tag data begins. + \a numBytesParsed will correspond to the position in the full TagData string. - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - elementLength = sizeof(uint16_t); - break; +*/ +bool BamAlignment::FindTag(const std::string& tag, + char*& pTagData, + const unsigned int& tagDataLength, + unsigned int& numBytesParsed) const +{ - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - elementLength = sizeof(uint32_t); - break; + while ( numBytesParsed < tagDataLength ) { - // unsupported type for integer destination (float or var-length data) - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_STRING) : - case (Constants::BAM_TAG_TYPE_HEX) : - case (Constants::BAM_TAG_TYPE_ARRAY) : - cerr << "BamAlignment ERROR: array element type: " << elementType - << " cannot be stored in integer value" << endl; - return false; + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; - // unknown tag type - default: - cerr << "BamAlignment ERROR: unknown element type encountered: " - << elementType << endl; - return false; - } + // check the current tag, return true on match + if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) + return true; - // get number of elements - int32_t numElements; - memcpy(&numElements, pTagData, sizeof(int32_t)); - pTagData += 4; - destination.clear(); - destination.reserve(numElements); - - // read in elements - uint32_t value; - for ( int i = 0 ; i < numElements; ++i ) { - memcpy(&value, pTagData, sizeof(uint32_t)); - pTagData += sizeof(uint32_t); - destination.push_back(value); + // get the storage class and find the next tag + if ( *pTagStorageType == '\0' ) return false; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; + if ( *pTagData == '\0' ) return false; } - // return success + // checked all tags, none match return false; } -/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const - \brief Retrieves the numeric array data associated with a BAM tag +/*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = false) const + \brief Calculates alignment end position, based on its starting position and CIGAR data. - \param tag 2-character tag name - \param destination destination for retrieved data + \warning The position returned now represents a zero-based, HALF-OPEN interval. + In previous versions of BamTools (0.x & 1.x) all intervals were treated + as zero-based, CLOSED. - \return \c true if found -*/ -bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const { + \param[in] usePadded Allow inserted bases to affect the reported position. Default is + false, so that reported position stays synced with reference + coordinates. + \param[in] closedInterval Setting this to true will return a 0-based end coordinate. Default is + false, so that his value represents a standard, half-open interval. - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; + \return alignment end position +*/ +int BamAlignment::GetEndPosition(bool usePadded, bool closedInterval) const { - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; + // initialize alignment end to starting position + int alignEnd = Position; - // return false if tag not found - if ( !Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; + // iterate over cigar operations + vector::const_iterator cigarIter = CigarData.begin(); + vector::const_iterator cigarEnd = CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + const CigarOp& op = (*cigarIter); - // check that tag is array type - const char tagType = *(pTagData - 1); - if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) { - cerr << "BamAlignment ERROR: Cannot store non-array data from tag: " - << tag << " in array destination" << endl; - return false; - } + switch ( op.Type ) { - // calculate length of each element in tag's array - const char elementType = *pTagData; - ++pTagData; - int elementLength = 0; - switch ( elementType ) { - case (Constants::BAM_TAG_TYPE_ASCII) : - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - elementLength = sizeof(uint8_t); - break; + // increase end position on CIGAR chars [DMXN=] + case Constants::BAM_CIGAR_DEL_CHAR : + case Constants::BAM_CIGAR_MATCH_CHAR : + case Constants::BAM_CIGAR_MISMATCH_CHAR : + case Constants::BAM_CIGAR_REFSKIP_CHAR : + case Constants::BAM_CIGAR_SEQMATCH_CHAR : + alignEnd += op.Length; + break; - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - elementLength = sizeof(uint16_t); - break; + // increase end position on insertion, only if @usePadded is true + case Constants::BAM_CIGAR_INS_CHAR : + if ( usePadded ) + alignEnd += op.Length; + break; - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - elementLength = sizeof(uint32_t); - break; + // all other CIGAR chars do not affect end position + default : + break; + } + } - // unsupported type for integer destination (float or var-length data) - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_STRING) : - case (Constants::BAM_TAG_TYPE_HEX) : - case (Constants::BAM_TAG_TYPE_ARRAY) : - cerr << "BamAlignment ERROR: array element type: " << elementType - << " cannot be stored in integer value" << endl; - return false; + // adjust for closedInterval, if requested + if ( closedInterval ) + alignEnd -= 1; - // unknown tag type - default: - cerr << "BamAlignment ERROR: unknown element type encountered: " - << elementType << endl; - return false; - } + // return result + return alignEnd; +} - // get number of elements - int32_t numElements; - memcpy(&numElements, pTagData, sizeof(int32_t)); - pTagData += 4; - destination.clear(); - destination.reserve(numElements); - - // read in elements - int32_t value; - for ( int i = 0 ; i < numElements; ++i ) { - memcpy(&value, pTagData, sizeof(int32_t)); - pTagData += sizeof(int32_t); - destination.push_back(value); - } +/*! \fn std::string BamAlignment::GetErrorString(void) const + \brief Returns a human-readable description of the last error that occurred - // return success - return false; + This method allows elimination of STDERR pollution. Developers of client code + may choose how the messages are displayed to the user, if at all. + \return error description +*/ +std::string BamAlignment::GetErrorString(void) const { + return ErrorString; } -/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const - \brief Retrieves the numeric array data associated with a BAM tag +/*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const + \brief Retrieves the BAM tag type-code associated with requested tag name. - \param tag 2-character tag name - \param destination destination for retrieved data + \param[in] tag 2-character tag name + \param[out] type retrieved (1-character) type-code \return \c true if found + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. */ -bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const { +bool BamAlignment::GetTagType(const std::string& tag, char& type) const { + + // skip if alignment is core-only + if ( SupportData.HasCoreOnly ) { + // TODO: set error string? + return false; + } - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) + // skip if no tags present + if ( TagData.empty() ) { + // TODO: set error string? return false; + } // localize the tag data char* pTagData = (char*)TagData.data(); const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - - // return false if tag not found - if ( !Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // check that tag is array type - const char tagType = *(pTagData - 1); - if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) { - cerr << "BamAlignment ERROR: Cannot store non-array data from tag: " - << tag << " in array destination" << endl; + + // if tag not found, return failure + if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ){ + // TODO: set error string? return false; } - // calculate length of each element in tag's array - const char elementType = *pTagData; - ++pTagData; - int elementLength = 0; - switch ( elementType ) { - case (Constants::BAM_TAG_TYPE_ASCII) : - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - elementLength = sizeof(uint8_t); - break; - + // otherwise, retrieve & validate tag type code + type = *(pTagData - 1); + switch ( type ) { + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : case (Constants::BAM_TAG_TYPE_INT16) : case (Constants::BAM_TAG_TYPE_UINT16) : - elementLength = sizeof(uint16_t); - break; - case (Constants::BAM_TAG_TYPE_INT32) : case (Constants::BAM_TAG_TYPE_UINT32) : case (Constants::BAM_TAG_TYPE_FLOAT) : - elementLength = sizeof(uint32_t); - break; - - // unsupported type for float destination (var-length data) case (Constants::BAM_TAG_TYPE_STRING) : case (Constants::BAM_TAG_TYPE_HEX) : case (Constants::BAM_TAG_TYPE_ARRAY) : - cerr << "BamAlignment ERROR: array element type: " << elementType - << " cannot be stored in float value" << endl; - return false; + return true; // unknown tag type default: - cerr << "BamAlignment ERROR: unknown element type encountered: " - << elementType << endl; + const string message = string("invalid tag type: ") + type; + SetErrorString("BamAlignment::GetTagType", message); return false; } - - // get number of elements - int32_t numElements; - memcpy(&numElements, pTagData, sizeof(int32_t)); - pTagData += 4; - destination.clear(); - destination.reserve(numElements); - - // read in elements - float value; - for ( int i = 0 ; i < numElements; ++i ) { - memcpy(&value, pTagData, sizeof(float)); - pTagData += sizeof(float); - destination.push_back(value); - } - - // return success - return false; -} - -/*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const - \brief Retrieves the BAM tag type-code associated with requested tag name. - - \param tag 2-character tag name - \param type destination for the retrieved (1-character) tag type - - \return \c true if found - \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::GetTagType(const std::string& tag, char& type) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // lookup tag - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - - // retrieve tag type code - type = *(pTagData - 1); - - // validate that type is a proper BAM tag type - switch (type) { - case (Constants::BAM_TAG_TYPE_ASCII) : - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_STRING) : - case (Constants::BAM_TAG_TYPE_HEX) : - case (Constants::BAM_TAG_TYPE_ARRAY) : - return true; - - // unknown tag type - default: - cerr << "BamAlignment ERROR: unknown tag type encountered: " - << type << endl; - return false; - } - } - - // tag not found, return failure - return false; } /*! \fn bool BamAlignment::HasTag(const std::string& tag) const \brief Returns true if alignment has a record for requested tag. - \param tag 2-character tag name + + \param[in] tag 2-character tag name \return \c true if alignment has a record for tag */ bool BamAlignment::HasTag(const std::string& tag) const { @@ -2160,7 +519,7 @@ bool BamAlignment::HasTag(const std::string& tag) const { unsigned int numBytesParsed = 0; // if result of tag lookup - return Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed); + return FindTag(tag, pTagData, tagDataLength, numBytesParsed); } /*! \fn bool BamAlignment::IsDuplicate(void) const @@ -2240,16 +599,34 @@ bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 ); } -/*! \fn bool BamAlignment::RemoveTag(const std::string& tag) +/*! \fn bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const + \internal + + Checks that tag name & type strings are expected sizes. + + \param tag[in] BAM tag name + \param type[in] BAM tag type-code + \return \c true if both input strings are valid sizes +*/ +bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const { + return (tag.size() == Constants::BAM_TAG_TAGSIZE) && + (type.size() == Constants::BAM_TAG_TYPESIZE); +} + +/*! \fn void BamAlignment::RemoveTag(const std::string& tag) \brief Removes field from BAM tags. - \return \c true if tag was removed successfully (or didn't exist before) + \param[in] tag 2-character name of field to remove */ -bool BamAlignment::RemoveTag(const std::string& tag) { +void BamAlignment::RemoveTag(const std::string& tag) { - // skip if no tag data available - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; + // if char data not populated, do that first + if ( SupportData.HasCoreOnly ) + BuildCharData(); + + // skip if no tags available + if ( TagData.empty() ) + return; // localize the tag data char* pOriginalTagData = (char*)TagData.data(); @@ -2257,38 +634,48 @@ bool BamAlignment::RemoveTag(const std::string& tag) { const unsigned int originalTagDataLength = TagData.size(); unsigned int newTagDataLength = 0; unsigned int numBytesParsed = 0; - - // if tag found - if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - char newTagData[originalTagDataLength]; - - // copy original tag data up til desired tag - pTagData -= 3; - numBytesParsed -= 3; - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) - return true; - - // copy everything from current tag (the next one after tag for removal) to end + + // skip if tag not found + if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) + return; + + // otherwise, remove it + RaiiBuffer newTagData(originalTagDataLength); + + // copy original tag data up til desired tag + pTagData -= 3; + numBytesParsed -= 3; + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData.Buffer, pOriginalTagData, numBytesParsed); + + // attemp to skip to next tag + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + if ( SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) { + + // squeeze remaining tag data const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); - - // save new tag data - TagData.assign(newTagData, beginningTagDataLength + endTagDataLength); - return true; + memcpy(newTagData.Buffer + beginningTagDataLength, pTagData, endTagDataLength ); + + // save modified tag data in alignment + TagData.assign(newTagData.Buffer, beginningTagDataLength + endTagDataLength); } - - // tag not found, no removal - return failure - return false; +} + +/*! \fn void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const + \internal + + Sets a formatted error string for this alignment. + + \param[in] where class/method where error occurred + \param[in] what description of error +*/ +void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const { + static const string SEPARATOR = ": "; + ErrorString = where + SEPARATOR + what; } /*! \fn void BamAlignment::SetIsDuplicate(bool ok) @@ -2331,15 +718,6 @@ void BamAlignment::SetIsMateMapped(bool ok) { else AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_UNMAPPED; } -/*! \fn void BamAlignment::SetIsMateUnmapped(bool ok) - \brief Complement of using SetIsMateMapped(). - \deprecated For sake of symmetry with the query methods - \sa IsMateMapped(), SetIsMateMapped() -*/ -void BamAlignment::SetIsMateUnmapped(bool ok) { - SetIsMateMapped(!ok); -} - /*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok) \brief Sets "alignment's mate mapped to reverse strand" flag to \a ok. */ @@ -2380,15 +758,6 @@ void BamAlignment::SetIsReverseStrand(bool ok) { else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_REVERSE_STRAND; } -/*! \fn void BamAlignment::SetIsSecondaryAlignment(bool ok) - \brief Complement of using SetIsPrimaryAlignment(). - \deprecated For sake of symmetry with the query methods - \sa IsPrimaryAlignment(), SetIsPrimaryAlignment() -*/ -void BamAlignment::SetIsSecondaryAlignment(bool ok) { - SetIsPrimaryAlignment(!ok); -} - /*! \fn void BamAlignment::SetIsSecondMate(bool ok) \brief Sets "alignment is second mate on read" flag to \a ok. */ @@ -2397,11 +766,105 @@ void BamAlignment::SetIsSecondMate(bool ok) { else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_2; } -/*! \fn void BamAlignment::SetIsUnmapped(bool ok) - \brief Complement of using SetIsMapped(). - \deprecated For sake of symmetry with the query methods - \sa IsMapped(), SetIsMapped() +/*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const + \internal + + Moves to next available tag in tag data string + + \param[in] storageType BAM tag type-code that determines how far to move cursor + \param[in,out] pTagData pointer to current position (cursor) in tag string + \param[in,out] numBytesParsed report of how many bytes were parsed (cumulatively) + + \return \c if storageType was a recognized BAM tag type + + \post \a pTagData will point to the byte where the next tag data begins. + \a numBytesParsed will correspond to the cursor's position in the full TagData string. */ -void BamAlignment::SetIsUnmapped(bool ok) { - SetIsMapped(!ok); +bool BamAlignment::SkipToNextTag(const char storageType, + char*& pTagData, + unsigned int& numBytesParsed) const +{ + switch (storageType) { + + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + ++numBytesParsed; + ++pTagData; + break; + + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + numBytesParsed += sizeof(uint16_t); + pTagData += sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + numBytesParsed += sizeof(uint32_t); + pTagData += sizeof(uint32_t); + break; + + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + while( *pTagData ) { + ++numBytesParsed; + ++pTagData; + } + // increment for null-terminator + ++numBytesParsed; + ++pTagData; + break; + + case (Constants::BAM_TAG_TYPE_ARRAY) : + + { + // read array type + const char arrayType = *pTagData; + ++numBytesParsed; + ++pTagData; + + // read number of elements + int32_t numElements; + memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped, if needed + numBytesParsed += sizeof(uint32_t); + pTagData += sizeof(uint32_t); + + // calculate number of bytes to skip + int bytesToSkip = 0; + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + bytesToSkip = numElements; + break; + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + bytesToSkip = numElements*sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + bytesToSkip = numElements*sizeof(uint32_t); + break; + default: + const string message = string("invalid binary array type: ") + arrayType; + SetErrorString("BamAlignment::SkipToNextTag", message); + return false; + } + + // skip binary array contents + numBytesParsed += bytesToSkip; + pTagData += bytesToSkip; + break; + } + + default: + const string message = string("invalid tag type: ") + storageType; + SetErrorString("BamAlignment::SkipToNextTag", message); + return false; + } + + // if we get here, tag skipped OK - return success + return true; }