X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Fapi%2FBamAlignment.cpp;h=980303ae6050f116b4e9197de9d10d5e259af9ec;hb=11fabb69eb8c86635dd9498679b72bf78b3af3d1;hp=162e1958b38802c8984eadf636726ef6efaf479f;hpb=8c80d760637f8df39262683cd2570f0589423d36;p=bamtools.git diff --git a/src/api/BamAlignment.cpp b/src/api/BamAlignment.cpp index 162e195..980303a 100644 --- a/src/api/BamAlignment.cpp +++ b/src/api/BamAlignment.cpp @@ -1,9 +1,8 @@ // *************************************************************************** // BamAlignment.cpp (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College -// All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 4 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** @@ -17,135 +16,11 @@ using namespace BamTools; #include #include #include +#include #include #include using namespace std; -// internal utility methods -namespace BamTools { -namespace Internal { - -/*! \fn bool IsValidSize(const string& tag, const string& type) - \internal - - Checks that tag name & type strings are expected sizes. - \a tag should have length - \a type should have length 1 - - \param tag BAM tag name - \param type BAM tag type-code - - \return \c true if both \a tag and \a type are correct sizes -*/ -bool IsValidSize(const string& tag, const string& type) { - return (tag.size() == Constants::BAM_TAG_TAGSIZE) && - (type.size() == Constants::BAM_TAG_TYPESIZE); -} - -/*! \fn bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) - \internal - - Moves to next available tag in tag data string - - \param storageType BAM tag type-code that determines how far to move cursor - \param pTagData pointer to current position (cursor) in tag string - \param numBytesParsed report of how many bytes were parsed (cumulatively) - - \return \c if storageType was a recognized BAM tag type - \post \a pTagData will point to the byte where the next tag data begins. - \a numBytesParsed will correspond to the cursor's position in the full TagData string. -*/ -bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { - - switch (storageType) { - - case (Constants::BAM_TAG_TYPE_ASCII) : - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - ++numBytesParsed; - ++pTagData; - break; - - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - numBytesParsed += 2; - pTagData += 2; - break; - - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - numBytesParsed += 4; - pTagData += 4; - break; - - case (Constants::BAM_TAG_TYPE_STRING) : - case (Constants::BAM_TAG_TYPE_HEX) : - while(*pTagData) { - ++numBytesParsed; - ++pTagData; - } - // increment for null-terminator - ++numBytesParsed; - ++pTagData; - break; - - default: - // error case - fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", storageType); - return false; - } - - // return success - return true; -} - -/*! \fn bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) - \internal - - Searches for requested tag in BAM tag data. - - \param tag requested 2-character tag name - \param pTagData pointer to current position in BamAlignment::TagData - \param tagDataLength length of BamAlignment::TagData - \param numBytesParsed number of bytes parsed so far - - \return \c true if found - - \post If \a tag is found, \a pTagData will point to the byte where the tag data begins. - \a numBytesParsed will correspond to the position in the full TagData string. - -*/ -bool FindTag(const std::string& tag, - char* &pTagData, - const unsigned int& tagDataLength, - unsigned int& numBytesParsed) -{ - - while ( numBytesParsed < tagDataLength ) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - - // check the current tag, return true on match - if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) - return true; - - // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; - if ( *pTagData == '\0' ) return false; - } - - // checked all tags, none match - return false; -} - -} // namespace Internal -} // namespace BamTools - /*! \class BamTools::BamAlignment \brief The main BAM alignment data structure. @@ -239,189 +114,30 @@ BamAlignment::BamAlignment(const BamAlignment& other) */ BamAlignment::~BamAlignment(void) { } -/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) - \brief Adds a field with string data to the BAM tags. - - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - - \param tag 2-character tag name - \param type 1-character tag type (must be "Z" or "H") - \param value string data to store - - \return \c true if the \b new tag was added successfully - - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // validate tag/type size & that type is OK for string value - if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING && - type.at(0) != Constants::BAM_TAG_TYPE_HEX ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // otherwise, copy tag data to temp buffer - string newTag = tag + type + value; - const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} +///*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) +// \brief Adds a field with string data to the BAM tags. -/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) - \brief Adds a field with unsigned integer data to the BAM tags. +// Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. +// \param[in] tag 2-character tag name +// \param[in] type 1-character tag type (must be "Z" or "H") +// \param[in] value string data to store +// \return \c true if the \b new tag was added successfully +// \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +//*/ - \param tag 2-character tag name - \param type 1-character tag type (must NOT be "f", "Z", or "H") - \param value unsigned int data to store - \return \c true if the \b new tag was added successfully - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // validate tag/type size & that type is OK for uint32_t value - if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT || - type.at(0) == Constants::BAM_TAG_TYPE_STRING || - type.at(0) == Constants::BAM_TAG_TYPE_HEX ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // otherwise, convert value to string - union { uint32_t value; char valueBuffer[sizeof(uint32_t)]; } un; - un.value = value; - - // copy original tag data to temp buffer - string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(uint32_t)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) - \brief Adds a field with signed integer data to the BAM tags. - - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. +///*! \fn bool AddTag(const std::string& tag, const std::vector& values); +// \brief Adds a numeric array field to the BAM tags. - \param tag 2-character tag name - \param type 1-character tag type (must NOT be "f", "Z", or "H") - \param value signed int data to store +// Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - \return \c true if the \b new tag was added successfully - - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) { - return AddTag(tag, type, (const uint32_t&)value); -} - -/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) - \brief Adds a field with floating-point data to the BAM tags. - - Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. - - \param tag 2-character tag name - \param type 1-character tag type (must NOT be "Z" or "H") - \param value float data to store +// \param tag 2-character tag name +// \param values vector of uint8_t values to store - \return \c true if the \b new tag was added successfully - - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // validate tag/type size & that type is OK for float value - if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING || - type.at(0) == Constants::BAM_TAG_TYPE_HEX ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) - return false; - - // otherwise, convert value to string - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - - // copy original tag data to temp buffer - string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} +// \return \c true if the \b new tag was added successfully +// \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +//*/ /*! \fn bool BamAlignment::BuildCharData(void) \brief Populates alignment string fields (read name, bases, qualities, tag data). @@ -504,9 +220,11 @@ bool BamAlignment::BuildCharData(void) { switch (op.Type) { - // for 'M', 'I' - write bases - case (Constants::BAM_CIGAR_MATCH_CHAR) : - case (Constants::BAM_CIGAR_INS_CHAR) : + // for 'M', 'I', '=', 'X' - write bases + case (Constants::BAM_CIGAR_MATCH_CHAR) : + case (Constants::BAM_CIGAR_INS_CHAR) : + case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : + case (Constants::BAM_CIGAR_MISMATCH_CHAR) : AlignedBases.append(QueryBases.substr(k, op.Length)); // fall through @@ -537,7 +255,8 @@ bool BamAlignment::BuildCharData(void) { // shouldn't get here default: - fprintf(stderr, "BamAlignment ERROR: invalid CIGAR operation type: %c\n", op.Type); + cerr << "BamAlignment ERROR: invalid CIGAR operation type: " + << op.Type << endl; exit(1); } } @@ -559,6 +278,7 @@ bool BamAlignment::BuildCharData(void) { case(Constants::BAM_TAG_TYPE_ASCII) : case(Constants::BAM_TAG_TYPE_INT8) : case(Constants::BAM_TAG_TYPE_UINT8) : + // no endian swapping necessary for single-byte data ++i; break; @@ -578,14 +298,59 @@ bool BamAlignment::BuildCharData(void) { case(Constants::BAM_TAG_TYPE_HEX) : case(Constants::BAM_TAG_TYPE_STRING) : // no endian swapping necessary for hex-string/string data - while (tagData[i]) { ++i; } + while ( tagData[i] ) + ++i; // increment one more for null terminator ++i; break; + case(Constants::BAM_TAG_TYPE_ARRAY) : + + { + // read array type + const char arrayType = tagData[i]; + ++i; + + // swap endian-ness of number of elements in place, then retrieve for loop + BamTools::SwapEndian_32p(&tagData[i]); + int32_t numElements; + memcpy(&numElements, &tagData[i], sizeof(uint32_t)); + i += sizeof(uint32_t); + + // swap endian-ness of array elements + for ( int j = 0; j < numElements; ++j ) { + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + // no endian-swapping necessary + ++i; + break; + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + default: + // error case + cerr << "BamAlignment ERROR: unknown binary array type encountered: " + << arrayType << endl; + return false; + } + } + + break; + } + // shouldn't get here default : - fprintf(stderr, "BamAlignment ERROR: invalid tag value type: %c\n", type); + cerr << "BamAlignment ERROR: invalid tag value type: " + << type << endl; exit(1); } } @@ -603,242 +368,74 @@ bool BamAlignment::BuildCharData(void) { return true; } -/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) - \brief Edits a BAM tag field containing string data. +///*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) +// \brief Edits a BAM tag field containing string data. - If \a tag does not exist, a new entry is created. +// If \a tag does not exist, a new entry is created. - \param tag 2-character tag name - \param type 1-character tag type (must be "Z" or "H") - \param value string data to store +// \param tag 2-character tag name +// \param type 1-character tag type (must be "Z" or "H") +// \param value string data to store - \return \c true if the tag was modified/created successfully +// \return \c true if the tag was modified/created successfully - \sa BamAlignment::RemoveTag() - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; +// \sa BamAlignment::RemoveTag() +// \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +//*/ - // validate tag/type size & that type is OK for string value - if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING && - type.at(0) != Constants::BAM_TAG_TYPE_HEX ) - return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + value.size()]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - const unsigned int dataLength = strlen(value.c_str()); - memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 ); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) - return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1; - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) - \brief Edits a BAM tag field containing unsigned integer data. - - If \a tag does not exist, a new entry is created. +///*! \fn bool EditTag(const std::string& tag, const std::vector& values); +// \brief Edits a BAM tag field containing a numeric array. - \param tag 2-character tag name - \param type 1-character tag type (must NOT be "f", "Z", or "H") - \param value unsigned integer data to store +// If \a tag does not exist, a new entry is created. - \return \c true if the tag was modified/created successfully +// \param tag 2-character tag name +// \param value vector of uint8_t values to store - \sa BamAlignment::RemoveTag() - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; - - // validate tag/type size & that type is OK for uint32_t value - if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT || - type.at(0) == Constants::BAM_TAG_TYPE_STRING || - type.at(0) == Constants::BAM_TAG_TYPE_HEX ) - return false; +// \return \c true if the tag was modified/created successfully +// \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +//*/ - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - union { uint32_t value; char valueBuffer[sizeof(uint32_t)]; } un; - un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(uint32_t)); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) - return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(uint32_t); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} +/*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) + \internal -/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) - \brief Edits a BAM tag field containing signed integer data. + Searches for requested tag in BAM tag data. - If \a tag does not exist, a new entry is created. + \param tag requested 2-character tag name + \param pTagData pointer to current position in BamAlignment::TagData + \param tagDataLength length of BamAlignment::TagData + \param numBytesParsed number of bytes parsed so far - \param tag 2-character tag name - \param type 1-character tag type (must NOT be "f", "Z", or "H") - \param value signed integer data to store + \return \c true if found - \return \c true if the tag was modified/created successfully + \post If \a tag is found, \a pTagData will point to the byte where the tag data begins. + \a numBytesParsed will correspond to the position in the full TagData string. - \sa BamAlignment::RemoveTag() - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. */ -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) { - return EditTag(tag, type, (const uint32_t&)value); -} - -/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) - \brief Edits a BAM tag field containing floating-point data. - - If \a tag does not exist, a new entry is created. - - \param tag 2-character tag name - \param type 1-character tag type (must NOT be "Z" or "H") - \param value float data to store - - \return \c true if the tag was modified/created successfully +bool BamAlignment::FindTag(const std::string& tag, + char*& pTagData, + const unsigned int& tagDataLength, + unsigned int& numBytesParsed) +{ - \sa BamAlignment::RemoveTag() - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. -*/ -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) { - - // skip if core data not parsed - if ( SupportData.HasCoreOnly ) return false; + while ( numBytesParsed < tagDataLength ) { - // validate tag/type size & that type is OK for float value - if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING || - type.at(0) == Constants::BAM_TAG_TYPE_HEX ) - return false; + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) + // check the current tag, return true on match + if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(float); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; + + // get the storage class and find the next tag + if ( *pTagStorageType == '\0' ) return false; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; + if ( *pTagData == '\0' ) return false; } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); + + // checked all tags, none match + return false; } /*! \fn bool BamAlignment::GetEditDistance(uint32_t& editDistance) const @@ -853,6 +450,8 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons \return \c true if found */ + +// TODO : REMOVE THIS METHOD bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { return GetTag("NM", (uint32_t&)editDistance); } @@ -906,51 +505,40 @@ int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { \return \c true if found */ + +// TODO : REMOVE THIS METHOD bool BamAlignment::GetReadGroup(std::string& readGroup) const { return GetTag("RG", readGroup); } -/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const - \brief Retrieves the string value associated with a BAM tag. +///*! \fn bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const +// \brief Retrieves the string value associated with a BAM tag. - \param tag 2-character tag name - \param destination destination for retrieved value +// \param tag 2-character tag name +// \param destination destination for retrieved value - \return \c true if found -*/ -bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const { +// \return \c true if found +//*/ - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; +///*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const +// \brief Retrieves the numeric array data associated with a BAM tag - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - const unsigned int dataLength = strlen(pTagData); - destination.clear(); - destination.resize(dataLength); - memcpy( (char*)destination.data(), pTagData, dataLength ); - return true; - } - - // tag not found, return failure - return false; -} +// \param tag 2-character tag name +// \param destination destination for retrieved data -/*! \fn bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const - \brief Retrieves the unsigned integer value associated with a BAM tag. +// \return \c true if found +//*/ - \param tag 2-character tag name - \param destination destination for retrieved value +/*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const + \brief Retrieves the BAM tag type-code associated with requested tag name. + + \param tag 2-character tag name + \param type destination for the retrieved (1-character) tag type \return \c true if found + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. */ -bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { +bool BamAlignment::GetTagType(const std::string& tag, char& type) const { // make sure tag data exists if ( SupportData.HasCoreOnly || TagData.empty() ) @@ -961,188 +549,52 @@ bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - // if tag found, determine data byte-length, store data in readGroup, return success - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - - // determine data byte-length - const char type = *(pTagData - 1); - int destinationLength = 0; - switch (type) { - - // 1 byte data - case (Constants::BAM_TAG_TYPE_ASCII) : - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - destinationLength = 1; - break; - - // 2 byte data - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - destinationLength = 2; - break; - - // 4 byte data - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - destinationLength = 4; - break; - - // unsupported type for integer destination (float or var-length strings) - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_STRING) : - case (Constants::BAM_TAG_TYPE_HEX) : - fprintf(stderr, "BamAlignment ERROR: cannot store tag of type %c in integer destination\n", type); - return false; - - // unknown tag type - default: - fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", type); - return false; - } - - // store in destination - destination = 0; - memcpy(&destination, pTagData, destinationLength); - return true; - } - - // tag not found, return failure - return false; -} - -/*! \fn bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const - \brief Retrieves the signed integer value associated with a BAM tag. - - \param tag 2-character tag name - \param destination destination for retrieved value - - \return \c true if found -*/ -bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const { - return GetTag(tag, (uint32_t&)destination); -} - -/*! \fn bool BamAlignment::GetTag(const std::string& tag, float& destination) const - \brief Retrieves the floating-point value associated with a BAM tag. - - \param tag 2-character tag name - \param destination destination for retrieved value - - \return \c true if found -*/ -bool BamAlignment::GetTag(const std::string& tag, float& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) + // if tag not found, return failure + if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, determine data byte-length, store data in readGroup, return success - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - - // determine data byte-length - const char type = *(pTagData - 1); - int destinationLength = 0; - switch (type) { - - // 1 byte data - case (Constants::BAM_TAG_TYPE_ASCII) : - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - destinationLength = 1; - break; - - // 2 byte data - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - destinationLength = 2; - break; - - // 4 byte data - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - destinationLength = 4; - break; - - // unsupported type (var-length strings) - case (Constants::BAM_TAG_TYPE_STRING) : - case (Constants::BAM_TAG_TYPE_HEX) : - fprintf(stderr, "BamAlignment ERROR: cannot store tag of type %c in float destination\n", type); - return false; - - // unknown tag type - default: - fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", type); - return false; - } - - // store in destination - destination = 0.0; - memcpy(&destination, pTagData, destinationLength); - return true; + // otherwise, retrieve & validate tag type code + type = *(pTagData - 1); + switch ( type ) { + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + case (Constants::BAM_TAG_TYPE_ARRAY) : + return true; + + // unknown tag type + default: + cerr << "BamAlignment ERROR: unknown tag type encountered: " + << type << endl; + return false; } - - // tag not found, return failure - return false; } -/*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const - \brief Retrieves the BAM tag type-code associated with requested tag name. - - \param tag 2-character tag name - \param type destination for the retrieved (1-character) tag type - - \return \c true if found - - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. +/*! \fn bool BamAlignment::HasTag(const std::string& tag) const + \brief Returns true if alignment has a record for requested tag. + \param tag 2-character tag name + \return \c true if alignment has a record for tag */ -bool BamAlignment::GetTagType(const std::string& tag, char& type) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) +bool BamAlignment::HasTag(const std::string& tag) const { + + // return false if no tag data present + if ( SupportData.HasCoreOnly || TagData.empty() ) return false; - // localize the tag data + // localize the tag data for lookup char* pTagData = (char*)TagData.data(); const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - - // lookup tag - if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - - // retrieve tag type code - type = *(pTagData - 1); - - // validate that type is a proper BAM tag type - switch (type) { - case (Constants::BAM_TAG_TYPE_ASCII) : - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_STRING) : - case (Constants::BAM_TAG_TYPE_HEX) : - return true; - - // unknown tag type - default: - fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", type); - return false; - } - } - - // tag not found, return failure - return false; + + // if result of tag lookup + return FindTag(tag, pTagData, tagDataLength, numBytesParsed); } /*! \fn bool BamAlignment::IsDuplicate(void) const @@ -1222,6 +674,23 @@ bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 ); } +/*! \fn bool BamAlignment::IsValidSize(const string& tag, const string& type) const + \internal + + Checks that tag name & type strings are expected sizes. + \a tag should have length + \a type should have length 1 + + \param tag BAM tag name + \param type BAM tag type-code + + \return \c true if both \a tag and \a type are correct sizes +*/ +bool BamAlignment::IsValidSize(const string& tag, const string& type) { + return (tag.size() == Constants::BAM_TAG_TAGSIZE) && + (type.size() == Constants::BAM_TAG_TYPESIZE); +} + /*! \fn bool BamAlignment::RemoveTag(const std::string& tag) \brief Removes field from BAM tags. @@ -1229,9 +698,12 @@ bool BamAlignment::IsSecondMate(void) const { */ bool BamAlignment::RemoveTag(const std::string& tag) { - // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed - // also, return false if no data present to remove - if ( SupportData.HasCoreOnly || TagData.empty() ) + // if char data not populated, do that first + if ( SupportData.HasCoreOnly ) + BuildCharData(); + + // skip if no tags available + if ( TagData.empty() ) return false; // localize the tag data @@ -1240,38 +712,39 @@ bool BamAlignment::RemoveTag(const std::string& tag) { const unsigned int originalTagDataLength = TagData.size(); unsigned int newTagDataLength = 0; unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - char newTagData[originalTagDataLength]; - - // copy original tag data up til desired tag - pTagData -= 3; - numBytesParsed -= 3; - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) - return true; - - // copy everything from current tag (the next one after tag for removal) to end + + // if tag not found, simply return true + if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) + return true; + + // otherwise, remove it + char* newTagData = new char[originalTagDataLength]; + + // copy original tag data up til desired tag + pTagData -= 3; + numBytesParsed -= 3; + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // attemp to skip to next tag + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + if ( SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) { + + // squeeze remaining tag data const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); - - // save new tag data + + // save modified tag data in alignment TagData.assign(newTagData, beginningTagDataLength + endTagDataLength); - return true; } - - // tag not found, no removal - return failure - return false; + + // clean up & return success + delete[] newTagData; + return true; } /*! \fn void BamAlignment::SetIsDuplicate(bool ok) @@ -1388,3 +861,105 @@ void BamAlignment::SetIsSecondMate(bool ok) { void BamAlignment::SetIsUnmapped(bool ok) { SetIsMapped(!ok); } + +/*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) + \internal + + Moves to next available tag in tag data string + + \param storageType BAM tag type-code that determines how far to move cursor + \param pTagData pointer to current position (cursor) in tag string + \param numBytesParsed report of how many bytes were parsed (cumulatively) + + \return \c if storageType was a recognized BAM tag type + \post \a pTagData will point to the byte where the next tag data begins. + \a numBytesParsed will correspond to the cursor's position in the full TagData string. +*/ +bool BamAlignment::SkipToNextTag(const char storageType, + char*& pTagData, + unsigned int& numBytesParsed) +{ + switch (storageType) { + + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + ++numBytesParsed; + ++pTagData; + break; + + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + numBytesParsed += sizeof(uint16_t); + pTagData += sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + numBytesParsed += sizeof(uint32_t); + pTagData += sizeof(uint32_t); + break; + + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + while( *pTagData ) { + ++numBytesParsed; + ++pTagData; + } + // increment for null-terminator + ++numBytesParsed; + ++pTagData; + break; + + case (Constants::BAM_TAG_TYPE_ARRAY) : + + { + // read array type + const char arrayType = *pTagData; + ++numBytesParsed; + ++pTagData; + + // read number of elements + int32_t numElements; + memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped if necessary + numBytesParsed += sizeof(uint32_t); + pTagData += sizeof(uint32_t); + + // calculate number of bytes to skip + int bytesToSkip = 0; + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + bytesToSkip = numElements; + break; + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + bytesToSkip = numElements*sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + bytesToSkip = numElements*sizeof(uint32_t); + break; + default: + cerr << "BamAlignment ERROR: unknown binary array type encountered: " + << arrayType << endl; + return false; + } + + // skip binary array contents + numBytesParsed += bytesToSkip; + pTagData += bytesToSkip; + break; + } + + default: + cerr << "BamAlignment ERROR: unknown tag type encountered" + << storageType << endl; + return false; + } + + // return success + return true; +}