// ***************************************************************************
// BamAlignment.cpp (c) 2009 Derek Barnett
// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 21 March 2011 (DB)
+// Last modified: 4 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides the BamAlignment data structure
// ***************************************************************************
#include <cstdlib>
#include <cstring>
#include <exception>
+#include <iostream>
#include <map>
#include <utility>
using namespace std;
-// internal utility methods
-namespace BamTools {
-namespace Internal {
-
-/*! \fn bool IsValidSize(const string& tag, const string& type)
- \internal
-
- Checks that tag name & type strings are expected sizes.
- \a tag should have length
- \a type should have length 1
-
- \param tag BAM tag name
- \param type BAM tag type-code
-
- \return \c true if both \a tag and \a type are correct sizes
-*/
-bool IsValidSize(const string& tag, const string& type) {
- return (tag.size() == Constants::BAM_TAG_TAGSIZE) &&
- (type.size() == Constants::BAM_TAG_TYPESIZE);
-}
-
-/*! \fn bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed)
- \internal
-
- Moves to next available tag in tag data string
-
- \param storageType BAM tag type-code that determines how far to move cursor
- \param pTagData pointer to current position (cursor) in tag string
- \param numBytesParsed report of how many bytes were parsed (cumulatively)
-
- \return \c if storageType was a recognized BAM tag type
- \post \a pTagData will point to the byte where the next tag data begins.
- \a numBytesParsed will correspond to the cursor's position in the full TagData string.
-*/
-bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
-
- switch (storageType) {
-
- case (Constants::BAM_TAG_TYPE_ASCII) :
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- ++numBytesParsed;
- ++pTagData;
- break;
-
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- numBytesParsed += 2;
- pTagData += 2;
- break;
-
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- numBytesParsed += 4;
- pTagData += 4;
- break;
-
- case (Constants::BAM_TAG_TYPE_STRING) :
- case (Constants::BAM_TAG_TYPE_HEX) :
- while(*pTagData) {
- ++numBytesParsed;
- ++pTagData;
- }
- // increment for null-terminator
- ++numBytesParsed;
- ++pTagData;
- break;
-
- default:
- // error case
- fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", storageType);
- return false;
- }
-
- // return success
- return true;
-}
-
-/*! \fn bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed)
- \internal
-
- Searches for requested tag in BAM tag data.
-
- \param tag requested 2-character tag name
- \param pTagData pointer to current position in BamAlignment::TagData
- \param tagDataLength length of BamAlignment::TagData
- \param numBytesParsed number of bytes parsed so far
-
- \return \c true if found
-
- \post If \a tag is found, \a pTagData will point to the byte where the tag data begins.
- \a numBytesParsed will correspond to the position in the full TagData string.
-
-*/
-bool FindTag(const std::string& tag,
- char* &pTagData,
- const unsigned int& tagDataLength,
- unsigned int& numBytesParsed)
-{
-
- while ( numBytesParsed < tagDataLength ) {
-
- const char* pTagType = pTagData;
- const char* pTagStorageType = pTagData + 2;
- pTagData += 3;
- numBytesParsed += 3;
-
- // check the current tag, return true on match
- if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
- return true;
-
- // get the storage class and find the next tag
- if ( *pTagStorageType == '\0' ) return false;
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
- if ( *pTagData == '\0' ) return false;
- }
-
- // checked all tags, none match
- return false;
-}
-
-} // namespace Internal
-} // namespace BamTools
-
/*! \class BamTools::BamAlignment
\brief The main BAM alignment data structure.
*/
BamAlignment::~BamAlignment(void) { }
-/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value)
- \brief Adds a field with string data to the BAM tags.
-
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
-
- \param tag 2-character tag name
- \param type 1-character tag type (must be "Z" or "H")
- \param value string data to store
-
- \return \c true if the \b new tag was added successfully
-
- \sa http://samtools.sourceforge.net/SAM-1.3.pdf
- for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) {
-
- // skip if core data not parsed
- if ( SupportData.HasCoreOnly ) return false;
-
- // validate tag/type size & that type is OK for string value
- if ( !Internal::IsValidSize(tag, type) ) return false;
- if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING &&
- type.at(0) != Constants::BAM_TAG_TYPE_HEX )
- return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // otherwise, copy tag data to temp buffer
- string newTag = tag + type + value;
- const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
-
- // append newTag
- strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
+///*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value)
+// \brief Adds a field with string data to the BAM tags.
-/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value)
- \brief Adds a field with unsigned integer data to the BAM tags.
+// Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+// \param[in] tag 2-character tag name
+// \param[in] type 1-character tag type (must be "Z" or "H")
+// \param[in] value string data to store
+// \return \c true if the \b new tag was added successfully
+// \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+//*/
- \param tag 2-character tag name
- \param type 1-character tag type (must NOT be "f", "Z", or "H")
- \param value unsigned int data to store
- \return \c true if the \b new tag was added successfully
- \sa http://samtools.sourceforge.net/SAM-1.3.pdf
- for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) {
-
- // skip if core data not parsed
- if ( SupportData.HasCoreOnly ) return false;
-
- // validate tag/type size & that type is OK for uint32_t value
- if ( !Internal::IsValidSize(tag, type) ) return false;
- if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT ||
- type.at(0) == Constants::BAM_TAG_TYPE_STRING ||
- type.at(0) == Constants::BAM_TAG_TYPE_HEX )
- return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // otherwise, convert value to string
- union { uint32_t value; char valueBuffer[sizeof(uint32_t)]; } un;
- un.value = value;
-
- // copy original tag data to temp buffer
- string newTag = tag + type;
- const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
-
- // append newTag
- strcat(originalTagData + tagDataLength, newTag.data());
- memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(uint32_t));
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
-
-/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value)
- \brief Adds a field with signed integer data to the BAM tags.
-
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+///*! \fn bool AddTag(const std::string& tag, const std::vector<uint8_t>& values);
+// \brief Adds a numeric array field to the BAM tags.
- \param tag 2-character tag name
- \param type 1-character tag type (must NOT be "f", "Z", or "H")
- \param value signed int data to store
+// Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
- \return \c true if the \b new tag was added successfully
-
- \sa http://samtools.sourceforge.net/SAM-1.3.pdf
- for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) {
- return AddTag(tag, type, (const uint32_t&)value);
-}
-
-/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value)
- \brief Adds a field with floating-point data to the BAM tags.
-
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
-
- \param tag 2-character tag name
- \param type 1-character tag type (must NOT be "Z" or "H")
- \param value float data to store
+// \param tag 2-character tag name
+// \param values vector of uint8_t values to store
- \return \c true if the \b new tag was added successfully
-
- \sa http://samtools.sourceforge.net/SAM-1.3.pdf
- for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) {
-
- // skip if core data not parsed
- if ( SupportData.HasCoreOnly ) return false;
-
- // validate tag/type size & that type is OK for float value
- if ( !Internal::IsValidSize(tag, type) ) return false;
- if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING ||
- type.at(0) == Constants::BAM_TAG_TYPE_HEX )
- return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // otherwise, convert value to string
- union { float value; char valueBuffer[sizeof(float)]; } un;
- un.value = value;
-
- // copy original tag data to temp buffer
- string newTag = tag + type;
- const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
-
- // append newTag
- strcat(originalTagData + tagDataLength, newTag.data());
- memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float));
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
+// \return \c true if the \b new tag was added successfully
+// \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+//*/
/*! \fn bool BamAlignment::BuildCharData(void)
\brief Populates alignment string fields (read name, bases, qualities, tag data).
switch (op.Type) {
- // for 'M', 'I' - write bases
- case (Constants::BAM_CIGAR_MATCH_CHAR) :
- case (Constants::BAM_CIGAR_INS_CHAR) :
+ // for 'M', 'I', '=', 'X' - write bases
+ case (Constants::BAM_CIGAR_MATCH_CHAR) :
+ case (Constants::BAM_CIGAR_INS_CHAR) :
+ case (Constants::BAM_CIGAR_SEQMATCH_CHAR) :
+ case (Constants::BAM_CIGAR_MISMATCH_CHAR) :
AlignedBases.append(QueryBases.substr(k, op.Length));
// fall through
// shouldn't get here
default:
- fprintf(stderr, "BamAlignment ERROR: invalid CIGAR operation type: %c\n", op.Type);
+ cerr << "BamAlignment ERROR: invalid CIGAR operation type: "
+ << op.Type << endl;
exit(1);
}
}
case(Constants::BAM_TAG_TYPE_ASCII) :
case(Constants::BAM_TAG_TYPE_INT8) :
case(Constants::BAM_TAG_TYPE_UINT8) :
+ // no endian swapping necessary for single-byte data
++i;
break;
case(Constants::BAM_TAG_TYPE_HEX) :
case(Constants::BAM_TAG_TYPE_STRING) :
// no endian swapping necessary for hex-string/string data
- while (tagData[i]) { ++i; }
+ while ( tagData[i] )
+ ++i;
// increment one more for null terminator
++i;
break;
+ case(Constants::BAM_TAG_TYPE_ARRAY) :
+
+ {
+ // read array type
+ const char arrayType = tagData[i];
+ ++i;
+
+ // swap endian-ness of number of elements in place, then retrieve for loop
+ BamTools::SwapEndian_32p(&tagData[i]);
+ int32_t numElements;
+ memcpy(&numElements, &tagData[i], sizeof(uint32_t));
+ i += sizeof(uint32_t);
+
+ // swap endian-ness of array elements
+ for ( int j = 0; j < numElements; ++j ) {
+ switch (arrayType) {
+ case (Constants::BAM_TAG_TYPE_INT8) :
+ case (Constants::BAM_TAG_TYPE_UINT8) :
+ // no endian-swapping necessary
+ ++i;
+ break;
+ case (Constants::BAM_TAG_TYPE_INT16) :
+ case (Constants::BAM_TAG_TYPE_UINT16) :
+ BamTools::SwapEndian_16p(&tagData[i]);
+ i += sizeof(uint16_t);
+ break;
+ case (Constants::BAM_TAG_TYPE_FLOAT) :
+ case (Constants::BAM_TAG_TYPE_INT32) :
+ case (Constants::BAM_TAG_TYPE_UINT32) :
+ BamTools::SwapEndian_32p(&tagData[i]);
+ i += sizeof(uint32_t);
+ break;
+ default:
+ // error case
+ cerr << "BamAlignment ERROR: unknown binary array type encountered: "
+ << arrayType << endl;
+ return false;
+ }
+ }
+
+ break;
+ }
+
// shouldn't get here
default :
- fprintf(stderr, "BamAlignment ERROR: invalid tag value type: %c\n", type);
+ cerr << "BamAlignment ERROR: invalid tag value type: "
+ << type << endl;
exit(1);
}
}
return true;
}
-/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value)
- \brief Edits a BAM tag field containing string data.
+///*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value)
+// \brief Edits a BAM tag field containing string data.
- If \a tag does not exist, a new entry is created.
+// If \a tag does not exist, a new entry is created.
- \param tag 2-character tag name
- \param type 1-character tag type (must be "Z" or "H")
- \param value string data to store
+// \param tag 2-character tag name
+// \param type 1-character tag type (must be "Z" or "H")
+// \param value string data to store
- \return \c true if the tag was modified/created successfully
+// \return \c true if the tag was modified/created successfully
- \sa BamAlignment::RemoveTag()
- \sa http://samtools.sourceforge.net/SAM-1.3.pdf
- for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) {
-
- // skip if core data not parsed
- if ( SupportData.HasCoreOnly ) return false;
+// \sa BamAlignment::RemoveTag()
+// \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+//*/
- // validate tag/type size & that type is OK for string value
- if ( !Internal::IsValidSize(tag, type) ) return false;
- if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING &&
- type.at(0) != Constants::BAM_TAG_TYPE_HEX )
- return false;
-
- // localize the tag data
- char* pOriginalTagData = (char*)TagData.data();
- char* pTagData = pOriginalTagData;
- const unsigned int originalTagDataLength = TagData.size();
-
- unsigned int newTagDataLength = 0;
- unsigned int numBytesParsed = 0;
-
- // if tag found, store data in readGroup, return success
- if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
-
- // make sure array is more than big enough
- char newTagData[originalTagDataLength + value.size()];
-
- // copy original tag data up til desired tag
- const unsigned int beginningTagDataLength = numBytesParsed;
- newTagDataLength += beginningTagDataLength;
- memcpy(newTagData, pOriginalTagData, numBytesParsed);
-
- // copy new VALUE in place of current tag data
- const unsigned int dataLength = strlen(value.c_str());
- memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 );
-
- // skip to next tag (if tag for removal is last, return true)
- const char* pTagStorageType = pTagData - 1;
- if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) )
- return true;
-
- // copy everything from current tag (the next one after tag for removal) to end
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
- const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1;
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
-
- // ensure null-terminator
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
-
- // save new tag data
- TagData.assign(newTagData, endTagOffset + endTagDataLength);
- return true;
- }
-
- // tag not found, attempt AddTag
- else return AddTag(tag, type, value);
-}
-
-/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value)
- \brief Edits a BAM tag field containing unsigned integer data.
-
- If \a tag does not exist, a new entry is created.
+///*! \fn bool EditTag(const std::string& tag, const std::vector<uint8_t>& values);
+// \brief Edits a BAM tag field containing a numeric array.
- \param tag 2-character tag name
- \param type 1-character tag type (must NOT be "f", "Z", or "H")
- \param value unsigned integer data to store
+// If \a tag does not exist, a new entry is created.
- \return \c true if the tag was modified/created successfully
+// \param tag 2-character tag name
+// \param values vector of uint8_t values to store
- \sa BamAlignment::RemoveTag()
- \sa http://samtools.sourceforge.net/SAM-1.3.pdf
- for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) {
-
- // skip if core data not parsed
- if ( SupportData.HasCoreOnly ) return false;
-
- // validate tag/type size & that type is OK for uint32_t value
- if ( !Internal::IsValidSize(tag, type) ) return false;
- if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT ||
- type.at(0) == Constants::BAM_TAG_TYPE_STRING ||
- type.at(0) == Constants::BAM_TAG_TYPE_HEX )
- return false;
+// \return \c true if the tag was modified/created successfully
+// \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+//*/
- // localize the tag data
- char* pOriginalTagData = (char*)TagData.data();
- char* pTagData = pOriginalTagData;
- const unsigned int originalTagDataLength = TagData.size();
-
- unsigned int newTagDataLength = 0;
- unsigned int numBytesParsed = 0;
-
- // if tag found, store data in readGroup, return success
- if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
-
- // make sure array is more than big enough
- char newTagData[originalTagDataLength + sizeof(value)];
-
- // copy original tag data up til desired tag
- const unsigned int beginningTagDataLength = numBytesParsed;
- newTagDataLength += beginningTagDataLength;
- memcpy(newTagData, pOriginalTagData, numBytesParsed);
-
- // copy new VALUE in place of current tag data
- union { uint32_t value; char valueBuffer[sizeof(uint32_t)]; } un;
- un.value = value;
- memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(uint32_t));
-
- // skip to next tag (if tag for removal is last, return true)
- const char* pTagStorageType = pTagData - 1;
- if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) )
- return true;
-
- // copy everything from current tag (the next one after tag for removal) to end
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
- const unsigned int endTagOffset = beginningTagDataLength + sizeof(uint32_t);
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
-
- // ensure null-terminator
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
-
- // save new tag data
- TagData.assign(newTagData, endTagOffset + endTagDataLength);
- return true;
- }
-
- // tag not found, attempt AddTag
- else return AddTag(tag, type, value);
-}
+/*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed)
+ \internal
-/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value)
- \brief Edits a BAM tag field containing signed integer data.
+ Searches for requested tag in BAM tag data.
- If \a tag does not exist, a new entry is created.
+ \param tag requested 2-character tag name
+ \param pTagData pointer to current position in BamAlignment::TagData
+ \param tagDataLength length of BamAlignment::TagData
+ \param numBytesParsed number of bytes parsed so far
- \param tag 2-character tag name
- \param type 1-character tag type (must NOT be "f", "Z", or "H")
- \param value signed integer data to store
+ \return \c true if found
- \return \c true if the tag was modified/created successfully
+ \post If \a tag is found, \a pTagData will point to the byte where the tag data begins.
+ \a numBytesParsed will correspond to the position in the full TagData string.
- \sa BamAlignment::RemoveTag()
- \sa http://samtools.sourceforge.net/SAM-1.3.pdf
- for more details on reserved tag names, supported tag types, etc.
*/
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) {
- return EditTag(tag, type, (const uint32_t&)value);
-}
-
-/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value)
- \brief Edits a BAM tag field containing floating-point data.
-
- If \a tag does not exist, a new entry is created.
-
- \param tag 2-character tag name
- \param type 1-character tag type (must NOT be "Z" or "H")
- \param value float data to store
-
- \return \c true if the tag was modified/created successfully
+bool BamAlignment::FindTag(const std::string& tag,
+ char*& pTagData,
+ const unsigned int& tagDataLength,
+ unsigned int& numBytesParsed)
+{
- \sa BamAlignment::RemoveTag()
- \sa http://samtools.sourceforge.net/SAM-1.3.pdf
- for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) {
-
- // skip if core data not parsed
- if ( SupportData.HasCoreOnly ) return false;
+ while ( numBytesParsed < tagDataLength ) {
- // validate tag/type size & that type is OK for float value
- if ( !Internal::IsValidSize(tag, type) ) return false;
- if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING ||
- type.at(0) == Constants::BAM_TAG_TYPE_HEX )
- return false;
+ const char* pTagType = pTagData;
+ const char* pTagStorageType = pTagData + 2;
+ pTagData += 3;
+ numBytesParsed += 3;
- // localize the tag data
- char* pOriginalTagData = (char*)TagData.data();
- char* pTagData = pOriginalTagData;
- const unsigned int originalTagDataLength = TagData.size();
-
- unsigned int newTagDataLength = 0;
- unsigned int numBytesParsed = 0;
-
- // if tag found, store data in readGroup, return success
- if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
-
- // make sure array is more than big enough
- char newTagData[originalTagDataLength + sizeof(value)];
-
- // copy original tag data up til desired tag
- const unsigned int beginningTagDataLength = numBytesParsed;
- newTagDataLength += beginningTagDataLength;
- memcpy(newTagData, pOriginalTagData, numBytesParsed);
-
- // copy new VALUE in place of current tag data
- union { float value; char valueBuffer[sizeof(float)]; } un;
- un.value = value;
- memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float));
-
- // skip to next tag (if tag for removal is last, return true)
- const char* pTagStorageType = pTagData - 1;
- if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) )
+ // check the current tag, return true on match
+ if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
return true;
-
- // copy everything from current tag (the next one after tag for removal) to end
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
- const unsigned int endTagOffset = beginningTagDataLength + sizeof(float);
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
-
- // ensure null-terminator
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
-
- // save new tag data
- TagData.assign(newTagData, endTagOffset + endTagDataLength);
- return true;
+
+ // get the storage class and find the next tag
+ if ( *pTagStorageType == '\0' ) return false;
+ if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
+ if ( *pTagData == '\0' ) return false;
}
-
- // tag not found, attempt AddTag
- else return AddTag(tag, type, value);
+
+ // checked all tags, none match
+ return false;
}
/*! \fn bool BamAlignment::GetEditDistance(uint32_t& editDistance) const
\return \c true if found
*/
+
+// TODO : REMOVE THIS METHOD
bool BamAlignment::GetEditDistance(uint32_t& editDistance) const {
return GetTag("NM", (uint32_t&)editDistance);
}
\return \c true if found
*/
+
+// TODO : REMOVE THIS METHOD
bool BamAlignment::GetReadGroup(std::string& readGroup) const {
return GetTag("RG", readGroup);
}
-/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const
- \brief Retrieves the string value associated with a BAM tag.
+///*! \fn bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const
+// \brief Retrieves the string value associated with a BAM tag.
- \param tag 2-character tag name
- \param destination destination for retrieved value
+// \param tag 2-character tag name
+// \param destination destination for retrieved value
- \return \c true if found
-*/
-bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const {
+// \return \c true if found
+//*/
- // make sure tag data exists
- if ( SupportData.HasCoreOnly || TagData.empty() )
- return false;
+///*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector<uint32_t>& destination) const
+// \brief Retrieves the numeric array data associated with a BAM tag
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag found, store data in readGroup, return success
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
- const unsigned int dataLength = strlen(pTagData);
- destination.clear();
- destination.resize(dataLength);
- memcpy( (char*)destination.data(), pTagData, dataLength );
- return true;
- }
-
- // tag not found, return failure
- return false;
-}
+// \param tag 2-character tag name
+// \param destination destination for retrieved data
-/*! \fn bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const
- \brief Retrieves the unsigned integer value associated with a BAM tag.
+// \return \c true if found
+//*/
- \param tag 2-character tag name
- \param destination destination for retrieved value
+/*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const
+ \brief Retrieves the BAM tag type-code associated with requested tag name.
+
+ \param tag 2-character tag name
+ \param type destination for the retrieved (1-character) tag type
\return \c true if found
+ \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
*/
-bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const {
+bool BamAlignment::GetTagType(const std::string& tag, char& type) const {
// make sure tag data exists
if ( SupportData.HasCoreOnly || TagData.empty() )
const unsigned int tagDataLength = TagData.size();
unsigned int numBytesParsed = 0;
- // if tag found, determine data byte-length, store data in readGroup, return success
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
-
- // determine data byte-length
- const char type = *(pTagData - 1);
- int destinationLength = 0;
- switch (type) {
-
- // 1 byte data
- case (Constants::BAM_TAG_TYPE_ASCII) :
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- destinationLength = 1;
- break;
-
- // 2 byte data
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- destinationLength = 2;
- break;
-
- // 4 byte data
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- destinationLength = 4;
- break;
-
- // unsupported type for integer destination (float or var-length strings)
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_STRING) :
- case (Constants::BAM_TAG_TYPE_HEX) :
- fprintf(stderr, "BamAlignment ERROR: cannot store tag of type %c in integer destination\n", type);
- return false;
-
- // unknown tag type
- default:
- fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", type);
- return false;
- }
-
- // store in destination
- destination = 0;
- memcpy(&destination, pTagData, destinationLength);
- return true;
- }
-
- // tag not found, return failure
- return false;
-}
-
-/*! \fn bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const
- \brief Retrieves the signed integer value associated with a BAM tag.
-
- \param tag 2-character tag name
- \param destination destination for retrieved value
-
- \return \c true if found
-*/
-bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const {
- return GetTag(tag, (uint32_t&)destination);
-}
-
-/*! \fn bool BamAlignment::GetTag(const std::string& tag, float& destination) const
- \brief Retrieves the floating-point value associated with a BAM tag.
-
- \param tag 2-character tag name
- \param destination destination for retrieved value
-
- \return \c true if found
-*/
-bool BamAlignment::GetTag(const std::string& tag, float& destination) const {
-
- // make sure tag data exists
- if ( SupportData.HasCoreOnly || TagData.empty() )
+ // if tag not found, return failure
+ if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
return false;
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag found, determine data byte-length, store data in readGroup, return success
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
-
- // determine data byte-length
- const char type = *(pTagData - 1);
- int destinationLength = 0;
- switch (type) {
-
- // 1 byte data
- case (Constants::BAM_TAG_TYPE_ASCII) :
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- destinationLength = 1;
- break;
-
- // 2 byte data
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- destinationLength = 2;
- break;
-
- // 4 byte data
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- destinationLength = 4;
- break;
-
- // unsupported type (var-length strings)
- case (Constants::BAM_TAG_TYPE_STRING) :
- case (Constants::BAM_TAG_TYPE_HEX) :
- fprintf(stderr, "BamAlignment ERROR: cannot store tag of type %c in float destination\n", type);
- return false;
-
- // unknown tag type
- default:
- fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", type);
- return false;
- }
-
- // store in destination
- destination = 0.0;
- memcpy(&destination, pTagData, destinationLength);
- return true;
+ // otherwise, retrieve tag type code (read from the byte immediately preceding the cursor left by FindTag) & validate it
+ type = *(pTagData - 1);
+ switch ( type ) {
+ case (Constants::BAM_TAG_TYPE_ASCII) :
+ case (Constants::BAM_TAG_TYPE_INT8) :
+ case (Constants::BAM_TAG_TYPE_UINT8) :
+ case (Constants::BAM_TAG_TYPE_INT16) :
+ case (Constants::BAM_TAG_TYPE_UINT16) :
+ case (Constants::BAM_TAG_TYPE_INT32) :
+ case (Constants::BAM_TAG_TYPE_UINT32) :
+ case (Constants::BAM_TAG_TYPE_FLOAT) :
+ case (Constants::BAM_TAG_TYPE_STRING) :
+ case (Constants::BAM_TAG_TYPE_HEX) :
+ case (Constants::BAM_TAG_TYPE_ARRAY) :
+ return true;
+
+ // unknown tag type: report it on cerr & return failure (type still holds the unrecognized code)
+ default:
+ cerr << "BamAlignment ERROR: unknown tag type encountered: "
+ << type << endl;
+ return false;
}
-
- // tag not found, return failure
- return false;
}
-/*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const
- \brief Retrieves the BAM tag type-code associated with requested tag name.
-
- \param tag 2-character tag name
- \param type destination for the retrieved (1-character) tag type
-
- \return \c true if found
-
- \sa http://samtools.sourceforge.net/SAM-1.3.pdf
- for more details on reserved tag names, supported tag types, etc.
+/*! \fn bool BamAlignment::HasTag(const std::string& tag) const
+ \brief Returns true if alignment has a record for requested tag.
+ \param tag 2-character tag name
+ \return \c true if alignment has a record for tag
*/
-bool BamAlignment::GetTagType(const std::string& tag, char& type) const {
-
- // make sure tag data exists
- if ( SupportData.HasCoreOnly || TagData.empty() )
+bool BamAlignment::HasTag(const std::string& tag) const {
+
+ // return false if no tag data present (alignment holds core data only, or its tag data is empty)
+ if ( SupportData.HasCoreOnly || TagData.empty() )
return false;
- // localize the tag data
+ // localize the tag data for lookup
char* pTagData = (char*)TagData.data();
const unsigned int tagDataLength = TagData.size();
unsigned int numBytesParsed = 0;
-
- // lookup tag
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
-
- // retrieve tag type code
- type = *(pTagData - 1);
-
- // validate that type is a proper BAM tag type
- switch (type) {
- case (Constants::BAM_TAG_TYPE_ASCII) :
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_STRING) :
- case (Constants::BAM_TAG_TYPE_HEX) :
- return true;
-
- // unknown tag type
- default:
- fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", type);
- return false;
- }
- }
-
- // tag not found, return failure
- return false;
+
+ // return the result of the tag lookup (the cursor outputs of FindTag are not needed here)
+ return FindTag(tag, pTagData, tagDataLength, numBytesParsed);
}
/*! \fn bool BamAlignment::IsDuplicate(void) const
return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 );
}
+/*! \fn bool BamAlignment::IsValidSize(const string& tag, const string& type)
+    \internal
+
+    Verifies that a tag name and a tag type-code string have the expected lengths.
+    \a tag must be exactly Constants::BAM_TAG_TAGSIZE characters long (a 2-character tag name)
+    \a type must be exactly Constants::BAM_TAG_TYPESIZE characters long (a 1-character type-code)
+
+    \param tag  BAM tag name
+    \param type BAM tag type-code
+
+    \return \c true only if both \a tag and \a type have the expected sizes
+*/
+bool BamAlignment::IsValidSize(const string& tag, const string& type) {
+
+    // a wrongly-sized tag name settles the answer; the type-code need not be inspected
+    if ( tag.size() != Constants::BAM_TAG_TAGSIZE )
+        return false;
+
+    // tag name is fine, so the outcome depends on the type-code length alone
+    return ( type.size() == Constants::BAM_TAG_TYPESIZE );
+}
+
+
/*! \fn bool BamAlignment::RemoveTag(const std::string& tag)
\brief Removes field from BAM tags.
*/
bool BamAlignment::RemoveTag(const std::string& tag) {
- // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed
- // also, return false if no data present to remove
- if ( SupportData.HasCoreOnly || TagData.empty() )
+ // if char data not populated, do that first
+ if ( SupportData.HasCoreOnly )
+ BuildCharData();
+
+ // skip if no tags available
+ if ( TagData.empty() )
return false;
// localize the tag data
const unsigned int originalTagDataLength = TagData.size();
unsigned int newTagDataLength = 0;
unsigned int numBytesParsed = 0;
-
- // if tag found, store data in readGroup, return success
- if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
-
- char newTagData[originalTagDataLength];
-
- // copy original tag data up til desired tag
- pTagData -= 3;
- numBytesParsed -= 3;
- const unsigned int beginningTagDataLength = numBytesParsed;
- newTagDataLength += beginningTagDataLength;
- memcpy(newTagData, pOriginalTagData, numBytesParsed);
-
- // skip to next tag (if tag for removal is last, return true)
- const char* pTagStorageType = pTagData + 2;
- pTagData += 3;
- numBytesParsed += 3;
- if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) )
- return true;
-
- // copy everything from current tag (the next one after tag for removal) to end
+
+ // if tag not found, simply return true
+ if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+ return true;
+
+ // otherwise, remove it: build the trimmed tag data in a temporary heap buffer (released below)
+ char* newTagData = new char[originalTagDataLength];
+
+ // copy original tag data up until the desired tag (step back 3 bytes so the cursor sits on the tag's name rather than its data)
+ pTagData -= 3;
+ numBytesParsed -= 3;
+ const unsigned int beginningTagDataLength = numBytesParsed;
+ newTagDataLength += beginningTagDataLength;
+ memcpy(newTagData, pOriginalTagData, numBytesParsed);
+
+ // attempt to skip to next tag (type code is the byte at offset 2 from the tag's start)
+ const char* pTagStorageType = pTagData + 2;
+ pTagData += 3;
+ numBytesParsed += 3;
+ if ( SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) {
+
+ // squeeze remaining tag data: append everything that follows the removed tag
const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength );
-
- // save new tag data
+
+ // save modified tag data in alignment
TagData.assign(newTagData, beginningTagDataLength + endTagDataLength);
- return true;
}
-
- // tag not found, no removal - return failure
- return false;
+
+ // clean up & return success (also reached, with TagData untouched, when the skip above failed)
+ delete[] newTagData;
+ return true;
}
/*! \fn void BamAlignment::SetIsDuplicate(bool ok)
void BamAlignment::SetIsUnmapped(bool ok) {
SetIsMapped(!ok);
}
+
+/*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed)
+    \internal
+
+    Advances the cursor past the data of one tag, using the tag's type-code to decide how many bytes to skip.
+
+    \param storageType    BAM tag type-code of the data at the cursor
+    \param pTagData       pointer to current position (cursor) in tag data
+    \param numBytesParsed running count of bytes parsed so far; advanced in step with \a pTagData
+
+    \return \c true if \a storageType was a recognized BAM tag type and its data was skipped;
+            \c false (after reporting on cerr) for an unknown tag type, an unknown array element type,
+            or a negative array element count
+
+    \post On success, \a pTagData and \a numBytesParsed have both been advanced by the size of the skipped data.
+          On failure inside an array tag, both have already been moved past the array's element-type & count fields.
+
+    \note The length of the underlying buffer is not passed in, so no bounds checking is done here.
+*/
+bool BamAlignment::SkipToNextTag(const char storageType,
+                                 char*& pTagData,
+                                 unsigned int& numBytesParsed)
+{
+    switch (storageType) {
+
+        // 1-byte values
+        case (Constants::BAM_TAG_TYPE_ASCII) :
+        case (Constants::BAM_TAG_TYPE_INT8)  :
+        case (Constants::BAM_TAG_TYPE_UINT8) :
+            ++numBytesParsed;
+            ++pTagData;
+            break;
+
+        // 2-byte values
+        case (Constants::BAM_TAG_TYPE_INT16)  :
+        case (Constants::BAM_TAG_TYPE_UINT16) :
+            numBytesParsed += sizeof(uint16_t);
+            pTagData       += sizeof(uint16_t);
+            break;
+
+        // 4-byte values
+        case (Constants::BAM_TAG_TYPE_FLOAT)  :
+        case (Constants::BAM_TAG_TYPE_INT32)  :
+        case (Constants::BAM_TAG_TYPE_UINT32) :
+            numBytesParsed += sizeof(uint32_t);
+            pTagData       += sizeof(uint32_t);
+            break;
+
+        // null-terminated, variable-length values
+        case (Constants::BAM_TAG_TYPE_STRING) :
+        case (Constants::BAM_TAG_TYPE_HEX)    :
+            while ( *pTagData ) {
+                ++numBytesParsed;
+                ++pTagData;
+            }
+            // increment for null-terminator
+            ++numBytesParsed;
+            ++pTagData;
+            break;
+
+        // binary array: 1-byte element type, 4-byte element count, then the elements
+        case (Constants::BAM_TAG_TYPE_ARRAY) :
+        {
+            // read array (element) type
+            const char arrayType = *pTagData;
+            ++numBytesParsed;
+            ++pTagData;
+
+            // read number of elements
+            int32_t numElements;
+            memcpy(&numElements, pTagData, sizeof(int32_t)); // already endian-swapped if necessary
+            numBytesParsed += sizeof(int32_t);
+            pTagData       += sizeof(int32_t);
+
+            // determine size of a single element
+            size_t bytesPerElement = 0;
+            switch (arrayType) {
+                case (Constants::BAM_TAG_TYPE_INT8)  :
+                case (Constants::BAM_TAG_TYPE_UINT8) :
+                    bytesPerElement = 1;
+                    break;
+                case (Constants::BAM_TAG_TYPE_INT16)  :
+                case (Constants::BAM_TAG_TYPE_UINT16) :
+                    bytesPerElement = sizeof(uint16_t);
+                    break;
+                case (Constants::BAM_TAG_TYPE_FLOAT)  :
+                case (Constants::BAM_TAG_TYPE_INT32)  :
+                case (Constants::BAM_TAG_TYPE_UINT32) :
+                    bytesPerElement = sizeof(uint32_t);
+                    break;
+                default:
+                    cerr << "BamAlignment ERROR: unknown binary array type encountered: "
+                         << arrayType << endl;
+                    return false;
+            }
+
+            // a negative count would move the cursor backwards, so treat it as corrupt data
+            if ( numElements < 0 ) {
+                cerr << "BamAlignment ERROR: invalid binary array length encountered: "
+                     << numElements << endl;
+                return false;
+            }
+
+            // skip binary array contents (byte count kept unsigned, never funnelled through a signed int)
+            const size_t bytesToSkip = static_cast<size_t>(numElements) * bytesPerElement;
+            numBytesParsed += static_cast<unsigned int>(bytesToSkip);
+            pTagData       += bytesToSkip;
+            break;
+        }
+
+        // unknown tag type
+        default:
+            cerr << "BamAlignment ERROR: unknown tag type encountered: "
+                 << storageType << endl;
+            return false;
+    }
+
+    // return success
+    return true;
+}