// ***************************************************************************
// BamAlignment.cpp (c) 2009 Derek Barnett
// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 19 April 2011 (DB)
+// Last modified: 13 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides the BamAlignment data structure
// ***************************************************************************
-#include <api/BamAlignment.h>
-#include <api/BamConstants.h>
+#include "api/BamAlignment.h"
+#include "api/BamConstants.h"
using namespace BamTools;
-
-#include <cctype>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <exception>
-#include <iostream>
-#include <map>
-#include <utility>
using namespace std;
-// internal utility methods
-namespace BamTools {
-namespace Internal {
-
-/*! \fn bool IsValidSize(const string& tag, const string& type)
- \internal
-
- Checks that tag name & type strings are expected sizes.
- \a tag should have length 2
- \a type should have length 1
-
- \param tag BAM tag name
- \param type BAM tag type-code
-
- \return \c true if both \a tag and \a type are correct sizes
-*/
-bool IsValidSize(const string& tag, const string& type) {
- return (tag.size() == Constants::BAM_TAG_TAGSIZE) &&
- (type.size() == Constants::BAM_TAG_TYPESIZE);
-}
-
-/*! \fn bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed)
- \internal
-
- Moves to next available tag in tag data string
-
- \param storageType BAM tag type-code that determines how far to move cursor
- \param pTagData pointer to current position (cursor) in tag string
- \param numBytesParsed report of how many bytes were parsed (cumulatively)
-
- \return \c true if storageType was a recognized BAM tag type
- \post \a pTagData will point to the byte where the next tag data begins.
- \a numBytesParsed will correspond to the cursor's position in the full TagData string.
-*/
-bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
-
- switch (storageType) {
-
- case (Constants::BAM_TAG_TYPE_ASCII) :
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- ++numBytesParsed;
- ++pTagData;
- break;
-
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- numBytesParsed += sizeof(uint16_t);
- pTagData += sizeof(uint16_t);
- break;
-
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- numBytesParsed += sizeof(uint32_t);
- pTagData += sizeof(uint32_t);
- break;
-
- case (Constants::BAM_TAG_TYPE_STRING) :
- case (Constants::BAM_TAG_TYPE_HEX) :
- while( *pTagData ) {
- ++numBytesParsed;
- ++pTagData;
- }
- // increment for null-terminator
- ++numBytesParsed;
- ++pTagData;
- break;
-
- case (Constants::BAM_TAG_TYPE_ARRAY) :
-
- {
- // read array type
- const char arrayType = *pTagData;
- ++numBytesParsed;
- ++pTagData;
-
- // read number of elements
- int32_t numElements;
- memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped if necessary
- numBytesParsed += sizeof(uint32_t);
- pTagData += sizeof(uint32_t);
-
- // calculate number of bytes to skip
- int bytesToSkip = 0;
- switch (arrayType) {
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- bytesToSkip = numElements;
- break;
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- bytesToSkip = numElements*sizeof(uint16_t);
- break;
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- bytesToSkip = numElements*sizeof(uint32_t);
- break;
- default:
- cerr << "BamAlignment ERROR: unknown binary array type encountered: "
- << arrayType << endl;
- return false;
- }
-
- // skip binary array contents
- numBytesParsed += bytesToSkip;
- pTagData += bytesToSkip;
- break;
- }
-
- default:
- cerr << "BamAlignment ERROR: unknown tag type encountered"
- << storageType << endl;
- return false;
- }
-
- // return success
- return true;
-}
-
-/*! \fn bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed)
- \internal
-
- Searches for requested tag in BAM tag data.
-
- \param tag requested 2-character tag name
- \param pTagData pointer to current position in BamAlignment::TagData
- \param tagDataLength length of BamAlignment::TagData
- \param numBytesParsed number of bytes parsed so far
-
- \return \c true if found
-
- \post If \a tag is found, \a pTagData will point to the byte where the tag data begins.
- \a numBytesParsed will correspond to the position in the full TagData string.
-
-*/
-bool FindTag(const std::string& tag,
- char* &pTagData,
- const unsigned int& tagDataLength,
- unsigned int& numBytesParsed)
-{
-
- while ( numBytesParsed < tagDataLength ) {
-
- const char* pTagType = pTagData;
- const char* pTagStorageType = pTagData + 2;
- pTagData += 3;
- numBytesParsed += 3;
-
- // check the current tag, return true on match
- if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
- return true;
-
- // get the storage class and find the next tag
- if ( *pTagStorageType == '\0' ) return false;
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
- if ( *pTagData == '\0' ) return false;
- }
-
- // checked all tags, none match
- return false;
-}
-
-} // namespace Internal
-} // namespace BamTools
-
/*! \class BamTools::BamAlignment
\brief The main BAM alignment data structure.
*/
/*! \fn BamAlignment::~BamAlignment(void)
    \brief Destructor. The body is empty, so no explicit cleanup is performed here.
*/
BamAlignment::~BamAlignment(void) { }
-/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value)
- \brief Adds a field with string data to the BAM tags.
-
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
-
- \param tag 2-character tag name
- \param type 1-character tag type (must be "Z" or "H")
- \param value string data to store
-
- \return \c true if the \b new tag was added successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) {
-
- // skip if only core data was parsed (character data not yet built)
- if ( SupportData.HasCoreOnly ) return false;
-
- // validate tag/type size & that type is OK for string value
- if ( !Internal::IsValidSize(tag, type) ) return false;
- if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING &&
- type.at(0) != Constants::BAM_TAG_TYPE_HEX
- )
- {
- return false;
- }
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // otherwise, copy tag data to temp buffer
- string newTag = tag + type + value;
- const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
-
- // append newTag
- strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
-
-/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value)
- \brief Adds a field with unsigned integer data to the BAM tags.
+/*! \fn bool BamAlignment::BuildCharData(void)
+ \brief Populates alignment string fields (read name, bases, qualities, tag data).
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+ An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data.
+ Using that method makes parsing much quicker when only positional data is required.
- \param tag 2-character tag name
- \param type 1-character tag type (must NOT be "f", "Z", "H", or "B")
- \param value unsigned int data to store
+ However, if you later want to access the character data fields from such an alignment,
+ use this method to populate those fields. Provides ability to do 'lazy evaluation' of
+ alignment parsing.
- \return \c true if the \b new tag was added successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+ \return \c true if character data populated successfully (or was already available to begin with)
*/
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) {
-
- // skip if only core data was parsed (character data not yet built)
- if ( SupportData.HasCoreOnly ) return false;
-
- // validate tag/type size & that type is OK for uint32_t value
- if ( !Internal::IsValidSize(tag, type) ) return false;
- if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT ||
- type.at(0) == Constants::BAM_TAG_TYPE_STRING ||
- type.at(0) == Constants::BAM_TAG_TYPE_HEX ||
- type.at(0) == Constants::BAM_TAG_TYPE_ARRAY
- )
- {
- return false;
- }
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // otherwise, convert value to string
- union { uint32_t value; char valueBuffer[sizeof(uint32_t)]; } un;
- un.value = value;
-
- // copy original tag data to temp buffer
- string newTag = tag + type;
- const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
-
- // append newTag
- strcat(originalTagData + tagDataLength, newTag.data());
- memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(uint32_t));
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
-
-/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value)
- \brief Adds a field with signed integer data to the BAM tags.
+bool BamAlignment::BuildCharData(void) {
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+ // skip if char data already parsed
+ if ( !SupportData.HasCoreOnly )
+ return true;
- \param tag 2-character tag name
- \param type 1-character tag type (must NOT be "f", "Z", "H", or "B")
- \param value signed int data to store
+ // check system endianness
+ bool IsBigEndian = BamTools::SystemIsBigEndian();
- \return \c true if the \b new tag was added successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) {
- return AddTag(tag, type, (const uint32_t&)value);
-}
+ // calculate character lengths/offsets
+ const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE;
+ const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations*4);
+ const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2;
+ const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength;
+ const unsigned int tagDataLength = dataLength - tagDataOffset;
-/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value)
- \brief Adds a field with floating-point data to the BAM tags.
+ // check offsets to see what char data exists
+ const bool hasSeqData = ( seqDataOffset < dataLength );
+ const bool hasQualData = ( qualDataOffset < dataLength );
+ const bool hasTagData = ( tagDataOffset < dataLength );
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+ // set up char buffers
+ const char* allCharData = SupportData.AllCharData.data();
+ const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 );
+ const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 );
+ char* tagData = ( hasTagData ? (((char*)allCharData) + tagDataOffset) : (char*)0 );
- \param tag 2-character tag name
- \param type 1-character tag type (must NOT be "Z", "H", or "B")
- \param value float data to store
+ // store alignment name (relies on null char in name as terminator)
+ Name.assign((const char*)(allCharData));
- \return \c true if the \b new tag was added successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) {
-
- // skip if only core data was parsed (character data not yet built)
- if ( SupportData.HasCoreOnly ) return false;
-
- // validate tag/type size & that type is OK for float value
- if ( !Internal::IsValidSize(tag, type) ) return false;
- if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING ||
- type.at(0) == Constants::BAM_TAG_TYPE_HEX ||
- type.at(0) == Constants::BAM_TAG_TYPE_ARRAY
- )
- {
- return false;
+ // save query sequence
+ QueryBases.clear();
+ if ( hasSeqData ) {
+ QueryBases.reserve(SupportData.QuerySequenceLength);
+ for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
+ const char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
+ QueryBases.append(1, singleBase);
+ }
}
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // otherwise, convert value to string
- union { float value; char valueBuffer[sizeof(float)]; } un;
- un.value = value;
-
- // copy original tag data to temp buffer
- string newTag = tag + type;
- const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
-
- // append newTag
- strcat(originalTagData + tagDataLength, newTag.data());
- memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float));
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
-
-/*! \fn bool AddTag(const std::string& tag, const std::vector<uint8_t>& values);
- \brief Adds a numeric array field to the BAM tags.
+ // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
+ Qualities.clear();
+ if ( hasQualData ) {
+ Qualities.reserve(SupportData.QuerySequenceLength);
+ for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) {
+ const char singleQuality = static_cast<const char>(qualData[i]+33);
+ Qualities.append(1, singleQuality);
+ }
+ }
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+ // clear previous AlignedBases
+ AlignedBases.clear();
- \param tag 2-character tag name
- \param values vector of uint8_t values to store
+ // if QueryBases has data, build AlignedBases using CIGAR data
+ // otherwise, AlignedBases will remain empty (this case IS allowed)
+ if ( !QueryBases.empty() ) {
- \return \c true if the \b new tag was added successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::vector<uint8_t>& values) {
+ // reserve capacity for AlignedBases
+ AlignedBases.reserve(SupportData.QuerySequenceLength);
- // skip if only core data was parsed (character data not yet built)
- if ( SupportData.HasCoreOnly ) return false;
+ // iterate over CigarOps
+ int k = 0;
+ vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
+ vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
+ for ( ; cigarIter != cigarEnd; ++cigarIter ) {
+ const CigarOp& op = (*cigarIter);
- // check for valid tag length
- if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // build new tag's base information
- char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
- memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
- newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
- newTagBase[3] = Constants::BAM_TAG_TYPE_UINT8;
-
- // add number of array elements to newTagBase
- const int32_t numElements = values.size();
- memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
-
- // copy current TagData string to temp buffer, leaving room for new tag's contents
- const int newTagDataLength = tagDataLength +
- Constants::BAM_TAG_ARRAYBASE_SIZE +
- numElements*sizeof(uint8_t);
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
-
- // write newTagBase (removes old null term)
- strcat(originalTagData + tagDataLength, (const char*)newTagBase);
-
- // add vector elements to tag
- int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
- for ( int i = 0 ; i < numElements; ++i ) {
- const uint8_t value = values.at(i);
- memcpy(originalTagData + elementsBeginOffset + i*sizeof(uint8_t),
- &value, sizeof(uint8_t));
- }
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
-
-/*! \fn bool AddTag(const std::string& tag, const std::vector<int8_t>& values);
- \brief Adds a numeric array field to the BAM tags.
-
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
-
- \param tag 2-character tag name
- \param values vector of int8_t values to store
-
- \return \c true if the \b new tag was added successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::vector<int8_t>& values) {
-
- // skip if only core data was parsed (character data not yet built)
- if ( SupportData.HasCoreOnly ) return false;
-
- // check for valid tag length
- if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // build new tag's base information
- char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
- memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
- newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
- newTagBase[3] = Constants::BAM_TAG_TYPE_INT8;
-
- // add number of array elements to newTagBase
- const int32_t numElements = values.size();
- memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
-
- // copy current TagData string to temp buffer, leaving room for new tag's contents
- const int newTagDataLength = tagDataLength +
- Constants::BAM_TAG_ARRAYBASE_SIZE +
- numElements*sizeof(int8_t);
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
-
- // write newTagBase (removes old null term)
- strcat(originalTagData + tagDataLength, (const char*)newTagBase);
-
- // add vector elements to tag
- int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
- for ( int i = 0 ; i < numElements; ++i ) {
- const int8_t value = values.at(i);
- memcpy(originalTagData + elementsBeginOffset + i*sizeof(int8_t),
- &value, sizeof(int8_t));
- }
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
-
-/*! \fn bool AddTag(const std::string& tag, const std::vector<uint16_t>& values);
- \brief Adds a numeric array field to the BAM tags.
-
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
-
- \param tag 2-character tag name
- \param values vector of uint16_t values to store
-
- \return \c true if the \b new tag was added successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::vector<uint16_t>& values) {
-
- // skip if only core data was parsed (character data not yet built)
- if ( SupportData.HasCoreOnly ) return false;
-
- // check for valid tag length
- if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // build new tag's base information
- char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
- memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
- newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
- newTagBase[3] = Constants::BAM_TAG_TYPE_UINT16;
-
- // add number of array elements to newTagBase
- const int32_t numElements = values.size();
- memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
-
- // copy current TagData string to temp buffer, leaving room for new tag's contents
- const int newTagDataLength = tagDataLength +
- Constants::BAM_TAG_ARRAYBASE_SIZE +
- numElements*sizeof(uint16_t);
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
-
- // write newTagBase (removes old null term)
- strcat(originalTagData + tagDataLength, (const char*)newTagBase);
-
- // add vector elements to tag
- int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
- for ( int i = 0 ; i < numElements; ++i ) {
- const uint16_t value = values.at(i);
- memcpy(originalTagData + elementsBeginOffset + i*sizeof(uint16_t),
- &value, sizeof(uint16_t));
- }
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
-
-/*! \fn bool AddTag(const std::string& tag, const std::vector<int16_t>& values);
- \brief Adds a numeric array field to the BAM tags.
-
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
-
- \param tag 2-character tag name
- \param values vector of int16_t values to store
-
- \return \c true if the \b new tag was added successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::vector<int16_t>& values) {
-
- // skip if only core data was parsed (character data not yet built)
- if ( SupportData.HasCoreOnly ) return false;
-
- // check for valid tag length
- if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // build new tag's base information
- char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
- memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
- newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
- newTagBase[3] = Constants::BAM_TAG_TYPE_INT16;
-
- // add number of array elements to newTagBase
- const int32_t numElements = values.size();
- memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
-
- // copy current TagData string to temp buffer, leaving room for new tag's contents
- const int newTagDataLength = tagDataLength +
- Constants::BAM_TAG_ARRAYBASE_SIZE +
- numElements*sizeof(int16_t);
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
-
- // write newTagBase (removes old null term)
- strcat(originalTagData + tagDataLength, (const char*)newTagBase);
-
- // add vector elements to tag
- int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
- for ( int i = 0 ; i < numElements; ++i ) {
- const int16_t value = values.at(i);
- memcpy(originalTagData + elementsBeginOffset + i*sizeof(int16_t),
- &value, sizeof(int16_t));
- }
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
-
-/*! \fn bool AddTag(const std::string& tag, const std::vector<uint32_t>& values);
- \brief Adds a numeric array field to the BAM tags.
-
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
-
- \param tag 2-character tag name
- \param values vector of uint32_t values to store
-
- \return \c true if the \b new tag was added successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::vector<uint32_t>& values) {
-
- // skip if only core data was parsed (character data not yet built)
- if ( SupportData.HasCoreOnly ) return false;
-
- // check for valid tag length
- if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // build new tag's base information
- char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
- memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
- newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
- newTagBase[3] = Constants::BAM_TAG_TYPE_UINT32;
-
- // add number of array elements to newTagBase
- const int32_t numElements = values.size();
- memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
-
- // copy current TagData string to temp buffer, leaving room for new tag's contents
- const int newTagDataLength = tagDataLength +
- Constants::BAM_TAG_ARRAYBASE_SIZE +
- numElements*sizeof(uint32_t);
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
-
- // write newTagBase (removes old null term)
- strcat(originalTagData + tagDataLength, (const char*)newTagBase);
-
- // add vector elements to tag
- int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
- for ( int i = 0 ; i < numElements; ++i ) {
- const uint32_t value = values.at(i);
- memcpy(originalTagData + elementsBeginOffset + i*sizeof(uint32_t),
- &value, sizeof(uint32_t));
- }
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
-
-/*! \fn bool AddTag(const std::string& tag, const std::vector<int32_t>& values);
- \brief Adds a numeric array field to the BAM tags.
-
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
-
- \param tag 2-character tag name
- \param values vector of int32_t values to store
-
- \return \c true if the \b new tag was added successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::vector<int32_t>& values) {
-
- // skip if only core data was parsed (character data not yet built)
- if ( SupportData.HasCoreOnly ) return false;
-
- // check for valid tag length
- if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // build new tag's base information
- char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
- memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
- newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
- newTagBase[3] = Constants::BAM_TAG_TYPE_INT32;
-
- // add number of array elements to newTagBase
- const int32_t numElements = values.size();
- memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
-
- // copy current TagData string to temp buffer, leaving room for new tag's contents
- const int newTagDataLength = tagDataLength +
- Constants::BAM_TAG_ARRAYBASE_SIZE +
- numElements*sizeof(int32_t);
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
-
- // write newTagBase (removes old null term)
- strcat(originalTagData + tagDataLength, (const char*)newTagBase);
-
- // add vector elements to tag
- int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
- for ( int i = 0 ; i < numElements; ++i ) {
- const int32_t value = values.at(i);
- memcpy(originalTagData + elementsBeginOffset + i*sizeof(int32_t),
- &value, sizeof(int32_t));
- }
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
-
-/*! \fn bool AddTag(const std::string& tag, const std::vector<float>& values);
- \brief Adds a numeric array field to the BAM tags.
-
- Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
-
- \param tag 2-character tag name
- \param values vector of float values to store
-
- \return \c true if the \b new tag was added successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::AddTag(const std::string& tag, const std::vector<float>& values) {
-
- // skip if only core data was parsed (character data not yet built)
- if ( SupportData.HasCoreOnly ) return false;
-
- // check for valid tag length
- if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag already exists, return false
- // use EditTag explicitly instead
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // build new tag's base information
- char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
- memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
- newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
- newTagBase[3] = Constants::BAM_TAG_TYPE_FLOAT;
-
- // add number of array elements to newTagBase
- const int32_t numElements = values.size();
- memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
-
- // copy current TagData string to temp buffer, leaving room for new tag's contents
- const int newTagDataLength = tagDataLength +
- Constants::BAM_TAG_ARRAYBASE_SIZE +
- numElements*sizeof(float);
- char originalTagData[newTagDataLength];
- memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
-
- // write newTagBase (removes old null term)
- strcat(originalTagData + tagDataLength, (const char*)newTagBase);
-
- // add vector elements to tag
- int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
- for ( int i = 0 ; i < numElements; ++i ) {
- const float value = values.at(i);
- memcpy(originalTagData + elementsBeginOffset + i*sizeof(float),
- &value, sizeof(float));
- }
-
- // store temp buffer back in TagData
- const char* newTagData = (const char*)originalTagData;
- TagData.assign(newTagData, newTagDataLength);
-
- // return success
- return true;
-}
-
-/*! \fn bool BamAlignment::BuildCharData(void)
- \brief Populates alignment string fields (read name, bases, qualities, tag data).
-
- An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data.
- Using that method makes parsing much quicker when only positional data is required.
-
- However, if you later want to access the character data fields from such an alignment,
- use this method to populate those fields. Provides ability to do 'lazy evaluation' of
- alignment parsing.
-
- \return \c true if character data populated successfully (or was already available to begin with)
-*/
-bool BamAlignment::BuildCharData(void) {
-
- // skip if char data already parsed
- if ( !SupportData.HasCoreOnly )
- return true;
-
- // check system endianness
- bool IsBigEndian = BamTools::SystemIsBigEndian();
-
- // calculate character lengths/offsets
- const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE;
- const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations * 4);
- const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2;
- const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength;
- const unsigned int tagDataLength = dataLength - tagDataOffset;
-
- // check offsets to see what char data exists
- const bool hasSeqData = ( seqDataOffset < dataLength );
- const bool hasQualData = ( qualDataOffset < dataLength );
- const bool hasTagData = ( tagDataOffset < dataLength );
-
- // set up char buffers
- const char* allCharData = SupportData.AllCharData.data();
- const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 );
- const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 );
- char* tagData = ( hasTagData ? (((char*)allCharData) + tagDataOffset) : (char*)0 );
-
- // store alignment name (relies on null char in name as terminator)
- Name.assign((const char*)(allCharData));
-
- // save query sequence
- QueryBases.clear();
- if ( hasSeqData ) {
- QueryBases.reserve(SupportData.QuerySequenceLength);
- for (unsigned int i = 0; i < SupportData.QuerySequenceLength; ++i) {
- char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
- QueryBases.append(1, singleBase);
- }
- }
-
- // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
- Qualities.clear();
- if ( hasQualData ) {
- Qualities.reserve(SupportData.QuerySequenceLength);
- for (unsigned int i = 0; i < SupportData.QuerySequenceLength; ++i) {
- char singleQuality = (char)(qualData[i]+33);
- Qualities.append(1, singleQuality);
- }
- }
-
- // clear previous AlignedBases
- AlignedBases.clear();
-
- // if QueryBases has data, build AlignedBases using CIGAR data
- // otherwise, AlignedBases will remain empty (this case IS allowed)
- if ( !QueryBases.empty() ) {
-
- // resize AlignedBases
- AlignedBases.reserve(SupportData.QuerySequenceLength);
-
- // iterate over CigarOps
- int k = 0;
- vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
- vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
- for ( ; cigarIter != cigarEnd; ++cigarIter ) {
- const CigarOp& op = (*cigarIter);
-
- switch (op.Type) {
+ switch ( op.Type ) {
// for 'M', 'I', '=', 'X' - write bases
case (Constants::BAM_CIGAR_MATCH_CHAR) :
case (Constants::BAM_CIGAR_HARDCLIP_CHAR) :
break;
- // shouldn't get here
+ // invalid CIGAR op-code
default:
- cerr << "BamAlignment ERROR: invalid CIGAR operation type: "
- << op.Type << endl;
- exit(1);
+ const string message = string("invalid CIGAR operation type: ") + op.Type;
+ SetErrorString("BamAlignment::BuildCharData", message);
+ return false;
}
}
}
TagData.clear();
if ( hasTagData ) {
if ( IsBigEndian ) {
- int i = 0;
- while ( (unsigned int)i < tagDataLength ) {
+ size_t i = 0;
+ while ( i < tagDataLength ) {
i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
const char type = tagData[i]; // get tag type at position i
// swap endian-ness of number of elements in place, then retrieve for loop
BamTools::SwapEndian_32p(&tagData[i]);
- int32_t numElements;
+ uint32_t numElements;
memcpy(&numElements, &tagData[i], sizeof(uint32_t));
i += sizeof(uint32_t);
// swap endian-ness of array elements
- for ( int j = 0; j < numElements; ++j ) {
+ for ( size_t j = 0; j < numElements; ++j ) {
switch (arrayType) {
case (Constants::BAM_TAG_TYPE_INT8) :
case (Constants::BAM_TAG_TYPE_UINT8) :
i += sizeof(uint32_t);
break;
default:
- // error case
- cerr << "BamAlignment ERROR: unknown binary array type encountered: "
- << arrayType << endl;
+ const string message = string("invalid binary array type: ") + arrayType;
+ SetErrorString("BamAlignment::BuildCharData", message);
return false;
}
}
break;
}
- // shouldn't get here
+ // invalid tag type-code
default :
- cerr << "BamAlignment ERROR: invalid tag value type: "
- << type << endl;
- exit(1);
+ const string message = string("invalid tag type: ") + type;
+ SetErrorString("BamAlignment::BuildCharData", message);
+ return false;
}
}
}
// store tagData in alignment
TagData.resize(tagDataLength);
- memcpy((char*)TagData.data(), tagData, tagDataLength);
+ memcpy((char*)(TagData.data()), tagData, tagDataLength);
}
- // clear the core-only flag
+ // clear core-only flag & return success
SupportData.HasCoreOnly = false;
-
- // return success
return true;
}
-/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value)
- \brief Edits a BAM tag field containing string data.
-
- If \a tag does not exist, a new entry is created.
-
- \param tag 2-character tag name
- \param type 1-character tag type (must be "Z" or "H")
- \param value string data to store
-
- \return \c true if the tag was modified/created successfully
-
- \sa BamAlignment::RemoveTag()
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) {
-
- // skip if core data not parsed
- if ( SupportData.HasCoreOnly ) return false;
-
- // validate tag/type size & that type is OK for string value
- if ( !Internal::IsValidSize(tag, type) ) return false;
- if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING &&
- type.at(0) != Constants::BAM_TAG_TYPE_HEX )
- return false;
-
- // localize the tag data
- char* pOriginalTagData = (char*)TagData.data();
- char* pTagData = pOriginalTagData;
- const unsigned int originalTagDataLength = TagData.size();
-
- unsigned int newTagDataLength = 0;
- unsigned int numBytesParsed = 0;
-
- // if tag found
- if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
-
- // make sure array is more than big enough
- char newTagData[originalTagDataLength + value.size()];
-
- // copy original tag data up til desired tag
- const unsigned int beginningTagDataLength = numBytesParsed;
- newTagDataLength += beginningTagDataLength;
- memcpy(newTagData, pOriginalTagData, numBytesParsed);
-
- // copy new @value in place of current tag data
- const unsigned int dataLength = strlen(value.c_str());
- memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 );
-
- // skip to next tag (if tag for removal is last, return true)
- const char* pTagStorageType = pTagData - 1;
- if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) )
- return true;
-
- // copy everything from current tag (the next one after tag for removal) to end
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
- const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1;
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
-
- // ensure null-terminator
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
-
- // save new tag data
- TagData.assign(newTagData, endTagOffset + endTagDataLength);
- return true;
- }
-
- // tag not found, attempt AddTag
- else return AddTag(tag, type, value);
-}
+/*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) const
+ \internal
-/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value)
- \brief Edits a BAM tag field containing unsigned integer data.
+ Searches for requested tag in BAM tag data.
- If \a tag does not exist, a new entry is created.
+ \param[in] tag requested 2-character tag name
+ \param[in,out] pTagData pointer to current position in BamAlignment::TagData
+ \param[in] tagDataLength length of BamAlignment::TagData
+ \param[in,out] numBytesParsed number of bytes parsed so far
- \param tag 2-character tag name
- \param type 1-character tag type (must NOT be "f", "Z", "H", or "B")
- \param value unsigned integer data to store
+ \return \c true if found
- \return \c true if the tag was modified/created successfully
+ \post If \a tag is found, \a pTagData will point to the byte where the tag data begins.
+ \a numBytesParsed will correspond to the position in the full TagData string.
- \sa BamAlignment::RemoveTag()
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
*/
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) {
-
- // skip if core data not parsed
- if ( SupportData.HasCoreOnly ) return false;
-
- // validate tag/type size & that type is OK for uint32_t value
- if ( !Internal::IsValidSize(tag, type) ) return false;
- if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT ||
- type.at(0) == Constants::BAM_TAG_TYPE_STRING ||
- type.at(0) == Constants::BAM_TAG_TYPE_HEX ||
- type.at(0) == Constants::BAM_TAG_TYPE_ARRAY
- )
- {
- return false;
- }
+bool BamAlignment::FindTag(const std::string& tag,
+ char*& pTagData,
+ const unsigned int& tagDataLength,
+ unsigned int& numBytesParsed) const
+{
- // localize the tag data
- char* pOriginalTagData = (char*)TagData.data();
- char* pTagData = pOriginalTagData;
- const unsigned int originalTagDataLength = TagData.size();
-
- unsigned int newTagDataLength = 0;
- unsigned int numBytesParsed = 0;
-
- // if tag found
- if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
-
- // make sure array is more than big enough
- char newTagData[originalTagDataLength + sizeof(value)];
-
- // copy original tag data up til desired tag
- const unsigned int beginningTagDataLength = numBytesParsed;
- newTagDataLength += beginningTagDataLength;
- memcpy(newTagData, pOriginalTagData, numBytesParsed);
-
- // copy new @value in place of current tag data
- union { uint32_t value; char valueBuffer[sizeof(uint32_t)]; } un;
- un.value = value;
- memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(uint32_t));
-
- // skip to next tag (if tag for removal is last, return true)
- const char* pTagStorageType = pTagData - 1;
- if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) )
- return true;
-
- // copy everything from current tag (the next one after tag for removal) to end
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
- const unsigned int endTagOffset = beginningTagDataLength + sizeof(uint32_t);
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
-
- // ensure null-terminator
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
-
- // save new tag data
- TagData.assign(newTagData, endTagOffset + endTagDataLength);
- return true;
- }
-
- // tag not found, attempt AddTag
- else return AddTag(tag, type, value);
-}
-
-/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value)
- \brief Edits a BAM tag field containing signed integer data.
-
- If \a tag does not exist, a new entry is created.
-
- \param tag 2-character tag name
- \param type 1-character tag type (must NOT be "f", "Z", "H", or "B")
- \param value signed integer data to store
-
- \return \c true if the tag was modified/created successfully
-
- \sa BamAlignment::RemoveTag()
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) {
- return EditTag(tag, type, (const uint32_t&)value);
-}
-
-/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value)
- \brief Edits a BAM tag field containing floating-point data.
-
- If \a tag does not exist, a new entry is created.
-
- \param tag 2-character tag name
- \param type 1-character tag type (must NOT be "Z", "H", or "B")
- \param value float data to store
-
- \return \c true if the tag was modified/created successfully
-
- \sa BamAlignment::RemoveTag()
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) {
-
- // skip if core data not parsed
- if ( SupportData.HasCoreOnly ) return false;
-
- // validate tag/type size & that type is OK for float value
- if ( !Internal::IsValidSize(tag, type) ) return false;
- if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING ||
- type.at(0) == Constants::BAM_TAG_TYPE_HEX ||
- type.at(0) == Constants::BAM_TAG_TYPE_ARRAY
- )
- {
- return false;
- }
-
- // localize the tag data
- char* pOriginalTagData = (char*)TagData.data();
- char* pTagData = pOriginalTagData;
- const unsigned int originalTagDataLength = TagData.size();
-
- unsigned int newTagDataLength = 0;
- unsigned int numBytesParsed = 0;
-
- // if tag found
- if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
-
- // make sure array is more than big enough
- char newTagData[originalTagDataLength + sizeof(value)];
-
- // copy original tag data up til desired tag
- const unsigned int beginningTagDataLength = numBytesParsed;
- newTagDataLength += beginningTagDataLength;
- memcpy(newTagData, pOriginalTagData, numBytesParsed);
-
- // copy new @value in place of current tag data
- union { float value; char valueBuffer[sizeof(float)]; } un;
- un.value = value;
- memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float));
-
- // skip to next tag (if tag for removal is last, return true)
- const char* pTagStorageType = pTagData - 1;
- if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) )
- return true;
-
- // copy everything from current tag (the next one after tag for removal) to end
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
- const unsigned int endTagOffset = beginningTagDataLength + sizeof(float);
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
-
- // ensure null-terminator
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
-
- // save new tag data
- TagData.assign(newTagData, endTagOffset + endTagDataLength);
- return true;
- }
-
- // tag not found, attempt AddTag
- else return AddTag(tag, type, value);
-}
-
-/*! \fn bool EditTag(const std::string& tag, const std::vector<uint8_t>& values);
- \brief Edits a BAM tag field containing a numeric array.
-
- If \a tag does not exist, a new entry is created.
-
- \param tag 2-character tag name
- \param value vector of uint8_t values to store
-
- \return \c true if the tag was modified/created successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::vector<uint8_t>& values) {
-
- // can't do anything if TagData not parsed
- if ( SupportData.HasCoreOnly )
- return false;
-
- // remove existing tag if present
- if ( HasTag(tag) )
- RemoveTag(tag);
-
- // add tag record with new values
- return AddTag(tag, values);
-}
-
-/*! \fn bool EditTag(const std::string& tag, const std::vector<int8_t>& values);
- \brief Edits a BAM tag field containing a numeric array.
-
- If \a tag does not exist, a new entry is created.
-
- \param tag 2-character tag name
- \param value vector of int8_t values to store
-
- \return \c true if the tag was modified/created successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::vector<int8_t>& values) {
-
- // can't do anything if TagData not parsed
- if ( SupportData.HasCoreOnly )
- return false;
-
- // remove existing tag if present
- if ( HasTag(tag) )
- RemoveTag(tag);
-
- // add tag record with new values
- return AddTag(tag, values);
-}
-
-/*! \fn bool EditTag(const std::string& tag, const std::vector<uint16_t>& values);
- \brief Edits a BAM tag field containing a numeric array.
-
- If \a tag does not exist, a new entry is created.
-
- \param tag 2-character tag name
- \param value vector of uint16_t values to store
-
- \return \c true if the tag was modified/created successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::vector<uint16_t>& values) {
-
- // can't do anything if TagData not parsed
- if ( SupportData.HasCoreOnly )
- return false;
-
- // remove existing tag if present
- if ( HasTag(tag) )
- RemoveTag(tag);
-
- // add tag record with new values
- return AddTag(tag, values);
-}
-
-/*! \fn bool EditTag(const std::string& tag, const std::vector<int16_t>& values);
- \brief Edits a BAM tag field containing a numeric array.
-
- If \a tag does not exist, a new entry is created.
-
- \param tag 2-character tag name
- \param value vector of int16_t values to store
-
- \return \c true if the tag was modified/created successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::vector<int16_t>& values) {
-
- // can't do anything if TagData not parsed
- if ( SupportData.HasCoreOnly )
- return false;
-
- // remove existing tag if present
- if ( HasTag(tag) )
- RemoveTag(tag);
-
- // add tag record with new values
- return AddTag(tag, values);
-}
-
-/*! \fn bool EditTag(const std::string& tag, const std::vector<uint32_t>& values);
- \brief Edits a BAM tag field containing a numeric array.
-
- If \a tag does not exist, a new entry is created.
-
- \param tag 2-character tag name
- \param value vector of uint32_t values to store
-
- \return \c true if the tag was modified/created successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::vector<uint32_t>& values) {
-
- // can't do anything if TagData not parsed
- if ( SupportData.HasCoreOnly )
- return false;
-
- // remove existing tag if present
- if ( HasTag(tag) )
- RemoveTag(tag);
-
- // add tag record with new values
- return AddTag(tag, values);
-}
-
-/*! \fn bool EditTag(const std::string& tag, const std::vector<int32_t>& values);
- \brief Edits a BAM tag field containing a numeric array.
-
- If \a tag does not exist, a new entry is created.
-
- \param tag 2-character tag name
- \param value vector of int32_t values to store
-
- \return \c true if the tag was modified/created successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::vector<int32_t>& values) {
-
- // can't do anything if TagData not parsed
- if ( SupportData.HasCoreOnly )
- return false;
-
- // remove existing tag if present
- if ( HasTag(tag) )
- RemoveTag(tag);
-
- // add tag record with new values
- return AddTag(tag, values);
-}
-
-/*! \fn bool EditTag(const std::string& tag, const std::vector<float>& values);
- \brief Edits a BAM tag field containing a numeric array.
-
- If \a tag does not exist, a new entry is created.
-
- \param tag 2-character tag name
- \param value vector of float values to store
-
- \return \c true if the tag was modified/created successfully
- \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
-*/
-bool BamAlignment::EditTag(const std::string& tag, const std::vector<float>& values) {
-
- // can't do anything if TagData not parsed
- if ( SupportData.HasCoreOnly )
- return false;
-
- // remove existing tag if present
- if ( HasTag(tag) )
- RemoveTag(tag);
-
- // add tag record with new values
- return AddTag(tag, values);
-}
-
-/*! \fn bool BamAlignment::GetEditDistance(uint32_t& editDistance) const
- \brief Retrieves value of edit distance tag ("NM").
-
- \deprecated Instead use BamAlignment::GetTag()
- \code
- BamAlignment::GetTag("NM", editDistance);
- \endcode
-
- \param editDistance destination for retrieved value
-
- \return \c true if found
-*/
-bool BamAlignment::GetEditDistance(uint32_t& editDistance) const {
- return GetTag("NM", (uint32_t&)editDistance);
-}
-
-/*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool zeroBased = true) const
- \brief Calculates alignment end position, based on starting position and CIGAR data.
-
- \param usePadded Inserted bases affect reported position. Default is false, so that reported
- position stays 'sync-ed' with reference coordinates.
- \param zeroBased Return (BAM standard) 0-based coordinate. Setting this to false can be useful
- when using BAM data with half-open formats (e.g. BED).
-
- \return alignment end position
-*/
-int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const {
-
- // initialize alignment end to starting position
- int alignEnd = Position;
-
- // iterate over cigar operations
- vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
- vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
- for ( ; cigarIter != cigarEnd; ++cigarIter) {
- const char cigarType = (*cigarIter).Type;
- const uint32_t& cigarLength = (*cigarIter).Length;
-
- if ( cigarType == Constants::BAM_CIGAR_MATCH_CHAR ||
- cigarType == Constants::BAM_CIGAR_DEL_CHAR ||
- cigarType == Constants::BAM_CIGAR_REFSKIP_CHAR )
- alignEnd += cigarLength;
- else if ( usePadded && cigarType == Constants::BAM_CIGAR_INS_CHAR )
- alignEnd += cigarLength;
- }
-
- // adjust for zero-based coordinates, if requested
- if ( zeroBased ) alignEnd -= 1;
-
- // return result
- return alignEnd;
-}
-
-/*! \fn bool BamAlignment::GetReadGroup(std::string& readGroup) const
- \brief Retrieves value of read group tag ("RG").
-
- \deprecated Instead use BamAlignment::GetTag()
- \code
- BamAlignment::GetTag("RG", readGroup);
- \endcode
-
- \param readGroup destination for retrieved value
-
- \return \c true if found
-*/
-bool BamAlignment::GetReadGroup(std::string& readGroup) const {
- return GetTag("RG", readGroup);
-}
-
-/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const
- \brief Retrieves the string value associated with a BAM tag.
-
- \param tag 2-character tag name
- \param destination destination for retrieved value
-
- \return \c true if found
-*/
-bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const {
-
- // make sure tag data exists
- if ( SupportData.HasCoreOnly || TagData.empty() )
- return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag found
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
- const unsigned int dataLength = strlen(pTagData);
- destination.clear();
- destination.resize(dataLength);
- memcpy( (char*)destination.data(), pTagData, dataLength );
- return true;
- }
-
- // tag not found, return failure
- return false;
-}
-
-/*! \fn bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const
- \brief Retrieves the unsigned integer value associated with a BAM tag.
-
- \param tag 2-character tag name
- \param destination destination for retrieved value
-
- \return \c true if found
-*/
-bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const {
-
- // make sure tag data exists
- if ( SupportData.HasCoreOnly || TagData.empty() )
- return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag found
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
-
- // determine data byte-length
- const char type = *(pTagData - 1);
- int destinationLength = 0;
- switch (type) {
-
- // 1 byte data
- case (Constants::BAM_TAG_TYPE_ASCII) :
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- destinationLength = 1;
- break;
-
- // 2 byte data
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- destinationLength = 2;
- break;
-
- // 4 byte data
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- destinationLength = 4;
- break;
-
- // unsupported type for integer destination (float or var-length strings)
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_STRING) :
- case (Constants::BAM_TAG_TYPE_HEX) :
- case (Constants::BAM_TAG_TYPE_ARRAY) :
- cerr << "BamAlignment ERROR: cannot store tag of type " << type
- << " in integer destination" << endl;
- return false;
-
- // unknown tag type
- default:
- cerr << "BamAlignment ERROR: unknown tag type encountered: "
- << type << endl;
- return false;
- }
-
- // store in destination
- destination = 0;
- memcpy(&destination, pTagData, destinationLength);
- return true;
- }
-
- // tag not found, return failure
- return false;
-}
-
-/*! \fn bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const
- \brief Retrieves the signed integer value associated with a BAM tag.
-
- \param tag 2-character tag name
- \param destination destination for retrieved value
-
- \return \c true if found
-*/
-bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const {
- return GetTag(tag, (uint32_t&)destination);
-}
-
-/*! \fn bool BamAlignment::GetTag(const std::string& tag, float& destination) const
- \brief Retrieves the floating-point value associated with a BAM tag.
-
- \param tag 2-character tag name
- \param destination destination for retrieved value
-
- \return \c true if found
-*/
-bool BamAlignment::GetTag(const std::string& tag, float& destination) const {
-
- // make sure tag data exists
- if ( SupportData.HasCoreOnly || TagData.empty() )
- return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // if tag found
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
-
- // determine data byte-length
- const char type = *(pTagData - 1);
- int destinationLength = 0;
- switch (type) {
-
- // 1 byte data
- case (Constants::BAM_TAG_TYPE_ASCII) :
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- destinationLength = 1;
- break;
-
- // 2 byte data
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- destinationLength = 2;
- break;
-
- // 4 byte data
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- destinationLength = 4;
- break;
-
- // unsupported type (var-length strings)
- case (Constants::BAM_TAG_TYPE_STRING) :
- case (Constants::BAM_TAG_TYPE_HEX) :
- case (Constants::BAM_TAG_TYPE_ARRAY) :
- cerr << "BamAlignment ERROR: cannot store tag of type " << type
- << " in float destination" << endl;
- return false;
-
- // unknown tag type
- default:
- cerr << "BamAlignment ERROR: unknown tag type encountered: "
- << type << endl;
- return false;
- }
-
- // store in destination
- destination = 0.0;
- memcpy(&destination, pTagData, destinationLength);
- return true;
- }
-
- // tag not found, return failure
- return false;
-}
-
-/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector<uint32_t>& destination) const
- \brief Retrieves the numeric array data associated with a BAM tag
-
- \param tag 2-character tag name
- \param destination destination for retrieved data
-
- \return \c true if found
-*/
-bool BamAlignment::GetTag(const std::string& tag, std::vector<uint32_t>& destination) const {
-
- // make sure tag data exists
- if ( SupportData.HasCoreOnly || TagData.empty() )
- return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // return false if tag not found
- if ( !Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
-
- // check that tag is array type
- const char tagType = *(pTagData - 1);
- if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) {
- cerr << "BamAlignment ERROR: Cannot store non-array data from tag: "
- << tag << " in array destination" << endl;
- return false;
- }
-
- // calculate length of each element in tag's array
- const char elementType = *pTagData;
- ++pTagData;
- int elementLength = 0;
- switch ( elementType ) {
- case (Constants::BAM_TAG_TYPE_ASCII) :
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- elementLength = sizeof(uint8_t);
- break;
-
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- elementLength = sizeof(uint16_t);
- break;
-
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- elementLength = sizeof(uint32_t);
- break;
+ while ( numBytesParsed < tagDataLength ) {
- // unsupported type for integer destination (float or var-length data)
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_STRING) :
- case (Constants::BAM_TAG_TYPE_HEX) :
- case (Constants::BAM_TAG_TYPE_ARRAY) :
- cerr << "BamAlignment ERROR: array element type: " << elementType
- << " cannot be stored in integer value" << endl;
- return false;
+ const char* pTagType = pTagData;
+ const char* pTagStorageType = pTagData + 2;
+ pTagData += 3;
+ numBytesParsed += 3;
- // unknown tag type
- default:
- cerr << "BamAlignment ERROR: unknown element type encountered: "
- << elementType << endl;
- return false;
- }
+ // check the current tag, return true on match
+ if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
+ return true;
- // get number of elements
- int32_t numElements;
- memcpy(&numElements, pTagData, sizeof(int32_t));
- pTagData += 4;
- destination.clear();
- destination.reserve(numElements);
-
- // read in elements
- uint32_t value;
- for ( int i = 0 ; i < numElements; ++i ) {
- memcpy(&value, pTagData, sizeof(uint32_t));
- pTagData += sizeof(uint32_t);
- destination.push_back(value);
+ // get the storage class and find the next tag
+ if ( *pTagStorageType == '\0' ) return false;
+ if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
+ if ( *pTagData == '\0' ) return false;
}
- // return success
+ // checked all tags, none match
return false;
}
-/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector<int32_t>& destination) const
- \brief Retrieves the numeric array data associated with a BAM tag
-
- \param tag 2-character tag name
- \param destination destination for retrieved data
+/*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = false) const
+ \brief Calculates alignment end position, based on its starting position and CIGAR data.
- \return \c true if found
-*/
-bool BamAlignment::GetTag(const std::string& tag, std::vector<int32_t>& destination) const {
-
- // make sure tag data exists
- if ( SupportData.HasCoreOnly || TagData.empty() )
- return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
+ \warning The position returned now represents a zero-based, HALF-OPEN interval.
+ In previous versions of BamTools (0.x & 1.x) all intervals were treated
+ as zero-based, CLOSED.
- // return false if tag not found
- if ( !Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
+ \param[in] usePadded Allow inserted bases to affect the reported position. Default is
+ false, so that reported position stays synced with reference
+ coordinates.
+ \param[in] closedInterval Setting this to true will return a 0-based, closed (inclusive) end coordinate. Default is
+ false, so that this value represents a standard, half-open interval.
- // check that tag is array type
- const char tagType = *(pTagData - 1);
- if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) {
- cerr << "BamAlignment ERROR: Cannot store non-array data from tag: "
- << tag << " in array destination" << endl;
- return false;
- }
+ \return alignment end position
+*/
+int BamAlignment::GetEndPosition(bool usePadded, bool closedInterval) const {
- // calculate length of each element in tag's array
- const char elementType = *pTagData;
- ++pTagData;
- int elementLength = 0;
- switch ( elementType ) {
- case (Constants::BAM_TAG_TYPE_ASCII) :
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- elementLength = sizeof(uint8_t);
- break;
+ // start at the alignment's Position; the lengths of the CIGAR operations handled below are accumulated onto it
+ int alignEnd = Position;
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- elementLength = sizeof(uint16_t);
- break;
+ // walk every CIGAR operation once, in stored order
+ vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
+ vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
+ for ( ; cigarIter != cigarEnd; ++cigarIter) {
+ const CigarOp& op = (*cigarIter);
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- elementLength = sizeof(uint32_t);
- break;
+ switch ( op.Type ) {
- // unsupported type for integer destination (float or var-length data)
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_STRING) :
- case (Constants::BAM_TAG_TYPE_HEX) :
- case (Constants::BAM_TAG_TYPE_ARRAY) :
- cerr << "BamAlignment ERROR: array element type: " << elementType
- << " cannot be stored in integer value" << endl;
- return false;
+ // CIGAR chars [DMXN=] always advance the end position by the operation's length
+ case Constants::BAM_CIGAR_DEL_CHAR :
+ case Constants::BAM_CIGAR_MATCH_CHAR :
+ case Constants::BAM_CIGAR_MISMATCH_CHAR :
+ case Constants::BAM_CIGAR_REFSKIP_CHAR :
+ case Constants::BAM_CIGAR_SEQMATCH_CHAR :
+ alignEnd += op.Length;
+ break;
- // unknown tag type
- default:
- cerr << "BamAlignment ERROR: unknown element type encountered: "
- << elementType << endl;
- return false;
- }
+ // insertions advance the end position only when the caller asked for padded coordinates (@usePadded)
+ case Constants::BAM_CIGAR_INS_CHAR :
+ if ( usePadded )
+ alignEnd += op.Length;
+ break;
- // get number of elements
- int32_t numElements;
- memcpy(&numElements, pTagData, sizeof(int32_t));
- pTagData += 4;
- destination.clear();
- destination.reserve(numElements);
-
- // read in elements
- uint32_t value;
- for ( int i = 0 ; i < numElements; ++i ) {
- memcpy(&value, pTagData, sizeof(uint32_t));
- pTagData += sizeof(uint32_t);
- destination.push_back(value);
+ // every other CIGAR char leaves the end position untouched
+ default :
+ break;
+ }
}
- // return success
- return false;
+ // half-open -> closed: step back by one so the returned coordinate is itself part of the alignment
+ if ( closedInterval )
+ alignEnd -= 1;
+ // return result (equals Position when CigarData is empty and closedInterval is false)
+ return alignEnd;
}
-/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector<float>& destination) const
- \brief Retrieves the numeric array data associated with a BAM tag
+/*! \fn std::string BamAlignment::GetErrorString(void) const
+ \brief Returns a human-readable description of the last error that occurred
- \param tag 2-character tag name
- \param destination destination for retrieved data
+ This method allows elimination of STDERR pollution. Developers of client code
+ may choose how the messages are displayed to the user, if at all.
- \return \c true if found
+ \return error description
*/
-bool BamAlignment::GetTag(const std::string& tag, std::vector<float>& destination) const {
-
- // make sure tag data exists
- if ( SupportData.HasCoreOnly || TagData.empty() )
- return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLength = TagData.size();
- unsigned int numBytesParsed = 0;
-
- // return false if tag not found
- if ( !Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
- return false;
+std::string BamAlignment::GetErrorString(void) const {
+ return ErrorString; // returned by value: the caller gets its own copy of the stored message
+}
- // check that tag is array type
- const char tagType = *(pTagData - 1);
- if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) {
- cerr << "BamAlignment ERROR: Cannot store non-array data from tag: "
- << tag << " in array destination" << endl;
- return false;
- }
+/*! \fn bool BamAlignment::GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions, std::vector<int>& genomePositions, bool usePadded = false) const
+ \brief Identifies if an alignment has a soft clip. If so, identifies the
+ sizes of the soft clips, as well as their positions in the read and reference.
+
+ \note The three output vectors are appended to (one entry per soft clip, in CIGAR order);
+ they are not cleared first, so pass in empty vectors to receive only this alignment's clips.
- // calculate length of each element in tag's array
- const char elementType = *pTagData;
- ++pTagData;
- int elementLength = 0;
- switch ( elementType ) {
- case (Constants::BAM_TAG_TYPE_ASCII) :
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- elementLength = sizeof(uint8_t);
- break;
+ \param[out] clipSizes vector of the sizes of each soft clip in the alignment
+ \param[out] readPositions vector of the 0-based read locations of each soft clip in the alignment.
+ These positions are basically indexes within the read, not genomic positions.
+ \param[out] genomePositions vector of the 0-based genome locations of each soft clip in the alignment
+ \param[in] usePadded inserted bases affect reported position. Default is false, so that
+ reported position stays 'sync-ed' with reference coordinates.
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- elementLength = sizeof(uint16_t);
- break;
+ \return \c true if any soft clips were found in the alignment
+*/
+bool BamAlignment::GetSoftClips(vector<int>& clipSizes,
+ vector<int>& readPositions,
+ vector<int>& genomePositions,
+ bool usePadded) const
+{
+ // initialize positions & flags
+ int refPosition = Position;
+ int readPosition = 0;
+ bool softClipFound = false;
+ bool firstCigarOp = true;
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- elementLength = sizeof(uint32_t);
- break;
+ // iterate over cigar operations
+ vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
+ vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
+ for ( ; cigarIter != cigarEnd; ++cigarIter) {
+ const CigarOp& op = (*cigarIter);
+
+ switch ( op.Type ) {
+
+ // increase both read & genome positions on CIGAR chars [DMXN=]
+ case Constants::BAM_CIGAR_DEL_CHAR :
+ case Constants::BAM_CIGAR_MATCH_CHAR :
+ case Constants::BAM_CIGAR_MISMATCH_CHAR :
+ case Constants::BAM_CIGAR_REFSKIP_CHAR :
+ case Constants::BAM_CIGAR_SEQMATCH_CHAR :
+ refPosition += op.Length;
+ readPosition += op.Length;
+ break;
- // unsupported type for float destination (var-length data)
- case (Constants::BAM_TAG_TYPE_STRING) :
- case (Constants::BAM_TAG_TYPE_HEX) :
- case (Constants::BAM_TAG_TYPE_ARRAY) :
- cerr << "BamAlignment ERROR: array element type: " << elementType
- << " cannot be stored in float value" << endl;
- return false;
+ // increase read position on insertion, genome position only if @usePadded is true
+ case Constants::BAM_CIGAR_INS_CHAR :
+ readPosition += op.Length;
+ if ( usePadded )
+ refPosition += op.Length;
+ break;
- // unknown tag type
- default:
- cerr << "BamAlignment ERROR: unknown element type encountered: "
- << elementType << endl;
- return false;
- }
+ case Constants::BAM_CIGAR_SOFTCLIP_CHAR :
+
+ softClipFound = true;
+
+ //////////////////////////////////////////////////////////////////////////////
+ // if we are dealing with the *first* CIGAR operation
+ // for this alignment, we increment the read position so that
+ // the read and genome position of the clip are referring to the same base.
+ // For example, in the alignment below, the ref position would be 4, yet
+ // the read position would be 0. Thus, to "sync" the two,
+ // we need to increment the read position by the length of the
+ // soft clip.
+ // Read: ATCGTTTCGTCCCTGC
+ // Ref: GGGATTTCGTCCCTGC
+ // Cigar: SSSSMMMMMMMMMMMM
+ //
+ // NOTE: This only needs to be done if the soft clip is the _first_ CIGAR op.
+ //////////////////////////////////////////////////////////////////////////////
+ if ( firstCigarOp )
+ readPosition += op.Length;
+
+ // track the soft clip's size, read position, and genome position
+ clipSizes.push_back(op.Length);
+ readPositions.push_back(readPosition);
+ genomePositions.push_back(refPosition);
+ // explicit break: do not rely on falling through into 'default'
+ break;
+
+ // any other CIGAR operations have no effect
+ default :
+ break;
+ }
- // get number of elements
- int32_t numElements;
- memcpy(&numElements, pTagData, sizeof(int32_t));
- pTagData += 4;
- destination.clear();
- destination.reserve(numElements);
-
- // read in elements
- float value;
- for ( int i = 0 ; i < numElements; ++i ) {
- memcpy(&value, pTagData, sizeof(float));
- pTagData += sizeof(float);
- destination.push_back(value);
+ // clear our "first pass" flag
+ firstCigarOp = false;
}
- // return success
- return false;
+ // return whether any soft clips found
+ return softClipFound;
}
/*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const
\brief Retrieves the BAM tag type-code associated with requested tag name.
- \param tag 2-character tag name
- \param type destination for the retrieved (1-character) tag type
+ \param[in] tag 2-character tag name
+ \param[out] type retrieved (1-character) type-code
\return \c true if found
\sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
*/
bool BamAlignment::GetTagType(const std::string& tag, char& type) const {
- // make sure tag data exists
- if ( SupportData.HasCoreOnly || TagData.empty() )
+ // skip if alignment is core-only (fails without touching 'type' or the error string)
+ if ( SupportData.HasCoreOnly ) {
+ // TODO: set error string?
+ return false;
+ }
+
+ // skip if no tags present (again a silent failure)
+ if ( TagData.empty() ) {
+ // TODO: set error string?
return false;
+ }
// localize the tag data
char* pTagData = (char*)TagData.data();
const unsigned int tagDataLength = TagData.size();
unsigned int numBytesParsed = 0;
- // lookup tag
- if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
-
- // retrieve tag type code
- type = *(pTagData - 1);
-
- // validate that type is a proper BAM tag type
- switch (type) {
- case (Constants::BAM_TAG_TYPE_ASCII) :
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_STRING) :
- case (Constants::BAM_TAG_TYPE_HEX) :
- case (Constants::BAM_TAG_TYPE_ARRAY) :
- return true;
-
- // unknown tag type
- default:
- cerr << "BamAlignment ERROR: unknown tag type encountered: "
- << type << endl;
- return false;
- }
+ // if tag not found, return failure ('type' is left untouched)
+ if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ){
+ // TODO: set error string?
+ return false;
+ }
+
+ // on a match FindTag leaves pTagData just past the 3-byte tag header (2-char name + 1-char type code),
+ // so the type code is the byte immediately before the cursor; retrieve it, then validate it
+ type = *(pTagData - 1);
+ switch ( type ) {
+ case (Constants::BAM_TAG_TYPE_ASCII) :
+ case (Constants::BAM_TAG_TYPE_INT8) :
+ case (Constants::BAM_TAG_TYPE_UINT8) :
+ case (Constants::BAM_TAG_TYPE_INT16) :
+ case (Constants::BAM_TAG_TYPE_UINT16) :
+ case (Constants::BAM_TAG_TYPE_INT32) :
+ case (Constants::BAM_TAG_TYPE_UINT32) :
+ case (Constants::BAM_TAG_TYPE_FLOAT) :
+ case (Constants::BAM_TAG_TYPE_STRING) :
+ case (Constants::BAM_TAG_TYPE_HEX) :
+ case (Constants::BAM_TAG_TYPE_ARRAY) :
+ return true;
+
+ // unknown tag type: record the error; note that 'type' already holds the invalid code at this point
+ default:
+ const string message = string("invalid tag type: ") + type;
+ SetErrorString("BamAlignment::GetTagType", message);
+ return false;
}
-
- // tag not found, return failure
- return false;
}
/*! \fn bool BamAlignment::HasTag(const std::string& tag) const
\brief Returns true if alignment has a record for requested tag.
- \param tag 2-character tag name
+
+ \param[in] tag 2-character tag name
\return \c true if alignment has a record for tag
*/
bool BamAlignment::HasTag(const std::string& tag) const {
unsigned int numBytesParsed = 0;
// if result of tag lookup
- return Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed);
+ return FindTag(tag, pTagData, tagDataLength, numBytesParsed);
}
/*! \fn bool BamAlignment::IsDuplicate(void) const
return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 );
}
-/*! \fn bool BamAlignment::RemoveTag(const std::string& tag)
+/*! \fn bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const
+ \internal
+
+ Checks that tag name & type strings are expected sizes.
+
+ \param[in] tag BAM tag name
+ \param[in] type BAM tag type-code
+ \return \c true if both input strings are valid sizes
+*/
+bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const {
+    // each string must match the fixed width defined for it in Constants
+    const bool tagSizeOk  = ( tag.size()  == Constants::BAM_TAG_TAGSIZE );
+    const bool typeSizeOk = ( type.size() == Constants::BAM_TAG_TYPESIZE );
+    return ( tagSizeOk && typeSizeOk );
+}
+
+/*! \fn void BamAlignment::RemoveTag(const std::string& tag)
\brief Removes field from BAM tags.
- \return \c true if tag was removed successfully (or didn't exist before)
+ \param[in] tag 2-character name of field to remove
*/
-bool BamAlignment::RemoveTag(const std::string& tag) {
+void BamAlignment::RemoveTag(const std::string& tag) {
- // skip if no tag data available
- if ( SupportData.HasCoreOnly || TagData.empty() )
- return false;
+ // if char data not populated (core-only alignment), build it first so TagData can be inspected
+ if ( SupportData.HasCoreOnly )
+ BuildCharData();
+
+ // skip if no tags available - nothing to remove
+ if ( TagData.empty() )
+ return;
// localize the tag data
char* pOriginalTagData = (char*)TagData.data();
const unsigned int originalTagDataLength = TagData.size();
unsigned int newTagDataLength = 0;
unsigned int numBytesParsed = 0;
-
- // if tag found
- if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
-
- char newTagData[originalTagDataLength];
-
- // copy original tag data up til desired tag
- pTagData -= 3;
- numBytesParsed -= 3;
- const unsigned int beginningTagDataLength = numBytesParsed;
- newTagDataLength += beginningTagDataLength;
- memcpy(newTagData, pOriginalTagData, numBytesParsed);
-
- // skip to next tag (if tag for removal is last, return true)
- const char* pTagStorageType = pTagData + 2;
- pTagData += 3;
- numBytesParsed += 3;
- if ( !Internal::SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) )
- return true;
-
- // copy everything from current tag (the next one after tag for removal) to end
+
+ // skip if tag not found - TagData is left as-is
+ if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+ return;
+
+ // otherwise, remove it: assemble the surviving bytes in a scratch buffer as large as the original data
+ RaiiBuffer newTagData(originalTagDataLength);
+
+ // FindTag stops just past the matched tag's 3-byte header (2-char name + 1-char type code);
+ // rewind those 3 bytes so the cursor sits at the start of the tag, then copy everything before it
+ pTagData -= 3;
+ numBytesParsed -= 3;
+ const unsigned int beginningTagDataLength = numBytesParsed;
+ newTagDataLength += beginningTagDataLength;
+ memcpy(newTagData.Buffer, pOriginalTagData, numBytesParsed);
+
+ // attempt to skip to next tag: step over the header again, then over the tag's value
+ const char* pTagStorageType = pTagData + 2;
+ pTagData += 3;
+ numBytesParsed += 3;
+ if ( SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) {
+
+ // squeeze remaining tag data: append whatever follows the removed tag (possibly zero bytes)
const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
- memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength );
-
- // save new tag data
- TagData.assign(newTagData, beginningTagDataLength + endTagDataLength);
- return true;
+ memcpy(newTagData.Buffer + beginningTagDataLength, pTagData, endTagDataLength );
+
+ // save modified tag data in alignment (if the skip above failed, TagData is left unmodified)
+ TagData.assign(newTagData.Buffer, beginningTagDataLength + endTagDataLength);
}
-
- // tag not found, no removal - return failure
- return false;
+}
+
+/*! \fn void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const
+ \internal
+
+ Sets a formatted error string for this alignment.
+
+ \param[in] where class/method where error occurred
+ \param[in] what description of error
+*/
+void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const {
+    // compose "<where>: <what>" in a temporary first, then store it in a single assignment
+    std::string formatted(where);
+    formatted.append(": ");
+    formatted.append(what);
+    ErrorString = formatted;
}
/*! \fn void BamAlignment::SetIsDuplicate(bool ok)
else AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_UNMAPPED;
}
-/*! \fn void BamAlignment::SetIsMateUnmapped(bool ok)
- \brief Complement of using SetIsMateMapped().
- \deprecated For sake of symmetry with the query methods
- \sa IsMateMapped(), SetIsMateMapped()
-*/
-void BamAlignment::SetIsMateUnmapped(bool ok) {
- SetIsMateMapped(!ok);
-}
-
/*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok)
\brief Sets "alignment's mate mapped to reverse strand" flag to \a ok.
*/
else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_REVERSE_STRAND;
}
-/*! \fn void BamAlignment::SetIsSecondaryAlignment(bool ok)
- \brief Complement of using SetIsPrimaryAlignment().
- \deprecated For sake of symmetry with the query methods
- \sa IsPrimaryAlignment(), SetIsPrimaryAlignment()
-*/
-void BamAlignment::SetIsSecondaryAlignment(bool ok) {
- SetIsPrimaryAlignment(!ok);
-}
-
/*! \fn void BamAlignment::SetIsSecondMate(bool ok)
\brief Sets "alignment is second mate on read" flag to \a ok.
*/
else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_2;
}
-/*! \fn void BamAlignment::SetIsUnmapped(bool ok)
- \brief Complement of using SetIsMapped().
- \deprecated For sake of symmetry with the query methods
- \sa IsMapped(), SetIsMapped()
+/*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const
+ \internal
+
+ Moves to next available tag in tag data string
+
+ \param[in] storageType BAM tag type-code that determines how far to move cursor
+ \param[in,out] pTagData pointer to current position (cursor) in tag string
+ \param[in,out] numBytesParsed report of how many bytes were parsed (cumulatively)
+
+ \return \c true if storageType (and, for binary arrays, the array's element type) was a recognized BAM tag type
+
+ \post \a pTagData will point to the byte where the next tag data begins.
+ \a numBytesParsed will correspond to the cursor's position in the full TagData string.
*/
-void BamAlignment::SetIsUnmapped(bool ok) {
- SetIsMapped(!ok);
+bool BamAlignment::SkipToNextTag(const char storageType,
+                                 char*& pTagData,
+                                 unsigned int& numBytesParsed) const
+{
+    // Steps the cursor over the VALUE of a single tag. The callers in this file
+    // (FindTag, RemoveTag) invoke it with pTagData already positioned just past
+    // the tag's 3-byte header (2-char name + 1-char type code).
+    //
+    // pTagData and numBytesParsed are always advanced by the same amount.
+    // Returns false - after recording a message via SetErrorString() - when the
+    // type code (or an array's element type or element count) is invalid; in that
+    // case the cursor may already have been partially advanced.
+    switch (storageType) {
+
+        // fixed-width, 1-byte values
+        case (Constants::BAM_TAG_TYPE_ASCII) :
+        case (Constants::BAM_TAG_TYPE_INT8)  :
+        case (Constants::BAM_TAG_TYPE_UINT8) :
+            ++numBytesParsed;
+            ++pTagData;
+            break;
+
+        // fixed-width, 2-byte values
+        case (Constants::BAM_TAG_TYPE_INT16)  :
+        case (Constants::BAM_TAG_TYPE_UINT16) :
+            numBytesParsed += sizeof(uint16_t);
+            pTagData       += sizeof(uint16_t);
+            break;
+
+        // fixed-width, 4-byte values
+        case (Constants::BAM_TAG_TYPE_FLOAT)  :
+        case (Constants::BAM_TAG_TYPE_INT32)  :
+        case (Constants::BAM_TAG_TYPE_UINT32) :
+            numBytesParsed += sizeof(uint32_t);
+            pTagData       += sizeof(uint32_t);
+            break;
+
+        // null-terminated values: scan forward to the terminator
+        // (no length is available here, so this relies on the data being terminated)
+        case (Constants::BAM_TAG_TYPE_STRING) :
+        case (Constants::BAM_TAG_TYPE_HEX)    :
+            while( *pTagData ) {
+                ++numBytesParsed;
+                ++pTagData;
+            }
+            // increment for null-terminator
+            ++numBytesParsed;
+            ++pTagData;
+            break;
+
+        // binary array: 1-byte element type, 4-byte element count, then the elements
+        case (Constants::BAM_TAG_TYPE_ARRAY) :
+
+        {
+            // read array type
+            const char arrayType = *pTagData;
+            ++numBytesParsed;
+            ++pTagData;
+
+            // read number of elements
+            int32_t numElements;
+            memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped, if needed
+            numBytesParsed += sizeof(uint32_t);
+            pTagData       += sizeof(uint32_t);
+
+            // a count that reads as negative can only come from corrupt data; using it
+            // would yield a negative skip and move the cursor BACKWARDS, so reject it
+            if ( numElements < 0 ) {
+                const string message = "invalid binary array length";
+                SetErrorString("BamAlignment::SkipToNextTag", message);
+                return false;
+            }
+
+            // calculate number of bytes to skip
+            int bytesToSkip = 0;
+            switch (arrayType) {
+                case (Constants::BAM_TAG_TYPE_INT8)  :
+                case (Constants::BAM_TAG_TYPE_UINT8) :
+                    bytesToSkip = numElements;
+                    break;
+                case (Constants::BAM_TAG_TYPE_INT16)  :
+                case (Constants::BAM_TAG_TYPE_UINT16) :
+                    bytesToSkip = numElements*sizeof(uint16_t);
+                    break;
+                case (Constants::BAM_TAG_TYPE_FLOAT)  :
+                case (Constants::BAM_TAG_TYPE_INT32)  :
+                case (Constants::BAM_TAG_TYPE_UINT32) :
+                    bytesToSkip = numElements*sizeof(uint32_t);
+                    break;
+                default:
+                    const string message = string("invalid binary array type: ") + arrayType;
+                    SetErrorString("BamAlignment::SkipToNextTag", message);
+                    return false;
+            }
+
+            // skip binary array contents
+            numBytesParsed += bytesToSkip;
+            pTagData       += bytesToSkip;
+            break;
+        }
+
+        // anything else is not a BAM tag type code
+        default:
+            const string message = string("invalid tag type: ") + storageType;
+            SetErrorString("BamAlignment::SkipToNextTag", message);
+            return false;
+    }
+
+    // if we get here, tag skipped OK - return success
+    return true;
}