From: derek Date: Tue, 19 Apr 2011 23:01:01 +0000 (-0400) Subject: Brought API up to compliance with recent SAM Format Spec (v1.4-r962) X-Git-Url: https://git.donarmstrong.com/?p=bamtools.git;a=commitdiff_plain;h=cdf4bbcb19025398d429035fe672661a8c8d1a80 Brought API up to compliance with recent SAM Format Spec (v1.4-r962) * Added support for new "binary array" tag type * Added support for '=' and 'X' CIGAR ops * Added support for multiple PG entries in header * Added support for new RG fields --- diff --git a/docs/Doxyfile b/docs/Doxyfile index 4ce1cc2..ea3a6a8 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -189,7 +189,7 @@ TAB_SIZE = 1 # will result in a user-defined paragraph with heading "Side Effects:". # You can put \n's in the value part of an alias to insert newlines. -ALIASES = +ALIASES = samSpecURL=http://samtools.sourceforge.net/SAM1.pdf # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C # sources only. Doxygen will then generate output that is more tailored for C. diff --git a/src/api/BamAlignment.cpp b/src/api/BamAlignment.cpp index 162e195..7cff4b0 100644 --- a/src/api/BamAlignment.cpp +++ b/src/api/BamAlignment.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 19 April 2011 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** @@ -17,6 +17,7 @@ using namespace BamTools; #include #include #include +#include #include #include using namespace std; @@ -68,20 +69,20 @@ bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numByt case (Constants::BAM_TAG_TYPE_INT16) : case (Constants::BAM_TAG_TYPE_UINT16) : - numBytesParsed += 2; - pTagData += 2; + numBytesParsed += sizeof(uint16_t); + pTagData += sizeof(uint16_t); break; case (Constants::BAM_TAG_TYPE_FLOAT) : case (Constants::BAM_TAG_TYPE_INT32) : case (Constants::BAM_TAG_TYPE_UINT32) : - numBytesParsed += 4; - pTagData += 4; + numBytesParsed += sizeof(uint32_t); + pTagData += sizeof(uint32_t); break; case (Constants::BAM_TAG_TYPE_STRING) : case (Constants::BAM_TAG_TYPE_HEX) : - while(*pTagData) { + while( *pTagData ) { ++numBytesParsed; ++pTagData; } @@ -90,9 +91,51 @@ bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numByt ++pTagData; break; + case (Constants::BAM_TAG_TYPE_ARRAY) : + + { + // read array type + const char arrayType = *pTagData; + ++numBytesParsed; + ++pTagData; + + // read number of elements + int32_t numElements; + memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped if necessary + numBytesParsed += sizeof(uint32_t); + pTagData += sizeof(uint32_t); + + // calculate number of bytes to skip + int bytesToSkip = 0; + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + bytesToSkip = numElements; + break; + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + bytesToSkip = numElements*sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + bytesToSkip = numElements*sizeof(uint32_t); + break; + default: + cerr << "BamAlignment ERROR: unknown binary array type encountered: " + << arrayType << endl; + return false; + } + + // skip binary array contents + numBytesParsed += bytesToSkip; + pTagData += bytesToSkip; + break; + } + default: - // error case - fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", storageType); + cerr << "BamAlignment ERROR: unknown tag type encountered" + << storageType << endl; return false; } @@ -249,9 +292,7 @@ BamAlignment::~BamAlignment(void) { } \param value string data to store \return \c true if the \b new tag was added successfully - - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. */ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) { @@ -261,8 +302,11 @@ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const // validate tag/type size & that type is OK for string value if ( !Internal::IsValidSize(tag, type) ) return false; if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING && - type.at(0) != Constants::BAM_TAG_TYPE_HEX ) + type.at(0) != Constants::BAM_TAG_TYPE_HEX + ) + { return false; + } // localize the tag data char* pTagData = (char*)TagData.data(); @@ -297,12 +341,11 @@ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. \param tag 2-character tag name - \param type 1-character tag type (must NOT be "f", "Z", or "H") + \param type 1-character tag type (must NOT be "f", "Z", "H", or "B") \param value unsigned int data to store \return \c true if the \b new tag was added successfully - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. */ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) { @@ -311,10 +354,14 @@ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const // validate tag/type size & that type is OK for uint32_t value if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT || + if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT || type.at(0) == Constants::BAM_TAG_TYPE_STRING || - type.at(0) == Constants::BAM_TAG_TYPE_HEX ) + type.at(0) == Constants::BAM_TAG_TYPE_HEX || + type.at(0) == Constants::BAM_TAG_TYPE_ARRAY + ) + { return false; + } // localize the tag data char* pTagData = (char*)TagData.data(); @@ -354,13 +401,11 @@ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. \param tag 2-character tag name - \param type 1-character tag type (must NOT be "f", "Z", or "H") + \param type 1-character tag type (must NOT be "f", "Z", "H", or "B") \param value signed int data to store \return \c true if the \b new tag was added successfully - - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. */ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) { return AddTag(tag, type, (const uint32_t&)value); @@ -372,13 +417,11 @@ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. \param tag 2-character tag name - \param type 1-character tag type (must NOT be "Z" or "H") + \param type 1-character tag type (must NOT be "Z", "H", or "B") \param value float data to store \return \c true if the \b new tag was added successfully - - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. */ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) { @@ -388,8 +431,12 @@ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const // validate tag/type size & that type is OK for float value if ( !Internal::IsValidSize(tag, type) ) return false; if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING || - type.at(0) == Constants::BAM_TAG_TYPE_HEX ) + type.at(0) == Constants::BAM_TAG_TYPE_HEX || + type.at(0) == Constants::BAM_TAG_TYPE_ARRAY + ) + { return false; + } // localize the tag data char* pTagData = (char*)TagData.data(); @@ -423,6 +470,461 @@ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const return true; } +/*! \fn bool AddTag(const std::string& tag, const std::vector& values); + \brief Adds a numeric array field to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param tag 2-character tag name + \param values vector of uint8_t values to store + + \return \c true if the \b new tag was added successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // check for valid tag length + if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // build new tag's base information + char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; + memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); + newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; + newTagBase[3] = Constants::BAM_TAG_TYPE_UINT8; + + // add number of array elements to newTagBase + const int32_t numElements = values.size(); + memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); + + // copy current TagData string to temp buffer, leaving room for new tag's contents + const int newTagDataLength = tagDataLength + + Constants::BAM_TAG_ARRAYBASE_SIZE + + numElements*sizeof(uint8_t); + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term + + // write newTagBase (removes old null term) + strcat(originalTagData + tagDataLength, (const char*)newTagBase); + + // add vector elements to tag + int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; + for ( int i = 0 ; i < numElements; ++i ) { + const uint8_t value = values.at(i); + memcpy(originalTagData + elementsBeginOffset + i*sizeof(uint8_t), + &value, sizeof(uint8_t)); + } + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +/*! \fn bool AddTag(const std::string& tag, const std::vector& values); + \brief Adds a numeric array field to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param tag 2-character tag name + \param values vector of int8_t values to store + + \return \c true if the \b new tag was added successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // check for valid tag length + if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // build new tag's base information + char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; + memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); + newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; + newTagBase[3] = Constants::BAM_TAG_TYPE_INT8; + + // add number of array elements to newTagBase + const int32_t numElements = values.size(); + memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); + + // copy current TagData string to temp buffer, leaving room for new tag's contents + const int newTagDataLength = tagDataLength + + Constants::BAM_TAG_ARRAYBASE_SIZE + + numElements*sizeof(int8_t); + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term + + // write newTagBase (removes old null term) + strcat(originalTagData + tagDataLength, (const char*)newTagBase); + + // add vector elements to tag + int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; + for ( int i = 0 ; i < numElements; ++i ) { + const int8_t value = values.at(i); + memcpy(originalTagData + elementsBeginOffset + i*sizeof(int8_t), + &value, sizeof(int8_t)); + } + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +/*! \fn bool AddTag(const std::string& tag, const std::vector& values); + \brief Adds a numeric array field to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param tag 2-character tag name + \param values vector of uint16_t values to store + + \return \c true if the \b new tag was added successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // check for valid tag length + if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // build new tag's base information + char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; + memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); + newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; + newTagBase[3] = Constants::BAM_TAG_TYPE_UINT16; + + // add number of array elements to newTagBase + const int32_t numElements = values.size(); + memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); + + // copy current TagData string to temp buffer, leaving room for new tag's contents + const int newTagDataLength = tagDataLength + + Constants::BAM_TAG_ARRAYBASE_SIZE + + numElements*sizeof(uint16_t); + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term + + // write newTagBase (removes old null term) + strcat(originalTagData + tagDataLength, (const char*)newTagBase); + + // add vector elements to tag + int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; + for ( int i = 0 ; i < numElements; ++i ) { + const uint16_t value = values.at(i); + memcpy(originalTagData + elementsBeginOffset + i*sizeof(uint16_t), + &value, sizeof(uint16_t)); + } + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +/*! \fn bool AddTag(const std::string& tag, const std::vector& values); + \brief Adds a numeric array field to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param tag 2-character tag name + \param values vector of int16_t values to store + + \return \c true if the \b new tag was added successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // check for valid tag length + if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // build new tag's base information + char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; + memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); + newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; + newTagBase[3] = Constants::BAM_TAG_TYPE_INT16; + + // add number of array elements to newTagBase + const int32_t numElements = values.size(); + memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); + + // copy current TagData string to temp buffer, leaving room for new tag's contents + const int newTagDataLength = tagDataLength + + Constants::BAM_TAG_ARRAYBASE_SIZE + + numElements*sizeof(int16_t); + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term + + // write newTagBase (removes old null term) + strcat(originalTagData + tagDataLength, (const char*)newTagBase); + + // add vector elements to tag + int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; + for ( int i = 0 ; i < numElements; ++i ) { + const int16_t value = values.at(i); + memcpy(originalTagData + elementsBeginOffset + i*sizeof(int16_t), + &value, sizeof(int16_t)); + } + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +/*! \fn bool AddTag(const std::string& tag, const std::vector& values); + \brief Adds a numeric array field to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param tag 2-character tag name + \param values vector of uint32_t values to store + + \return \c true if the \b new tag was added successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // check for valid tag length + if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // build new tag's base information + char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; + memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); + newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; + newTagBase[3] = Constants::BAM_TAG_TYPE_UINT32; + + // add number of array elements to newTagBase + const int32_t numElements = values.size(); + memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); + + // copy current TagData string to temp buffer, leaving room for new tag's contents + const int newTagDataLength = tagDataLength + + Constants::BAM_TAG_ARRAYBASE_SIZE + + numElements*sizeof(uint32_t); + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term + + // write newTagBase (removes old null term) + strcat(originalTagData + tagDataLength, (const char*)newTagBase); + + // add vector elements to tag + int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; + for ( int i = 0 ; i < numElements; ++i ) { + const uint32_t value = values.at(i); + memcpy(originalTagData + elementsBeginOffset + i*sizeof(uint32_t), + &value, sizeof(uint32_t)); + } + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +/*! \fn bool AddTag(const std::string& tag, const std::vector& values); + \brief Adds a numeric array field to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param tag 2-character tag name + \param values vector of int32_t values to store + + \return \c true if the \b new tag was added successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // check for valid tag length + if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // build new tag's base information + char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; + memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); + newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; + newTagBase[3] = Constants::BAM_TAG_TYPE_INT32; + + // add number of array elements to newTagBase + const int32_t numElements = values.size(); + memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); + + // copy current TagData string to temp buffer, leaving room for new tag's contents + const int newTagDataLength = tagDataLength + + Constants::BAM_TAG_ARRAYBASE_SIZE + + numElements*sizeof(int32_t); + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term + + // write newTagBase (removes old null term) + strcat(originalTagData + tagDataLength, (const char*)newTagBase); + + // add vector elements to tag + int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; + for ( int i = 0 ; i < numElements; ++i ) { + const int32_t value = values.at(i); + memcpy(originalTagData + elementsBeginOffset + i*sizeof(int32_t), + &value, sizeof(int32_t)); + } + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +/*! \fn bool AddTag(const std::string& tag, const std::vector& values); + \brief Adds a numeric array field to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param tag 2-character tag name + \param values vector of float values to store + + \return \c true if the \b new tag was added successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::AddTag(const std::string& tag, const std::vector& values) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // check for valid tag length + if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // build new tag's base information + char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; + memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE ); + newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; + newTagBase[3] = Constants::BAM_TAG_TYPE_FLOAT; + + // add number of array elements to newTagBase + const int32_t numElements = values.size(); + memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); + + // copy current TagData string to temp buffer, leaving room for new tag's contents + const int newTagDataLength = tagDataLength + + Constants::BAM_TAG_ARRAYBASE_SIZE + + numElements*sizeof(float); + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term + + // write newTagBase (removes old null term) + strcat(originalTagData + tagDataLength, (const char*)newTagBase); + + // add vector elements to tag + int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; + for ( int i = 0 ; i < numElements; ++i ) { + const float value = values.at(i); + memcpy(originalTagData + elementsBeginOffset + i*sizeof(float), + &value, sizeof(float)); + } + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + /*! \fn bool BamAlignment::BuildCharData(void) \brief Populates alignment string fields (read name, bases, qualities, tag data). @@ -504,9 +1006,11 @@ bool BamAlignment::BuildCharData(void) { switch (op.Type) { - // for 'M', 'I' - write bases - case (Constants::BAM_CIGAR_MATCH_CHAR) : - case (Constants::BAM_CIGAR_INS_CHAR) : + // for 'M', 'I', '=', 'X' - write bases + case (Constants::BAM_CIGAR_MATCH_CHAR) : + case (Constants::BAM_CIGAR_INS_CHAR) : + case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : + case (Constants::BAM_CIGAR_MISMATCH_CHAR) : AlignedBases.append(QueryBases.substr(k, op.Length)); // fall through @@ -537,7 +1041,8 @@ bool BamAlignment::BuildCharData(void) { // shouldn't get here default: - fprintf(stderr, "BamAlignment ERROR: invalid CIGAR operation type: %c\n", op.Type); + cerr << "BamAlignment ERROR: invalid CIGAR operation type: " + << op.Type << endl; exit(1); } } @@ -559,6 +1064,7 @@ bool BamAlignment::BuildCharData(void) { case(Constants::BAM_TAG_TYPE_ASCII) : case(Constants::BAM_TAG_TYPE_INT8) : case(Constants::BAM_TAG_TYPE_UINT8) : + // no endian swapping necessary for single-byte data ++i; break; @@ -578,14 +1084,59 @@ bool BamAlignment::BuildCharData(void) { case(Constants::BAM_TAG_TYPE_HEX) : case(Constants::BAM_TAG_TYPE_STRING) : // no endian swapping necessary for hex-string/string data - while (tagData[i]) { ++i; } + while ( tagData[i] ) + ++i; // increment one more for null terminator ++i; break; + case(Constants::BAM_TAG_TYPE_ARRAY) : + + { + // read array type + const char arrayType = tagData[i]; + ++i; + + // swap endian-ness of number of elements in place, then retrieve for loop + BamTools::SwapEndian_32p(&tagData[i]); + int32_t numElements; + memcpy(&numElements, &tagData[i], sizeof(uint32_t)); + i += sizeof(uint32_t); + + // swap endian-ness of array elements + for ( int j = 0; j < numElements; ++j ) { + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + // no endian-swapping necessary + ++i; + break; + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + default: + // error case + cerr << "BamAlignment ERROR: unknown binary array type encountered: " + << arrayType << endl; + return false; + } + } + + break; + } + // shouldn't get here default : - fprintf(stderr, "BamAlignment ERROR: invalid tag value type: %c\n", type); + cerr << "BamAlignment ERROR: invalid tag value type: " + << type << endl; exit(1); } } @@ -615,8 +1166,7 @@ bool BamAlignment::BuildCharData(void) { \return \c true if the tag was modified/created successfully \sa BamAlignment::RemoveTag() - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. */ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) { @@ -637,7 +1187,7 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons unsigned int newTagDataLength = 0; unsigned int numBytesParsed = 0; - // if tag found, store data in readGroup, return success + // if tag found if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { // make sure array is more than big enough @@ -648,7 +1198,7 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons newTagDataLength += beginningTagDataLength; memcpy(newTagData, pOriginalTagData, numBytesParsed); - // copy new VALUE in place of current tag data + // copy new @value in place of current tag data const unsigned int dataLength = strlen(value.c_str()); memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 ); @@ -681,14 +1231,13 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons If \a tag does not exist, a new entry is created. \param tag 2-character tag name - \param type 1-character tag type (must NOT be "f", "Z", or "H") + \param type 1-character tag type (must NOT be "f", "Z", "H", or "B") \param value unsigned integer data to store \return \c true if the tag was modified/created successfully \sa BamAlignment::RemoveTag() - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. */ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) { @@ -697,10 +1246,14 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons // validate tag/type size & that type is OK for uint32_t value if ( !Internal::IsValidSize(tag, type) ) return false; - if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT || + if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT || type.at(0) == Constants::BAM_TAG_TYPE_STRING || - type.at(0) == Constants::BAM_TAG_TYPE_HEX ) + type.at(0) == Constants::BAM_TAG_TYPE_HEX || + type.at(0) == Constants::BAM_TAG_TYPE_ARRAY + ) + { return false; + } // localize the tag data char* pOriginalTagData = (char*)TagData.data(); @@ -710,7 +1263,7 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons unsigned int newTagDataLength = 0; unsigned int numBytesParsed = 0; - // if tag found, store data in readGroup, return success + // if tag found if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { // make sure array is more than big enough @@ -721,7 +1274,7 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons newTagDataLength += beginningTagDataLength; memcpy(newTagData, pOriginalTagData, numBytesParsed); - // copy new VALUE in place of current tag data + // copy new @value in place of current tag data union { uint32_t value; char valueBuffer[sizeof(uint32_t)]; } un; un.value = value; memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(uint32_t)); @@ -755,14 +1308,13 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons If \a tag does not exist, a new entry is created. \param tag 2-character tag name - \param type 1-character tag type (must NOT be "f", "Z", or "H") + \param type 1-character tag type (must NOT be "f", "Z", "H", or "B") \param value signed integer data to store \return \c true if the tag was modified/created successfully \sa BamAlignment::RemoveTag() - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. */ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) { return EditTag(tag, type, (const uint32_t&)value); @@ -774,14 +1326,13 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons If \a tag does not exist, a new entry is created. \param tag 2-character tag name - \param type 1-character tag type (must NOT be "Z" or "H") + \param type 1-character tag type (must NOT be "Z", "H", or "B") \param value float data to store \return \c true if the tag was modified/created successfully \sa BamAlignment::RemoveTag() - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. */ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) { @@ -791,8 +1342,12 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons // validate tag/type size & that type is OK for float value if ( !Internal::IsValidSize(tag, type) ) return false; if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING || - type.at(0) == Constants::BAM_TAG_TYPE_HEX ) + type.at(0) == Constants::BAM_TAG_TYPE_HEX || + type.at(0) == Constants::BAM_TAG_TYPE_ARRAY + ) + { return false; + } // localize the tag data char* pOriginalTagData = (char*)TagData.data(); @@ -802,7 +1357,7 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons unsigned int newTagDataLength = 0; unsigned int numBytesParsed = 0; - // if tag found, store data in readGroup, return success + // if tag found if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { // make sure array is more than big enough @@ -813,7 +1368,7 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons newTagDataLength += beginningTagDataLength; memcpy(newTagData, pOriginalTagData, numBytesParsed); - // copy new VALUE in place of current tag data + // copy new @value in place of current tag data union { float value; char valueBuffer[sizeof(float)]; } un; un.value = value; memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); @@ -841,6 +1396,181 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons else return AddTag(tag, type, value); } +/*! \fn bool EditTag(const std::string& tag, const std::vector& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of uint8_t values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! \fn bool EditTag(const std::string& tag, const std::vector& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of int8_t values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! \fn bool EditTag(const std::string& tag, const std::vector& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of uint16_t values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! \fn bool EditTag(const std::string& tag, const std::vector& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of int16_t values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! \fn bool EditTag(const std::string& tag, const std::vector& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of uint32_t values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! \fn bool EditTag(const std::string& tag, const std::vector& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of int32_t values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! \fn bool EditTag(const std::string& tag, const std::vector& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of float values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + /*! \fn bool BamAlignment::GetEditDistance(uint32_t& editDistance) const \brief Retrieves value of edit distance tag ("NM"). @@ -929,7 +1659,7 @@ bool BamAlignment::GetTag(const std::string& tag, std::string& destination) cons const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - // if tag found, store data in readGroup, return success + // if tag found if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { const unsigned int dataLength = strlen(pTagData); destination.clear(); @@ -961,7 +1691,7 @@ bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - // if tag found, determine data byte-length, store data in readGroup, return success + // if tag found if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { // determine data byte-length @@ -992,12 +1722,15 @@ bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { case (Constants::BAM_TAG_TYPE_FLOAT) : case (Constants::BAM_TAG_TYPE_STRING) : case (Constants::BAM_TAG_TYPE_HEX) : - fprintf(stderr, "BamAlignment ERROR: cannot store tag of type %c in integer destination\n", type); + case (Constants::BAM_TAG_TYPE_ARRAY) : + cerr << "BamAlignment ERROR: cannot store tag of type " << type + << " in integer destination" << endl; return false; // unknown tag type default: - fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", type); + cerr << "BamAlignment ERROR: unknown tag type encountered: " + << type << endl; return false; } @@ -1042,7 +1775,7 @@ bool BamAlignment::GetTag(const std::string& tag, float& destination) const { const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - // if tag found, determine data byte-length, store data in readGroup, return success + // if tag found if ( Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { // determine data byte-length @@ -1073,12 +1806,15 @@ bool BamAlignment::GetTag(const std::string& tag, float& destination) const { // unsupported type (var-length strings) case (Constants::BAM_TAG_TYPE_STRING) : case (Constants::BAM_TAG_TYPE_HEX) : - fprintf(stderr, "BamAlignment ERROR: cannot store tag of type %c in float destination\n", type); + case (Constants::BAM_TAG_TYPE_ARRAY) : + cerr << "BamAlignment ERROR: cannot store tag of type " << type + << " in float destination" << endl; return false; // unknown tag type default: - fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", type); + cerr << "BamAlignment ERROR: unknown tag type encountered: " + << type << endl; return false; } @@ -1092,6 +1828,268 @@ bool BamAlignment::GetTag(const std::string& tag, float& destination) const { return false; } +/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const + \brief Retrieves the numeric array data associated with a BAM tag + + \param tag 2-character tag name + \param destination destination for retrieved data + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // return false if tag not found + if ( !Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // check that tag is array type + const char tagType = *(pTagData - 1); + if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) { + cerr << "BamAlignment ERROR: Cannot store non-array data from tag: " + << tag << " in array destination" << endl; + return false; + } + + // calculate length of each element in tag's array + const char elementType = *pTagData; + ++pTagData; + int elementLength = 0; + switch ( elementType ) { + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + elementLength = sizeof(uint8_t); + break; + + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + elementLength = sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + elementLength = sizeof(uint32_t); + break; + + // unsupported type for integer destination (float or var-length data) + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + case (Constants::BAM_TAG_TYPE_ARRAY) : + cerr << "BamAlignment ERROR: array element type: " << elementType + << " cannot be stored in integer value" << endl; + return false; + + // unknown tag type + default: + cerr << "BamAlignment ERROR: unknown element type encountered: " + << elementType << endl; + return false; + } + + // get number of elements + int32_t numElements; + memcpy(&numElements, pTagData, sizeof(int32_t)); + pTagData += 4; + destination.clear(); + destination.reserve(numElements); + + // read in elements + uint32_t value; + for ( int i = 0 ; i < numElements; ++i ) { + memcpy(&value, pTagData, sizeof(uint32_t)); + pTagData += sizeof(uint32_t); + destination.push_back(value); + } + + // return success + return false; +} + +/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const + \brief Retrieves the numeric array data associated with a BAM tag + + \param tag 2-character tag name + \param destination destination for retrieved data + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // return false if tag not found + if ( !Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // check that tag is array type + const char tagType = *(pTagData - 1); + if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) { + cerr << "BamAlignment ERROR: Cannot store non-array data from tag: " + << tag << " in array destination" << endl; + return false; + } + + // calculate length of each element in tag's array + const char elementType = *pTagData; + ++pTagData; + int elementLength = 0; + switch ( elementType ) { + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + elementLength = sizeof(uint8_t); + break; + + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + elementLength = sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + elementLength = sizeof(uint32_t); + break; + + // unsupported type for integer destination (float or var-length data) + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + case (Constants::BAM_TAG_TYPE_ARRAY) : + cerr << "BamAlignment ERROR: array element type: " << elementType + << " cannot be stored in integer value" << endl; + return false; + + // unknown tag type + default: + cerr << "BamAlignment ERROR: unknown element type encountered: " + << elementType << endl; + return false; + } + + // get number of elements + int32_t numElements; + memcpy(&numElements, pTagData, sizeof(int32_t)); + pTagData += 4; + destination.clear(); + destination.reserve(numElements); + + // read in elements + int32_t value; + for ( int i = 0 ; i < numElements; ++i ) { + memcpy(&value, pTagData, sizeof(int32_t)); + pTagData += sizeof(int32_t); + destination.push_back(value); + } + + // return success + return false; + +} + +/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const + \brief Retrieves the numeric array data associated with a BAM tag + + \param tag 2-character tag name + \param destination destination for retrieved data + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // return false if tag not found + if ( !Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // check that tag is array type + const char tagType = *(pTagData - 1); + if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) { + cerr << "BamAlignment ERROR: Cannot store non-array data from tag: " + << tag << " in array destination" << endl; + return false; + } + + // calculate length of each element in tag's array + const char elementType = *pTagData; + ++pTagData; + int elementLength = 0; + switch ( elementType ) { + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + elementLength = sizeof(uint8_t); + break; + + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + elementLength = sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + case (Constants::BAM_TAG_TYPE_FLOAT) : + elementLength = sizeof(uint32_t); + break; + + // unsupported type for float destination (var-length data) + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + case (Constants::BAM_TAG_TYPE_ARRAY) : + cerr << "BamAlignment ERROR: array element type: " << elementType + << " cannot be stored in float value" << endl; + return false; + + // unknown tag type + default: + cerr << "BamAlignment ERROR: unknown element type encountered: " + << elementType << endl; + return false; + } + + // get number of elements + int32_t numElements; + memcpy(&numElements, pTagData, sizeof(int32_t)); + pTagData += 4; + destination.clear(); + destination.reserve(numElements); + + // read in elements + float value; + for ( int i = 0 ; i < numElements; ++i ) { + memcpy(&value, pTagData, sizeof(float)); + pTagData += sizeof(float); + destination.push_back(value); + } + + // return success + return false; +} + /*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const \brief Retrieves the BAM tag type-code associated with requested tag name. @@ -1099,9 +2097,7 @@ bool BamAlignment::GetTag(const std::string& tag, float& destination) const { \param type destination for the retrieved (1-character) tag type \return \c true if found - - \sa http://samtools.sourceforge.net/SAM-1.3.pdf - for more details on reserved tag names, supported tag types, etc. + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. */ bool BamAlignment::GetTagType(const std::string& tag, char& type) const { @@ -1132,11 +2128,13 @@ bool BamAlignment::GetTagType(const std::string& tag, char& type) const { case (Constants::BAM_TAG_TYPE_FLOAT) : case (Constants::BAM_TAG_TYPE_STRING) : case (Constants::BAM_TAG_TYPE_HEX) : + case (Constants::BAM_TAG_TYPE_ARRAY) : return true; // unknown tag type default: - fprintf(stderr, "BamAlignment ERROR: unknown tag type encountered: [%c]\n", type); + cerr << "BamAlignment ERROR: unknown tag type encountered: " + << type << endl; return false; } } @@ -1145,6 +2143,26 @@ bool BamAlignment::GetTagType(const std::string& tag, char& type) const { return false; } +/*! \fn bool BamAlignment::HasTag(const std::string& tag) const + \brief Returns true if alignment has a record for requested tag. + \param tag 2-character tag name + \return \c true if alignment has a record for tag +*/ +bool BamAlignment::HasTag(const std::string& tag) const { + + // return false if no tag data present + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data for lookup + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if result of tag lookup + return Internal::FindTag(tag, pTagData, tagDataLength, numBytesParsed); +} + /*! \fn bool BamAlignment::IsDuplicate(void) const \return \c true if this read is a PCR duplicate */ @@ -1229,8 +2247,7 @@ bool BamAlignment::IsSecondMate(void) const { */ bool BamAlignment::RemoveTag(const std::string& tag) { - // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed - // also, return false if no data present to remove + // skip if no tag data available if ( SupportData.HasCoreOnly || TagData.empty() ) return false; @@ -1241,7 +2258,7 @@ bool BamAlignment::RemoveTag(const std::string& tag) { unsigned int newTagDataLength = 0; unsigned int numBytesParsed = 0; - // if tag found, store data in readGroup, return success + // if tag found if ( Internal::FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { char newTagData[originalTagDataLength]; diff --git a/src/api/BamAlignment.h b/src/api/BamAlignment.h index fb54b1a..1123378 100644 --- a/src/api/BamAlignment.h +++ b/src/api/BamAlignment.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 19 April 2011 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** @@ -84,19 +84,42 @@ struct API_EXPORT BamAlignment { bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); bool AddTag(const std::string& tag, const std::string& type, const float& value); - + + // adds a "binary array" tag + bool AddTag(const std::string& tag, const std::vector& values); + bool AddTag(const std::string& tag, const std::vector& values); + bool AddTag(const std::string& tag, const std::vector& values); + bool AddTag(const std::string& tag, const std::vector& values); + bool AddTag(const std::string& tag, const std::vector& values); + bool AddTag(const std::string& tag, const std::vector& values); + bool AddTag(const std::string& tag, const std::vector& values); + // edits a tag bool EditTag(const std::string& tag, const std::string& type, const std::string& value); bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); bool EditTag(const std::string& tag, const std::string& type, const float& value); + // edits a "binary array" tag + bool EditTag(const std::string& tag, const std::vector& values); + bool EditTag(const std::string& tag, const std::vector& values); + bool EditTag(const std::string& tag, const std::vector& values); + bool EditTag(const std::string& tag, const std::vector& values); + bool EditTag(const std::string& tag, const std::vector& values); + bool EditTag(const std::string& tag, const std::vector& values); + bool EditTag(const std::string& tag, const std::vector& values); + // retrieves data for a tag bool GetTag(const std::string& tag, std::string& destination) const; bool GetTag(const std::string& tag, uint32_t& destination) const; bool GetTag(const std::string& tag, int32_t& destination) const; bool GetTag(const std::string& tag, float& destination) const; + // retrieves data for a "binary array" tag + bool GetTag(const std::string& tag, std::vector& destination) const; + bool GetTag(const std::string& tag, std::vector& destination) const; + bool GetTag(const std::string& tag, std::vector& destination) const; + // retrieves the BAM tag-type character for a tag bool GetTagType(const std::string& tag, char& type) const; @@ -104,6 +127,9 @@ struct API_EXPORT BamAlignment { bool GetEditDistance(uint32_t& editDistance) const; // retrieves value of "NM" tag bool GetReadGroup(std::string& readGroup) const; // retrieves value of "RG" tag + // returns true if alignment has a record for this tag name + bool HasTag(const std::string& tag) const; + // removes a tag bool RemoveTag(const std::string& tag); @@ -127,7 +153,7 @@ struct API_EXPORT BamAlignment { uint16_t Bin; // BAM (standard) index bin number for this alignment uint16_t MapQuality; // mapping quality score uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify) - std::vector CigarData; // CIGAR operations for this alignment + std::vector CigarData; // CIGAR operations for this alignment int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned int32_t MatePosition; // position (0-based) where alignment's mate starts int32_t InsertSize; // mate-pair insert size diff --git a/src/api/BamConstants.h b/src/api/BamConstants.h index 58b0e49..e433c8e 100644 --- a/src/api/BamConstants.h +++ b/src/api/BamConstants.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 5 April 2011 (DB) +// Last modified: 19 April 2011 (DB) // --------------------------------------------------------------------------- // Provides basic constants for handling BAM files. // *************************************************************************** @@ -31,20 +31,20 @@ const int BAM_CORE_SIZE = 32; const int BAM_CORE_BUFFER_SIZE = 8; // BAM alignment flags -const int BAM_ALIGNMENT_PAIRED = 1; -const int BAM_ALIGNMENT_PROPER_PAIR = 2; -const int BAM_ALIGNMENT_UNMAPPED = 4; -const int BAM_ALIGNMENT_MATE_UNMAPPED = 8; -const int BAM_ALIGNMENT_REVERSE_STRAND = 16; -const int BAM_ALIGNMENT_MATE_REVERSE_STRAND = 32; -const int BAM_ALIGNMENT_READ_1 = 64; -const int BAM_ALIGNMENT_READ_2 = 128; -const int BAM_ALIGNMENT_SECONDARY = 256; -const int BAM_ALIGNMENT_QC_FAILED = 512; -const int BAM_ALIGNMENT_DUPLICATE = 1024; +const int BAM_ALIGNMENT_PAIRED = 0x0001; +const int BAM_ALIGNMENT_PROPER_PAIR = 0x0002; +const int BAM_ALIGNMENT_UNMAPPED = 0x0004; +const int BAM_ALIGNMENT_MATE_UNMAPPED = 0x0008; +const int BAM_ALIGNMENT_REVERSE_STRAND = 0x0010; +const int BAM_ALIGNMENT_MATE_REVERSE_STRAND = 0x0020; +const int BAM_ALIGNMENT_READ_1 = 0x0040; +const int BAM_ALIGNMENT_READ_2 = 0x0080; +const int BAM_ALIGNMENT_SECONDARY = 0x0100; +const int BAM_ALIGNMENT_QC_FAILED = 0x0200; +const int BAM_ALIGNMENT_DUPLICATE = 0x0400; // CIGAR constants -const char* const BAM_CIGAR_LOOKUP = "MIDNSHP"; +const char* const BAM_CIGAR_LOOKUP = "MIDNSHP=X"; const int BAM_CIGAR_MATCH = 0; const int BAM_CIGAR_INS = 1; const int BAM_CIGAR_DEL = 2; @@ -52,8 +52,8 @@ const int BAM_CIGAR_REFSKIP = 3; const int BAM_CIGAR_SOFTCLIP = 4; const int BAM_CIGAR_HARDCLIP = 5; const int BAM_CIGAR_PAD = 6; - -// TODO: Add support for 'X' and '=' ops +const int BAM_CIGAR_SEQMATCH = 7; +const int BAM_CIGAR_MISMATCH = 8; const char BAM_CIGAR_MATCH_CHAR = 'M'; const char BAM_CIGAR_INS_CHAR = 'I'; @@ -62,6 +62,8 @@ const char BAM_CIGAR_REFSKIP_CHAR = 'N'; const char BAM_CIGAR_SOFTCLIP_CHAR = 'S'; const char BAM_CIGAR_HARDCLIP_CHAR = 'H'; const char BAM_CIGAR_PAD_CHAR = 'P'; +const char BAM_CIGAR_SEQMATCH_CHAR = '='; +const char BAM_CIGAR_MISMATCH_CHAR = 'X'; const int BAM_CIGAR_SHIFT = 4; const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); @@ -77,9 +79,11 @@ const char BAM_TAG_TYPE_INT32 = 'I'; const char BAM_TAG_TYPE_FLOAT = 'f'; const char BAM_TAG_TYPE_STRING = 'Z'; const char BAM_TAG_TYPE_HEX = 'H'; +const char BAM_TAG_TYPE_ARRAY = 'B'; const size_t BAM_TAG_TAGSIZE = 2; const size_t BAM_TAG_TYPESIZE = 1; +const int BAM_TAG_ARRAYBASE_SIZE = 8; // DNA bases const char* const BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index a09d9a1..0c58755 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -18,6 +18,8 @@ set( BamToolsAPISources BamReader.cpp BamWriter.cpp SamHeader.cpp + SamProgram.cpp + SamProgramChain.cpp SamReadGroup.cpp SamReadGroupDictionary.cpp SamSequence.cpp @@ -38,7 +40,7 @@ set( BamToolsAPISources # create main BamTools API shared library add_library( BamTools SHARED ${BamToolsAPISources} ) -set_target_properties( BamTools PROPERTIES SOVERSION "1.0.0" ) +set_target_properties( BamTools PROPERTIES SOVERSION "1.0.1" ) set_target_properties( BamTools PROPERTIES OUTPUT_NAME "bamtools" ) # create main BamTools API static library @@ -67,6 +69,8 @@ ExportHeader(APIHeaders BamReader.h ${ApiIncludeDir}) ExportHeader(APIHeaders BamWriter.h ${ApiIncludeDir}) ExportHeader(APIHeaders SamConstants.h ${ApiIncludeDir}) ExportHeader(APIHeaders SamHeader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamProgram.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamProgramChain.h ${ApiIncludeDir}) ExportHeader(APIHeaders SamReadGroup.h ${ApiIncludeDir}) ExportHeader(APIHeaders SamReadGroupDictionary.h ${ApiIncludeDir}) ExportHeader(APIHeaders SamSequence.h ${ApiIncludeDir}) diff --git a/src/api/SamConstants.h b/src/api/SamConstants.h index 6412b3d..d345920 100644 --- a/src/api/SamConstants.h +++ b/src/api/SamConstants.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 19 April 2011 (DB) // --------------------------------------------------------------------------- // Provides constants for SAM header // *************************************************************************** @@ -17,6 +17,7 @@ namespace BamTools { namespace Constants { +// basic char constants used in SAM format const char SAM_COLON = ':'; const char SAM_EQUAL = '='; const char SAM_PERIOD = '.'; @@ -32,30 +33,35 @@ const std::string SAM_HD_GROUPORDER_TAG = "GO"; // SQ entries const std::string SAM_SQ_BEGIN_TOKEN = "@SQ"; -const std::string SAM_SQ_NAME_TAG = "SN"; -const std::string SAM_SQ_LENGTH_TAG = "LN"; const std::string SAM_SQ_ASSEMBLYID_TAG = "AS"; -const std::string SAM_SQ_URI_TAG = "UR"; const std::string SAM_SQ_CHECKSUM_TAG = "M5"; +const std::string SAM_SQ_LENGTH_TAG = "LN"; +const std::string SAM_SQ_NAME_TAG = "SN"; const std::string SAM_SQ_SPECIES_TAG = "SP"; +const std::string SAM_SQ_URI_TAG = "UR"; // RG entries const std::string SAM_RG_BEGIN_TOKEN = "@RG"; +const std::string SAM_RG_DESCRIPTION_TAG = "DS"; +const std::string SAM_RG_FLOWORDER_TAG = "FO"; const std::string SAM_RG_ID_TAG = "ID"; -const std::string SAM_RG_SAMPLE_TAG = "SM"; +const std::string SAM_RG_KEYSEQUENCE_TAG = "KS"; const std::string SAM_RG_LIBRARY_TAG = "LB"; -const std::string SAM_RG_DESCRIPTION_TAG = "DS"; const std::string SAM_RG_PLATFORMUNIT_TAG = "PU"; const std::string SAM_RG_PREDICTEDINSERTSIZE_TAG = "PI"; -const std::string SAM_RG_SEQCENTER_TAG = "CN"; const std::string SAM_RG_PRODUCTIONDATE_TAG = "DT"; +const std::string SAM_RG_PROGRAM_TAG = "PG"; +const std::string SAM_RG_SAMPLE_TAG = "SM"; +const std::string SAM_RG_SEQCENTER_TAG = "CN"; const std::string SAM_RG_SEQTECHNOLOGY_TAG = "PL"; // PG entries -const std::string SAM_PG_BEGIN_TOKEN = "@PG"; -const std::string SAM_PG_NAME_TAG = "ID"; -const std::string SAM_PG_VERSION_TAG = "VN"; -const std::string SAM_PG_COMMANDLINE_TAG = "CL"; +const std::string SAM_PG_BEGIN_TOKEN = "@PG"; +const std::string SAM_PG_COMMANDLINE_TAG = "CL"; +const std::string SAM_PG_ID_TAG = "ID"; +const std::string SAM_PG_NAME_TAG = "PN"; +const std::string SAM_PG_PREVIOUSPROGRAM_TAG = "PP"; +const std::string SAM_PG_VERSION_TAG = "VN"; // CO entries const std::string SAM_CO_BEGIN_TOKEN = "@CO"; @@ -63,6 +69,7 @@ const std::string SAM_CO_BEGIN_TOKEN = "@CO"; // HD:SO values const std::string SAM_HD_SORTORDER_COORDINATE = "coordinate"; const std::string SAM_HD_SORTORDER_QUERYNAME = "queryname"; +const std::string SAM_HD_SORTORDER_UNKNOWN = "unknown"; const std::string SAM_HD_SORTORDER_UNSORTED = "unsorted"; // HD:GO values @@ -74,29 +81,14 @@ const std::string SAM_HD_GROUPORDER_REFERENCE = "reference"; const unsigned int SAM_SQ_LENGTH_MIN = 1; const unsigned int SAM_SQ_LENGTH_MAX = 536870911; // 2^29 - 1 -// -------------- // RG:PL values - -// 454 -const std::string SAM_RG_SEQTECHNOLOGY_454 = "454"; -const std::string SAM_RG_SEQTECHNOLOGY_LS454_LOWER = "ls454"; -const std::string SAM_RG_SEQTECHNOLOGY_LS454_UPPER = "LS454"; - -// Helicos -const std::string SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER = "helicos"; -const std::string SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER = "HELICOS"; - -// Illumina -const std::string SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER = "illumina"; -const std::string SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER = "ILLUMINA"; - -// PacBio -const std::string SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER = "pacbio"; -const std::string SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER = "PACBIO"; - -// SOLiD -const std::string SAM_RG_SEQTECHNOLOGY_SOLID_LOWER = "solid"; -const std::string SAM_RG_SEQTECHNOLOGY_SOLID_UPPER = "SOLID"; +const std::string SAM_RG_SEQTECHNOLOGY_CAPILLARY = "CAPILLARY"; +const std::string SAM_RG_SEQTECHNOLOGY_HELICOS = "HELICOS"; +const std::string SAM_RG_SEQTECHNOLOGY_ILLUMINA = "ILLUMINA"; +const std::string SAM_RG_SEQTECHNOLOGY_IONTORRENT = "IONTORRENT"; +const std::string SAM_RG_SEQTECHNOLOGY_LS454 = "LS454"; +const std::string SAM_RG_SEQTECHNOLOGY_PACBIO = "PACBIO"; +const std::string SAM_RG_SEQTECHNOLOGY_SOLID = "SOLID"; } // namespace Constants } // namespace BamTools diff --git a/src/api/SamHeader.cpp b/src/api/SamHeader.cpp index 7a69162..9104978 100644 --- a/src/api/SamHeader.cpp +++ b/src/api/SamHeader.cpp @@ -3,11 +3,12 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 4 March 2011 (DB) +// Last modified: 19 April 2011 (DB) // --------------------------------------------------------------------------- // Provides direct read/write access to the SAM header data fields. // *************************************************************************** +#include #include #include #include @@ -21,10 +22,12 @@ using namespace std; Provides direct read/write access to the SAM header data fields. - \sa http://samtools.sourceforge.net/SAM-1.3.pdf + \sa \samSpecURL */ /*! \var SamHeader::Version \brief corresponds to \@HD VN:\ + + Required for valid SAM header, if @HD record is present. */ /*! \var SamHeader::SortOrder \brief corresponds to \@HD SO:\ @@ -58,11 +61,8 @@ using namespace std; */ SamHeader::SamHeader(const std::string& headerText) : Version("") - , SortOrder("") + , SortOrder(Constants::SAM_HD_SORTORDER_UNKNOWN) , GroupOrder("") - , ProgramName("") - , ProgramVersion("") - , ProgramCommandLine("") { SamFormatParser parser(*this); parser.Parse(headerText); @@ -77,9 +77,7 @@ SamHeader::SamHeader(const SamHeader& other) , GroupOrder(other.GroupOrder) , Sequences(other.Sequences) , ReadGroups(other.ReadGroups) - , ProgramName(other.ProgramName) - , ProgramVersion(other.ProgramVersion) - , ProgramCommandLine(other.ProgramCommandLine) + , Programs(other.Programs) { } /*! \fn SamHeader::~SamHeader(void) @@ -96,9 +94,7 @@ void SamHeader::Clear(void) { GroupOrder.clear(); Sequences.Clear(); ReadGroups.Clear(); - ProgramName.clear(); - ProgramVersion.clear(); - ProgramCommandLine.clear(); + Programs.Clear(); Comments.clear(); } @@ -137,25 +133,11 @@ bool SamHeader::HasReadGroups(void) const { return (!ReadGroups.IsEmpty()); } -/*! \fn bool SamHeader::HasProgramName(void) const - \brief Returns \c true if header contains \@PG ID:\ -*/ -bool SamHeader::HasProgramName(void) const { - return (!ProgramName.empty()); -} - -/*! \fn bool SamHeader::HasProgramVersion(void) const - \brief Returns \c true if header contains \@PG VN:\ -*/ -bool SamHeader::HasProgramVersion(void) const { - return (!ProgramVersion.empty()); -} - -/*! \fn bool SamHeader::HasProgramCommandLine(void) const - \brief Returns \c true if header contains \@PG CL:\ +/*! \fn bool SamHeader::HasPrograms(void) const + \brief Returns \c true if header contains any \@PG entries */ -bool SamHeader::HasProgramCommandLine(void) const { - return (!ProgramCommandLine.empty()); +bool SamHeader::HasPrograms(void) const { + return (!Programs.IsEmpty()); } /*! \fn bool SamHeader::HasComments(void) const diff --git a/src/api/SamHeader.h b/src/api/SamHeader.h index 3ff4946..5c7a101 100644 --- a/src/api/SamHeader.h +++ b/src/api/SamHeader.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 4 March 2011 (DB) +// Last modified: 18 April 2011 (DB) // --------------------------------------------------------------------------- // Provides direct read/write access to the SAM header data fields. // *************************************************************************** @@ -12,6 +12,7 @@ #define SAM_HEADER_H #include +#include #include #include #include @@ -38,17 +39,17 @@ struct API_EXPORT SamHeader { bool HasGroupOrder(void) const; // returns true if header contains group order entry bool HasSequences(void) const; // returns true if header contains any sequence entries bool HasReadGroups(void) const; // returns true if header contains any read group entries - bool HasProgramName(void) const; // returns true if header contains program name - bool HasProgramVersion(void) const; // returns true if header contains program version - bool HasProgramCommandLine(void) const; // returns true if header contains program command line + bool HasPrograms(void) const; // returns true if header contains any program record entries bool HasComments(void) const; // returns true if header contains comments + // -------------- // data members + // -------------- // header metadata (@HD line) - std::string Version; // VN: - std::string SortOrder; // SO: - std::string GroupOrder; // GO: + std::string Version; // VN: *Required for valid SAM header, if @HD record is present* + std::string SortOrder; // SO: + std::string GroupOrder; // GO: // header sequences (@SQ entries) SamSequenceDictionary Sequences; @@ -57,9 +58,7 @@ struct API_EXPORT SamHeader { SamReadGroupDictionary ReadGroups; // header program data (@PG entries) - std::string ProgramName; // ID: - std::string ProgramVersion; // VN: - std::string ProgramCommandLine; // CL: + SamProgramChain Programs; // header comments (@CO entries) std::vector Comments; diff --git a/src/api/SamProgram.cpp b/src/api/SamProgram.cpp new file mode 100644 index 0000000..b89a10b --- /dev/null +++ b/src/api/SamProgram.cpp @@ -0,0 +1,140 @@ +// *************************************************************************** +// SamProgram.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM header program records. +// *************************************************************************** + +#include +using namespace BamTools; +using namespace std; + +/*! \struct BamTools::SamProgram + \brief Represents a SAM program record. + + Provides direct read/write access to the SAM header program records. + + \sa \samSpecURL +*/ +/*! \var SamProgram::CommandLine + \brief corresponds to \@PG CL:\ +*/ +/*! \var SamProgram::ID + \brief corresponds to \@PG ID:\ + + Required for valid SAM header. +*/ +/*! \var SamProgram::Name + \brief corresponds to \@PG PN:\ +*/ +/*! \var SamProgram::PreviousProgramID + \brief corresponds to \@PG PP:\ +*/ +/*! \var SamProgram::Version + \brief corresponds to \@PG VN:\ +*/ +/*! \var SamProgram::NextProgramID + \internal + Holds ID of the "next" program record in a SamProgramChain +*/ + +/*! \fn SamProgram::SamProgram(void) + \brief default constructor +*/ +SamProgram::SamProgram(void) + : CommandLine("") + , ID("") + , Name("") + , PreviousProgramID("") + , Version("") + , NextProgramID("") +{ } + +/*! \fn SamProgram::SamProgram(const std::string& id) + \brief constructs program record with \a id + + \param id desired program record ID +*/ +SamProgram::SamProgram(const std::string& id) + : CommandLine("") + , ID(id) + , Name("") + , PreviousProgramID("") + , Version("") + , NextProgramID("") +{ } + +/*! \fn SamProgram::SamProgram(const SamProgram& other) + \brief copy constructor +*/ +SamProgram::SamProgram(const SamProgram& other) + : CommandLine(other.CommandLine) + , ID(other.ID) + , Name(other.Name) + , PreviousProgramID(other.PreviousProgramID) + , Version(other.Version) + , NextProgramID(other.NextProgramID) +{ } + +/*! \fn SamProgram::~SamProgram(void) + \brief destructor +*/ +SamProgram::~SamProgram(void) { } + +/*! \fn void SamProgram::Clear(void) + \brief Clears all data fields. +*/ +void SamProgram::Clear(void) { + CommandLine.clear(); + ID.clear(); + Name.clear(); + PreviousProgramID.clear(); + Version.clear(); + NextProgramID.clear(); +} + +/*! \fn bool SamProgram::HasCommandLine(void) const + \brief Returns \c true if program record contains \@PG: CL:\ +*/ +bool SamProgram::HasCommandLine(void) const { + return (!CommandLine.empty()); +} + +/*! \fn bool SamProgram::HasID(void) const + \brief Returns \c true if program record contains \@PG: ID:\ +*/ +bool SamProgram::HasID(void) const { + return (!ID.empty()); +} + +/*! \fn bool SamProgram::HasName(void) const + \brief Returns \c true if program record contains \@PG: PN:\ +*/ +bool SamProgram::HasName(void) const { + return (!Name.empty()); +} + +/*! \fn bool SamProgram::HasNextProgramID(void) const + \internal + \return true if program has a "next" record in a SamProgramChain +*/ +bool SamProgram::HasNextProgramID(void) const { + return (!NextProgramID.empty()); +} + +/*! \fn bool SamProgram::HasPreviousProgramID(void) const + \brief Returns \c true if program record contains \@PG: PP:\ +*/ +bool SamProgram::HasPreviousProgramID(void) const { + return (!PreviousProgramID.empty()); +} + +/*! \fn bool SamProgram::HasVersion(void) const + \brief Returns \c true if program record contains \@PG: VN:\ +*/ +bool SamProgram::HasVersion(void) const { + return (!Version.empty()); +} diff --git a/src/api/SamProgram.h b/src/api/SamProgram.h new file mode 100644 index 0000000..3c89059 --- /dev/null +++ b/src/api/SamProgram.h @@ -0,0 +1,62 @@ +// *************************************************************************** +// SamProgram.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM header program records. +// *************************************************************************** + +#ifndef SAM_PROGRAM_H +#define SAM_PROGRAM_H + +#include "api/api_global.h" +#include + +namespace BamTools { + +class SamProgramChain; + +struct API_EXPORT SamProgram { + + // ctor & dtor + SamProgram(void); + SamProgram(const std::string& id); + SamProgram(const SamProgram& other); + ~SamProgram(void); + + // query/modify entire program record + void Clear(void); // clears all data fields + + // convenience query methods + bool HasCommandLine(void) const; // returns true if program record has a command line entry + bool HasID(void) const; // returns true if program record has an ID + bool HasName(void) const; // returns true if program record has a name + bool HasPreviousProgramID(void) const; // returns true if program record has a 'previous program ID' + bool HasVersion(void) const; // returns true if program record has a version + + // data members + std::string CommandLine; // CL: + std::string ID; // ID: *Required for valid SAM header* + std::string Name; // PN: + std::string PreviousProgramID; // PP: + std::string Version; // VN: + + // internal (non-standard) methods & fields + private: + bool HasNextProgramID(void) const; + std::string NextProgramID; + friend class BamTools::SamProgramChain; +}; + +/*! \fn bool operator==(const SamProgram& lhs, const SamProgram& rhs) + \brief tests equality by comparing program IDs +*/ +API_EXPORT inline bool operator==(const SamProgram& lhs, const SamProgram& rhs) { + return lhs.ID == rhs.ID; +} + +} // namespace BamTools + +#endif // SAM_PROGRAM_H diff --git a/src/api/SamProgramChain.cpp b/src/api/SamProgramChain.cpp new file mode 100644 index 0000000..66b7f07 --- /dev/null +++ b/src/api/SamProgramChain.cpp @@ -0,0 +1,352 @@ +// *************************************************************************** +// SamProgramChain.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides methods for operating on a SamProgram record "chain" +// *************************************************************************** + +#include +using namespace BamTools; + +#include +#include +#include +using namespace std; + +/*! \class BamTools::SamProgramChain + \brief Sorted container "chain" of SamProgram records. + + Provides methods for operating on a collection of SamProgram records. + + N.B. - Underlying container is *NOT* ordered by linkage, but by order of + appearance in SamHeader and subsequent Add() calls. Using the current + iterators will not allow you to step through the header's program history. + Instead use First()/Last() to access oldest/newest records, respectively. +*/ + +/*! \fn SamProgramChain::SamProgramChain(void) + \brief constructor +*/ +SamProgramChain::SamProgramChain(void) { } + +/*! \fn SamProgramChain::SamProgramChain(const SamProgramChain& other) + \brief copy constructor +*/ +SamProgramChain::SamProgramChain(const SamProgramChain& other) + : m_data(other.m_data) +{ } + +/*! \fn SamProgramChain::~SamProgramChain(void) + \brief destructor +*/ +SamProgramChain::~SamProgramChain(void) { } + +/*! \fn void SamProgramChain::Add(SamProgram& program) + \brief Appends a program to program chain. + + Duplicate entries are silently discarded. + + N.B. - Underlying container is *NOT* ordered by linkage, but by order of + appearance in SamHeader and subsequent Add() calls. Using the current + iterators will not allow you to step through the header's program history. + Instead use First()/Last() to access oldest/newest records, respectively. + + \param program entry to be appended +*/ +void SamProgramChain::Add(SamProgram& program) { + + // ignore duplicated records + if ( Contains(program) ) + return; + + // if other programs already in chain, try to find the "next" record + // tries to match another record's PPID with @program's ID + if ( !IsEmpty() ) + program.NextProgramID = NextIdFor(program.ID); + + // store program record + m_data.push_back(program); +} + +/*! \fn void SamProgramChain::Add(const std::vector& programs) + \brief Appends a batch of programs to the end of the chain. + + This is an overloaded function. + + \param programs batch of program records to append + \sa Add() +*/ +void SamProgramChain::Add(std::vector& programs) { + vector::iterator pgIter = programs.begin(); + vector::iterator pgEnd = programs.end(); + for ( ; pgIter != pgEnd; ++pgIter ) + Add(*pgIter); +} + +/*! \fn SamProgramIterator SamProgramChain::Begin(void) + \return an STL iterator pointing to the first (oldest) program record + \sa ConstBegin(), End(), First() +*/ +SamProgramIterator SamProgramChain::Begin(void) { + return m_data.begin(); +} + +/*! \fn SamProgramConstIterator SamProgramChain::Begin(void) const + \return an STL const_iterator pointing to the first (oldest) program record + + This is an overloaded function. + + \sa ConstBegin(), End(), First() +*/ +SamProgramConstIterator SamProgramChain::Begin(void) const { + return m_data.begin(); +} + +/*! \fn void SamProgramChain::Clear(void) + \brief Clears all program records. +*/ +void SamProgramChain::Clear(void) { + m_data.clear(); +} + +/*! \fn SamProgramConstIterator SamProgramChain::ConstBegin(void) const + \return an STL const_iterator pointing to the first (oldest) program record + \sa Begin(), ConstEnd(), First() +*/ +SamProgramConstIterator SamProgramChain::ConstBegin(void) const { + return m_data.begin(); +} + +/*! \fn SamProgramConstIterator SamProgramChain::ConstEnd(void) const + \return an STL const_iterator pointing to the imaginary entry after the last (newest) program record + \sa ConstBegin(), End(), Last() +*/ +SamProgramConstIterator SamProgramChain::ConstEnd(void) const { + return m_data.end(); +} + +/*! \fn bool SamProgramChain::Contains(const SamProgram& program) const + \brief Returns true if chains has this program record (matching on ID). + + This is an overloaded function. + + \param program SamProgram to search for + \return \c true if chain contains program (matching on ID) +*/ +bool SamProgramChain::Contains(const SamProgram& program) const { + return Contains(program.ID); +} + +/*! \fn bool SamProgramChain::Contains(const std::string& programId) const + \brief Returns true if chains has a program record with this ID + \param programId search for program matching this ID + \return \c true if chain contains a program record with this ID +*/ +bool SamProgramChain::Contains(const std::string& programId) const { + return ( IndexOf(programId) != (int)m_data.size() ); +} + +/*! \fn SamProgramIterator SamProgramChain::End(void) + \return an STL iterator pointing to the imaginary entry after the last (newest) program record + \sa Begin(), ConstEnd(), Last() +*/ +SamProgramIterator SamProgramChain::End(void) { + return m_data.end(); +} + +/*! \fn SamProgramConstIterator SamProgramChain::End(void) const + \return an STL const_iterator pointing to the imaginary entry after the last (newest) program record + + This is an overloaded function. + + \sa Begin(), ConstEnd(), Last() +*/ +SamProgramConstIterator SamProgramChain::End(void) const { + return m_data.end(); +} + +/*! \fn SamProgram& SamProgramChain::First(void) + \brief Fetches first (oldest) record in the chain. + + N.B. - This function will fail if the chain is empty. If this is possible, + check the result of IsEmpty() before calling this function. + + \return a modifiable reference to the first (oldest) program entry + \sa Begin(), Last() +*/ +SamProgram& SamProgramChain::First(void) { + + // find first record in container that has no PreviousProgramID entry + SamProgramIterator iter = Begin(); + SamProgramIterator end = End(); + for ( ; iter != end; ++iter ) { + SamProgram& current = (*iter); + if ( !current.HasPreviousProgramID() ) + return current; + } + + // otherwise error + cerr << "SamProgramChain ERROR - could not find any record without a PP tag" << endl; + exit(1); +} + +/*! \fn const SamProgram& SamProgramChain::First(void) const + \brief Fetches first (oldest) record in the chain. + + This is an overloaded function. + + N.B. - This function will fail if the chain is empty. If this is possible, + check the result of IsEmpty() before calling this function. + + \return a read-only reference to the first (oldest) program entry + \sa Begin(), ConstBegin(), Last() +*/ +const SamProgram& SamProgramChain::First(void) const { + + // find first record in container that has no PreviousProgramID entry + SamProgramConstIterator iter = ConstBegin(); + SamProgramConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) { + const SamProgram& current = (*iter); + if ( !current.HasPreviousProgramID() ) + return current; + } + + // otherwise error + cerr << "SamProgramChain ERROR - could not find any record without a PP tag" << endl; + exit(1); +} + +/*! \fn int SamProgramChain::IndexOf(const std::string& programId) const + \internal + \return index of program record if found. + Otherwise, returns vector::size() (invalid index). +*/ +int SamProgramChain::IndexOf(const std::string& programId) const { + SamProgramConstIterator begin = ConstBegin(); + SamProgramConstIterator iter = begin; + SamProgramConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) { + const SamProgram& current = (*iter); + if ( current.ID == programId ) + break; + } + return distance( begin, iter ); +} + +/*! \fn bool SamProgramChain::IsEmpty(void) const + \brief Returns \c true if chain contains no records + \sa Size() +*/ +bool SamProgramChain::IsEmpty(void) const { + return m_data.empty(); +} + +/*! \fn SamProgram& SamProgramChain::Last(void) + \brief Fetches last (newest) record in the chain. + + N.B. - This function will fail if the chain is empty. If this is possible, + check the result of IsEmpty() before calling this function. + + \return a modifiable reference to the last (newest) program entry + \sa End(), First() +*/ +SamProgram& SamProgramChain::Last(void) { + // find first record in container that has no NextProgramID entry + SamProgramIterator iter = Begin(); + SamProgramIterator end = End(); + for ( ; iter != end; ++iter ) { + SamProgram& current = (*iter); + if ( !current.HasNextProgramID() ) + return current; + } + + // otherwise error + cerr << "SamProgramChain ERROR - could not determine last record" << endl; + exit(1); +} + +/*! \fn const SamProgram& SamProgramChain::Last(void) const + \brief Fetches last (newest) record in the chain. + + This is an overloaded function. + + N.B. - This function will fail if the chain is empty. If this is possible, + check the result of IsEmpty() before calling this function. + + \return a read-only reference to the last (newest) program entry + \sa End(), ConstEnd(), First() +*/ +const SamProgram& SamProgramChain::Last(void) const { + // find first record in container that has no NextProgramID entry + SamProgramConstIterator iter = ConstBegin(); + SamProgramConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) { + const SamProgram& current = (*iter); + if ( !current.HasNextProgramID() ) + return current; + } + + // otherwise error + cerr << "SamProgramChain ERROR - could not determine last record" << endl; + exit(1); +} + +/*! \fn const std::string SamProgramChain::NextIdFor(const std::string& programId) const + \internal + \return ID of program record, whose PreviousProgramID matches \a programId. + Otherwise, returns empty string if none found. +*/ +const std::string SamProgramChain::NextIdFor(const std::string& programId) const { + + // find first record in container whose PreviousProgramID matches @programId + SamProgramConstIterator iter = ConstBegin(); + SamProgramConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) { + const SamProgram& current = (*iter); + if ( !current.HasPreviousProgramID() && + current.PreviousProgramID == programId + ) + { + return current.ID; + } + } + + // none found + return string(); +} + +/*! \fn int SamProgramChain::Size(void) const + \brief Returns number of program records in the chain. + \sa IsEmpty() +*/ +int SamProgramChain::Size(void) const { + return m_data.size(); +} + +/*! \fn SamProgram& SamProgramChain::operator[](const std::string& programId) + \brief Retrieves the modifiable SamProgram record that matches \a programId. + + NOTE - If the chain contains no read group matching this ID, this function will + print an error and terminate. + + \param programId ID of program record to retrieve + \return a modifiable reference to the SamProgram associated with the ID +*/ +SamProgram& SamProgramChain::operator[](const std::string& programId) { + + // look up program record matching this ID + int index = IndexOf(programId); + + // if record not found + if ( index == (int)m_data.size() ) { + cerr << "SamProgramChain ERROR - unknown programId: " << programId << endl; + exit(1); + } + + // otherwise return program record at index + return m_data.at(index); +} diff --git a/src/api/SamProgramChain.h b/src/api/SamProgramChain.h new file mode 100644 index 0000000..4cb16fc --- /dev/null +++ b/src/api/SamProgramChain.h @@ -0,0 +1,86 @@ +// *************************************************************************** +// SamProgramChain.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides methods for operating on a SamProgram record "chain" +// *************************************************************************** + +#ifndef SAM_PROGRAMCHAIN_H +#define SAM_PROGRAMCHAIN_H + +#include +#include +#include +#include + +namespace BamTools { + +// chain is *NOT* sorted in any order +// use First()/Last() to retrieve oldest/newest programs, respectively +typedef std::vector SamProgramContainer; +typedef SamProgramContainer::iterator SamProgramIterator; +typedef SamProgramContainer::const_iterator SamProgramConstIterator; + +class API_EXPORT SamProgramChain { + + // ctor & dtor + public: + SamProgramChain(void); + SamProgramChain(const SamProgramChain& other); + ~SamProgramChain(void); + + // query/modify program data + public: + // appends a program record to the chain + void Add(SamProgram& program); + void Add(std::vector& programs); + + // clears all read group entries + void Clear(void); + + // returns true if chain contains this program record (matches on ID) + bool Contains(const SamProgram& program) const; + bool Contains(const std::string& programId) const; + + // returns the first (oldest) program in the chain + SamProgram& First(void); + const SamProgram& First(void) const; + + // returns true if chain is empty + bool IsEmpty(void) const; + + // returns last (most recent) program in the chain + SamProgram& Last(void); + const SamProgram& Last(void) const; + + // returns number of program records in the chain + int Size(void) const; + + // retrieves a modifiable reference to the SamProgram object associated with this ID + SamProgram& operator[](const std::string& programId); + + // retrieve STL-compatible iterators + public: + SamProgramIterator Begin(void); // returns iterator to begin() + SamProgramConstIterator Begin(void) const; // returns const_iterator to begin() + SamProgramConstIterator ConstBegin(void) const; // returns const_iterator to begin() + SamProgramIterator End(void); // returns iterator to end() + SamProgramConstIterator End(void) const; // returns const_iterator to end() + SamProgramConstIterator ConstEnd(void) const; // returns const_iterator to end() + + // internal methods + private: + int IndexOf(const std::string& programId) const; + const std::string NextIdFor(const std::string& programId) const; + + // data members + private: + SamProgramContainer m_data; +}; + +} // namespace BamTools + +#endif // SAM_PROGRAMCHAIN_H diff --git a/src/api/SamReadGroup.cpp b/src/api/SamReadGroup.cpp index da50d08..2ba75f1 100644 --- a/src/api/SamReadGroup.cpp +++ b/src/api/SamReadGroup.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 4 March 2011 (DB) +// Last modified: 18 April 2011 (DB) // --------------------------------------------------------------------------- // Provides direct read/write access to the SAM read group data fields. // *************************************************************************** @@ -17,32 +17,43 @@ using namespace std; Provides direct read/write access to the SAM read group data fields. - \sa http://samtools.sourceforge.net/SAM-1.3.pdf + \sa \samSpecURL +*/ +/*! \var SamReadGroup::Description + \brief corresponds to \@RG DS:\ +*/ +/*! \var SamReadGroup::FlowOrder + \brief corresponds to \@RG FO:\ */ /*! \var SamReadGroup::ID \brief corresponds to \@RG ID:\ + + Required for valid SAM header. */ -/*! \var SamReadGroup::Sample - \brief corresponds to \@RG SM:\ +/*! \var SamReadGroup::KeySequence + \brief corresponds to \@RG KS:\ */ /*! \var SamReadGroup::Library \brief corresponds to \@RG LB:\ */ -/*! \var SamReadGroup::Description - \brief corresponds to \@RG DS:\ -*/ /*! \var SamReadGroup::PlatformUnit \brief corresponds to \@RG PU:\ */ /*! \var SamReadGroup::PredictedInsertSize \brief corresponds to \@RG PI:\ */ -/*! \var SamReadGroup::SequencingCenter - \brief corresponds to \@RG CN:\ -*/ /*! \var SamReadGroup::ProductionDate \brief corresponds to \@RG DT:\ */ +/*! \var SamReadGroup::Program + \brief corresponds to \@RG PG:\ +*/ +/*! \var SamReadGroup::Sample + \brief corresponds to \@RG SM:\ +*/ +/*! \var SamReadGroup::SequencingCenter + \brief corresponds to \@RG CN:\ +*/ /*! \var SamReadGroup::SequencingTechnology \brief corresponds to \@RG PL:\ */ @@ -51,14 +62,17 @@ using namespace std; \brief default constructor */ SamReadGroup::SamReadGroup(void) - : ID("") - , Sample("") + : Description("") + , FlowOrder("") + , ID("") + , KeySequence("") , Library("") - , Description("") , PlatformUnit("") , PredictedInsertSize("") - , SequencingCenter("") , ProductionDate("") + , Program("") + , Sample("") + , SequencingCenter("") , SequencingTechnology("") { } @@ -68,14 +82,17 @@ SamReadGroup::SamReadGroup(void) \param id desired read group ID */ SamReadGroup::SamReadGroup(const std::string& id) - : ID(id) - , Sample("") + : Description("") + , FlowOrder("") + , ID(id) + , KeySequence("") , Library("") - , Description("") , PlatformUnit("") , PredictedInsertSize("") - , SequencingCenter("") , ProductionDate("") + , Program("") + , Sample("") + , SequencingCenter("") , SequencingTechnology("") { } @@ -83,14 +100,17 @@ SamReadGroup::SamReadGroup(const std::string& id) \brief copy constructor */ SamReadGroup::SamReadGroup(const SamReadGroup& other) - : ID(other.ID) - , Sample(other.Sample) + : Description(other.Description) + , FlowOrder(other.FlowOrder) + , ID(other.ID) + , KeySequence(other.KeySequence) , Library(other.Library) - , Description(other.Description) , PlatformUnit(other.PlatformUnit) , PredictedInsertSize(other.PredictedInsertSize) - , SequencingCenter(other.SequencingCenter) , ProductionDate(other.ProductionDate) + , Program(other.Program) + , Sample(other.Sample) + , SequencingCenter(other.SequencingCenter) , SequencingTechnology(other.SequencingTechnology) { } @@ -103,17 +123,34 @@ SamReadGroup::~SamReadGroup(void) { } \brief Clears all data fields. */ void SamReadGroup::Clear(void) { + Description.clear(); + FlowOrder.clear(); ID.clear(); - Sample.clear(); + KeySequence.clear(); Library.clear(); - Description.clear(); PlatformUnit.clear(); PredictedInsertSize.clear(); - SequencingCenter.clear(); ProductionDate.clear(); + Program.clear(); + Sample.clear(); + SequencingCenter.clear(); SequencingTechnology.clear(); } +/*! \fn bool SamReadGroup::HasDescription(void) const + \brief Returns \c true if read group contains \@RG DS:\ +*/ +bool SamReadGroup::HasDescription(void) const { + return (!Description.empty()); +} + +/*! \fn bool SamReadGroup::HasFlowOrder(void) const + \brief Returns \c true if read group contains \@RG FO:\ +*/ +bool SamReadGroup::HasFlowOrder(void) const { + return (!FlowOrder.empty()); +} + /*! \fn bool SamReadGroup::HasID(void) const \brief Returns \c true if read group contains \@RG: ID:\ */ @@ -121,11 +158,11 @@ bool SamReadGroup::HasID(void) const { return (!ID.empty()); } -/*! \fn bool SamReadGroup::HasSample(void) const - \brief Returns \c true if read group contains \@RG SM:\ +/*! \fn bool SamReadGroup::HasKeySequence(void) const + \brief Returns \c true if read group contains \@RG KS:\ */ -bool SamReadGroup::HasSample(void) const { - return (!Sample.empty()); +bool SamReadGroup::HasKeySequence(void) const { + return (!KeySequence.empty()); } /*! \fn bool SamReadGroup::HasLibrary(void) const @@ -135,13 +172,6 @@ bool SamReadGroup::HasLibrary(void) const { return (!Library.empty()); } -/*! \fn bool SamReadGroup::HasDescription(void) const - \brief Returns \c true if read group contains \@RG DS:\ -*/ -bool SamReadGroup::HasDescription(void) const { - return (!Description.empty()); -} - /*! \fn bool SamReadGroup::HasPlatformUnit(void) const \brief Returns \c true if read group contains \@RG PU:\ */ @@ -156,13 +186,6 @@ bool SamReadGroup::HasPredictedInsertSize(void) const { return (!PredictedInsertSize.empty()); } -/*! \fn bool SamReadGroup::HasSequencingCenter(void) const - \brief Returns \c true if read group contains \@RG CN:\ -*/ -bool SamReadGroup::HasSequencingCenter(void) const { - return (!SequencingCenter.empty()); -} - /*! \fn bool SamReadGroup::HasProductionDate(void) const \brief Returns \c true if read group contains \@RG DT:\ */ @@ -170,6 +193,27 @@ bool SamReadGroup::HasProductionDate(void) const { return (!ProductionDate.empty()); } +/*! \fn bool SamReadGroup::HasProgram(void) const + \brief Returns \c true if read group contains \@RG PG:\ +*/ +bool SamReadGroup::HasProgram(void) const { + return (!Program.empty()); +} + +/*! \fn bool SamReadGroup::HasSample(void) const + \brief Returns \c true if read group contains \@RG SM:\ +*/ +bool SamReadGroup::HasSample(void) const { + return (!Sample.empty()); +} + +/*! \fn bool SamReadGroup::HasSequencingCenter(void) const + \brief Returns \c true if read group contains \@RG CN:\ +*/ +bool SamReadGroup::HasSequencingCenter(void) const { + return (!SequencingCenter.empty()); +} + /*! \fn bool SamReadGroup::HasSequencingTechnology(void) const \brief Returns \c true if read group contains \@RG PL:\ */ diff --git a/src/api/SamReadGroup.h b/src/api/SamReadGroup.h index 538617d..b203d3c 100644 --- a/src/api/SamReadGroup.h +++ b/src/api/SamReadGroup.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 4 March 2011 (DB) +// Last modified: 18 April 2011 (DB) // --------------------------------------------------------------------------- // Provides direct read/write access to the SAM read group data fields. // *************************************************************************** @@ -28,26 +28,33 @@ struct API_EXPORT SamReadGroup { void Clear(void); // clears all data fields // convenience query methods + bool HasDescription(void) const; // returns true if read group has a description + bool HasFlowOrder(void) const; // returns true if read group has a flow order entry bool HasID(void) const; // returns true if read group has a group ID - bool HasSample(void) const; // returns true if read group has a sample name + bool HasKeySequence(void) const; // returns true if read group has a key sequence bool HasLibrary(void) const; // returns true if read group has a library name - bool HasDescription(void) const; // returns true if read group has a description bool HasPlatformUnit(void) const; // returns true if read group has a platform unit ID bool HasPredictedInsertSize(void) const; // returns true if read group has a predicted insert size - bool HasSequencingCenter(void) const; // returns true if read group has a sequencing center ID bool HasProductionDate(void) const; // returns true if read group has a production date + bool HasProgram(void) const; // returns true if read group has a program entry + bool HasSample(void) const; // returns true if read group has a sample name + bool HasSequencingCenter(void) const; // returns true if read group has a sequencing center ID bool HasSequencingTechnology(void) const; // returns true if read group has a sequencing technology ID - // data members - std::string ID; // ID: - std::string Sample; // SM: - std::string Library; // LB: - std::string Description; // DS: - std::string PlatformUnit; // PU: - std::string PredictedInsertSize; // PI: - std::string SequencingCenter; // CN: - std::string ProductionDate; // DT: - std::string SequencingTechnology; // PL: + + // data fields + std::string Description; // DS: + std::string FlowOrder; // FO: + std::string ID; // ID: *Required for valid SAM header* + std::string KeySequence; // KS: + std::string Library; // LB: + std::string PlatformUnit; // PU: + std::string PredictedInsertSize; // PI: + std::string ProductionDate; // DT: + std::string Program; // PG: + std::string Sample; // SM: + std::string SequencingCenter; // CN: + std::string SequencingTechnology; // PL: }; /*! \fn bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs) diff --git a/src/api/SamReadGroupDictionary.cpp b/src/api/SamReadGroupDictionary.cpp index e6f8a05..69903ff 100644 --- a/src/api/SamReadGroupDictionary.cpp +++ b/src/api/SamReadGroupDictionary.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 4 March 2011 (DB) +// Last modified: 18 April 2011 (DB) // --------------------------------------------------------------------------- // Provides methods for operating on a collection of SamReadGroup entries. // *************************************************************************** @@ -41,11 +41,14 @@ SamReadGroupDictionary::~SamReadGroupDictionary(void) { } /*! \fn void SamReadGroupDictionary::Add(const SamReadGroup& readGroup) \brief Adds a read group to the dictionary. - Duplicate entries are discarded. + Duplicate entries are silently discarded. \param readGroup entry to be added */ void SamReadGroupDictionary::Add(const SamReadGroup& readGroup) { + + // TODO: report error on attempted duplicate? + if ( IsEmpty() || !Contains(readGroup) ) m_data.push_back(readGroup); } @@ -101,10 +104,10 @@ SamReadGroupIterator SamReadGroupDictionary::Begin(void) { } /*! \fn SamReadGroupConstIterator SamReadGroupDictionary::Begin(void) const + \return an STL const_iterator pointing to the first read group This is an overloaded function. - \return a const STL iterator pointing to the first read group \sa ConstBegin(), End() */ SamReadGroupConstIterator SamReadGroupDictionary::Begin(void) const { @@ -119,7 +122,7 @@ void SamReadGroupDictionary::Clear(void) { } /*! \fn SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin(void) const - \return a const STL iterator pointing to the first read group + \return an STL const_iterator pointing to the first read group \sa Begin(), ConstEnd() */ SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin(void) const { @@ -127,7 +130,7 @@ SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin(void) const { } /*! \fn SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd(void) const - \return a const STL iterator pointing to the imaginary entry after the last read group + \return an STL const_iterator pointing to the imaginary entry after the last read group \sa ConstBegin(), End() */ SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd(void) const { @@ -136,23 +139,23 @@ SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd(void) const { /*! \fn bool SamReadGroupDictionary::Contains(const std::string& readGroupId) const \brief Returns true if dictionary contains read group. - - This is an overloaded function. - \param readGroupId search for read group matching this ID \return \c true if dictionary contains a read group with this ID */ bool SamReadGroupDictionary::Contains(const std::string& readGroupId) const { - return Contains( SamReadGroup(readGroupId) ); + return ( IndexOf(readGroupId) != (int)m_data.size() ); } /*! \fn bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const - \brief Returns true if dictionary contains read group. + \brief Returns true if dictionary contains read group (matching on ID). + + This is an overloaded function. + \param readGroup search for this read group - \return \c true if dictionary contains read group + \return \c true if dictionary contains read group (matching on ID). */ bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const { - return ( IndexOf(readGroup) != (int)m_data.size() ); + return Contains( readGroup.ID ); } /*! \fn SamReadGroupIterator SamReadGroupDictionary::End(void) @@ -164,26 +167,27 @@ SamReadGroupIterator SamReadGroupDictionary::End(void) { } /*! \fn SamReadGroupConstIterator SamReadGroupDictionary::End(void) const + \return an STL const_iterator pointing to the imaginary entry after the last read group This is an overloaded function. - \return a const STL iterator pointing to the imaginary entry after the last read group \sa Begin(), ConstEnd() */ SamReadGroupConstIterator SamReadGroupDictionary::End(void) const { return m_data.end(); } -/*! \fn int SamReadGroupDictionary::IndexOf(const SamReadGroup& readGroup) const +/*! \fn int SamReadGroupDictionary::IndexOf(const std::string& readGroupId) const \internal \return index of read group if found. Otherwise, returns vector::size() (invalid index). */ -int SamReadGroupDictionary::IndexOf(const SamReadGroup& readGroup) const { +int SamReadGroupDictionary::IndexOf(const std::string& readGroupId) const { SamReadGroupConstIterator begin = ConstBegin(); SamReadGroupConstIterator iter = begin; SamReadGroupConstIterator end = ConstEnd(); for ( ; iter != end; ++iter ) { - if ( *iter == readGroup ) + const SamReadGroup& current = (*iter); + if ( current.ID == readGroupId ) break; } return distance( begin, iter ); @@ -198,28 +202,28 @@ bool SamReadGroupDictionary::IsEmpty(void) const { } /*! \fn void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) - \brief Removes read group from dictionary, if found. - \param readGroup read group to remove + \brief Removes read group from dictionary, if found (matching on ID). + + This is an overloaded function. + + \param readGroup read group to remove (matches on ID) */ void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) { - if ( Contains(readGroup) ) - m_data.erase( m_data.begin() + IndexOf(readGroup) ); + Remove( readGroup.ID ); } /*! \fn void SamReadGroupDictionary::Remove(const std::string& readGroupId) \brief Removes read group from dictionary, if found. - - This is an overloaded function. - \param readGroupId ID of read group to remove \sa Remove() */ void SamReadGroupDictionary::Remove(const std::string& readGroupId) { - Remove( SamReadGroup(readGroupId) ); + if ( Contains(readGroupId) ) + m_data.erase( m_data.begin() + IndexOf(readGroupId) ); } /*! \fn void SamReadGroupDictionary::Remove(const std::vector& readGroups) - \brief Removes multiple read groups from dictionary. + \brief Removes multiple read groups from dictionary (matching on ID). This is an overloaded function. @@ -271,7 +275,7 @@ int SamReadGroupDictionary::Size(void) const { SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId) { // look up read group ID - int index = IndexOf( SamReadGroup(readGroupId) ); + int index = IndexOf(readGroupId); // if found, return read group at index if ( index != (int)m_data.size() ) diff --git a/src/api/SamReadGroupDictionary.h b/src/api/SamReadGroupDictionary.h index 75df199..8ec40e2 100644 --- a/src/api/SamReadGroupDictionary.h +++ b/src/api/SamReadGroupDictionary.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 4 March 2011 (DB) +// Last modified: 18 April 2011 (DB) // --------------------------------------------------------------------------- // Provides methods for operating on a collection of SamReadGroup entries. // *************************************************************************** @@ -75,7 +75,7 @@ class API_EXPORT SamReadGroupDictionary { // internal methods private: - int IndexOf(const SamReadGroup& readGroup) const; + int IndexOf(const std::string& readGroupId) const; // data members private: @@ -84,4 +84,4 @@ class API_EXPORT SamReadGroupDictionary { } // namespace BamTools -#endif // SAM_READGROUP_DICTIONARY +#endif // SAM_READGROUP_DICTIONARY_H diff --git a/src/api/SamSequence.cpp b/src/api/SamSequence.cpp index 869c24d..0231988 100644 --- a/src/api/SamSequence.cpp +++ b/src/api/SamSequence.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 20 March 2011 (DB) +// Last modified: 18 April 2011 (DB) // --------------------------------------------------------------------------- // Provides direct read/write access to the SAM sequence data fields. // *************************************************************************** @@ -18,13 +18,7 @@ using namespace std; Provides direct read/write access to the SAM sequence data fields. - \sa http://samtools.sourceforge.net/SAM-1.3.pdf -*/ -/*! \var SamSequence::Name - \brief corresponds to \@SQ SN:\ -*/ -/*! \var SamSequence::Length - \brief corresponds to \@SQ LN:\ + \sa \samSpecURL */ /*! \var SamSequence::AssemblyID \brief corresponds to \@SQ AS:\ @@ -32,53 +26,80 @@ using namespace std; /*! \var SamSequence::Checksum \brief corresponds to \@SQ M5:\ */ -/*! \var SamSequence::URI - \brief corresponds to \@SQ UR:\ +/*! \var SamSequence::Length + \brief corresponds to \@SQ LN:\ + + Required for valid SAM header. +*/ +/*! \var SamSequence::Name + \brief corresponds to \@SQ SN:\ + + Required for valid SAM header. */ /*! \var SamSequence::Species \brief corresponds to \@SQ SP:\ */ +/*! \var SamSequence::URI + \brief corresponds to \@SQ UR:\ +*/ /*! \fn SamSequence::SamSequence(void) \brief default constructor */ SamSequence::SamSequence(void) - : Name("") - , Length("") - , AssemblyID("") + : AssemblyID("") , Checksum("") - , URI("") + , Length("") + , Name("") , Species("") + , URI("") { } /*! \fn SamSequence::SamSequence(const std::string& name, const int& length) \brief constructs sequence with \a name and \a length - \param name desired sequence name + \param name desired sequence name \param length desired sequence length (numeric value) */ -SamSequence::SamSequence(const std::string& name, const int& length) - : Name(name) - , AssemblyID("") +SamSequence::SamSequence(const std::string& name, + const int& length) + : AssemblyID("") , Checksum("") - , URI("") + , Name(name) , Species("") + , URI("") { stringstream s(""); s << length; Length = s.str(); } +/*! \fn SamSequence::SamSequence(const std::string& name, const std::string& length) + \brief constructs sequence with \a name and \a length + + \param name desired sequence name + \param length desired sequence length (string value) +*/ +SamSequence::SamSequence(const std::string& name, + const std::string& length) + : AssemblyID("") + , Checksum("") + , Length(length) + , Name(name) + , Species("") + , URI("") +{ } + /*! \fn SamSequence::SamSequence(const SamSequence& other) \brief copy constructor */ SamSequence::SamSequence(const SamSequence& other) - : Name(other.Name) - , Length(other.Length) - , AssemblyID(other.AssemblyID) + : AssemblyID(other.AssemblyID) , Checksum(other.Checksum) - , URI(other.URI) + , Length(other.Length) + , Name(other.Name) , Species(other.Species) + , URI(other.URI) { } /*! \fn SamSequence::~SamSequence(void) @@ -90,26 +111,12 @@ SamSequence::~SamSequence(void) { } \brief Clears all data fields. */ void SamSequence::Clear(void) { - Name.clear(); - Length.clear(); AssemblyID.clear(); Checksum.clear(); - URI.clear(); + Length.clear(); + Name.clear(); Species.clear(); -} - -/*! \fn bool SamSequence::HasName(void) const - \brief Returns \c true if sequence contains \@SQ SN:\ -*/ -bool SamSequence::HasName(void) const { - return (!Name.empty()); -} - -/*! \fn bool SamSequence::HasLength(void) const - \brief Returns \c true if sequence contains \@SQ LN:\ -*/ -bool SamSequence::HasLength(void) const { - return (!Length.empty()); + URI.clear(); } /*! \fn bool SamSequence::HasAssemblyID(void) const @@ -126,11 +133,18 @@ bool SamSequence::HasChecksum(void) const { return (!Checksum.empty()); } -/*! \fn bool SamSequence::HasURI(void) const - \brief Returns \c true if sequence contains \@SQ UR:\ +/*! \fn bool SamSequence::HasLength(void) const + \brief Returns \c true if sequence contains \@SQ LN:\ */ -bool SamSequence::HasURI(void) const { - return (!URI.empty()); +bool SamSequence::HasLength(void) const { + return (!Length.empty()); +} + +/*! \fn bool SamSequence::HasName(void) const + \brief Returns \c true if sequence contains \@SQ SN:\ +*/ +bool SamSequence::HasName(void) const { + return (!Name.empty()); } /*! \fn bool SamSequence::HasSpecies(void) const @@ -139,3 +153,10 @@ bool SamSequence::HasURI(void) const { bool SamSequence::HasSpecies(void) const { return (!Species.empty()); } + +/*! \fn bool SamSequence::HasURI(void) const + \brief Returns \c true if sequence contains \@SQ UR:\ +*/ +bool SamSequence::HasURI(void) const { + return (!URI.empty()); +} diff --git a/src/api/SamSequence.h b/src/api/SamSequence.h index fea09d3..054e58f 100644 --- a/src/api/SamSequence.h +++ b/src/api/SamSequence.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 4 March 2011 (DB) +// Last modified: 18 April 2011 (DB) // --------------------------------------------------------------------------- // Provides direct read/write access to the SAM sequence data fields. // *************************************************************************** @@ -21,27 +21,28 @@ struct API_EXPORT SamSequence { // ctor & dtor SamSequence(void); SamSequence(const std::string& name, const int& length); + SamSequence(const std::string& name, const std::string& length); SamSequence(const SamSequence& other); ~SamSequence(void); // query/modify entire sequence - void Clear(void); // clears all contents + void Clear(void); // clears all contents // convenience query methods - bool HasName(void) const; // returns true if sequence has a name - bool HasLength(void) const; // returns true if sequence has a length - bool HasAssemblyID(void) const; // returns true if sequence has an assembly ID - bool HasChecksum(void) const; // returns true if sequence has an MD5 checksum - bool HasURI(void) const; // returns true if sequence has a URI - bool HasSpecies(void) const; // returns true if sequence has a species ID + bool HasAssemblyID(void) const; // returns true if sequence has an assembly ID + bool HasChecksum(void) const; // returns true if sequence has an MD5 checksum + bool HasLength(void) const; // returns true if sequence has a length + bool HasName(void) const; // returns true if sequence has a name + bool HasSpecies(void) const; // returns true if sequence has a species ID + bool HasURI(void) const; // returns true if sequence has a URI // data members - std::string Name; // SN: - std::string Length; // LN: - std::string AssemblyID; // AS: - std::string Checksum; // M5: - std::string URI; // UR: - std::string Species; // SP: + std::string AssemblyID; // AS: + std::string Checksum; // M5: + std::string Length; // LN: *Required for valid SAM header* + std::string Name; // SN: *Required for valid SAM header* + std::string Species; // SP: + std::string URI; // UR: }; /*! \fn bool operator==(const SamSequence& lhs, const SamSequence& rhs) diff --git a/src/api/SamSequenceDictionary.cpp b/src/api/SamSequenceDictionary.cpp index 3249bd4..3e5240d 100644 --- a/src/api/SamSequenceDictionary.cpp +++ b/src/api/SamSequenceDictionary.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 20 March 2011 (DB) +// Last modified: 18 April 2011 (DB) // --------------------------------------------------------------------------- // Provides methods for operating on a collection of SamSequence entries. // ************************************************************************* @@ -40,11 +40,14 @@ SamSequenceDictionary::~SamSequenceDictionary(void) { } /*! \fn void SamSequenceDictionary::Add(const SamSequence& sequence) \brief Adds a sequence to the dictionary. - Duplicate entries are discarded. + Duplicate entries are silently discarded. \param sequence entry to be added */ void SamSequenceDictionary::Add(const SamSequence& sequence) { + + // TODO: report error on attempted duplicate? + if ( IsEmpty() || !Contains(sequence) ) m_data.push_back(sequence); } @@ -104,10 +107,10 @@ SamSequenceIterator SamSequenceDictionary::Begin(void) { } /*! \fn SamSequenceConstIterator SamSequenceDictionary::Begin(void) const + \return an STL const_iterator pointing to the first sequence This is an overloaded function. - \return a const STL iterator pointing to the first sequence \sa ConstBegin(), End() */ SamSequenceConstIterator SamSequenceDictionary::Begin(void) const { @@ -122,7 +125,7 @@ void SamSequenceDictionary::Clear(void) { } /*! \fn SamSequenceConstIterator SamSequenceDictionary::ConstBegin(void) const - \return a const STL iterator pointing to the first sequence + \return an STL const_iterator pointing to the first sequence \sa Begin(), ConstEnd() */ SamSequenceConstIterator SamSequenceDictionary::ConstBegin(void) const { @@ -130,7 +133,7 @@ SamSequenceConstIterator SamSequenceDictionary::ConstBegin(void) const { } /*! \fn SamSequenceConstIterator SamSequenceDictionary::ConstEnd(void) const - \return a const STL iterator pointing to the imaginary entry after the last sequence + \return an STL const_iterator pointing to the imaginary entry after the last sequence \sa End(), ConstBegin() */ SamSequenceConstIterator SamSequenceDictionary::ConstEnd(void) const { @@ -147,12 +150,15 @@ bool SamSequenceDictionary::Contains(const std::string& sequenceName) const { } /*! \fn bool SamSequenceDictionary::Contains(const SamSequence& sequence) const - \brief Returns true if dictionary contains sequence. + \brief Returns true if dictionary contains sequence (matches on name). + + This is an overloaded function. + \param sequence search for this sequence - \return \c true if dictionary contains sequence + \return \c true if dictionary contains sequence (matching on name) */ bool SamSequenceDictionary::Contains(const SamSequence& sequence) const { - return ( IndexOf(sequence) != (int)m_data.size() ); + return ( IndexOf(sequence.Name) != (int)m_data.size() ); } /*! \fn SamSequenceIterator SamSequenceDictionary::End(void) @@ -164,41 +170,19 @@ SamSequenceIterator SamSequenceDictionary::End(void) { } /*! \fn SamSequenceConstIterator SamSequenceDictionary::End(void) const + \return an STL const_iterator pointing to the imaginary entry after the last sequence This is an overloaded function. - \return a const STL iterator pointing to the imaginary entry after the last sequence \sa Begin(), ConstEnd() */ SamSequenceConstIterator SamSequenceDictionary::End(void) const { return m_data.end(); } -/*! \fn int SamSequenceDictionary::IndexOf(const SamSequence& sequence) const - \internal - - Uses operator==(SamSequence, SamSequence) - - \return index of sequence if found. Otherwise, returns vector::size() (invalid index). -*/ -int SamSequenceDictionary::IndexOf(const SamSequence& sequence) const { - SamSequenceConstIterator begin = ConstBegin(); - SamSequenceConstIterator iter = begin; - SamSequenceConstIterator end = ConstEnd(); - for ( ; iter != end; ++iter ) { - const SamSequence& currentSeq = (*iter); - if ( currentSeq == sequence ) - break; - } - return distance( begin, iter ); -} - /*! \fn int SamSequenceDictionary::IndexOf(const std::string& name) const \internal - - Use comparison of SamSequence::Name to \a name - - \return index of sequence if found. Otherwise, returns vector::size() (invalid index). + \return index of sequence if found (matching on name). Otherwise, returns vector::size() (invalid index). */ int SamSequenceDictionary::IndexOf(const std::string& name) const { SamSequenceConstIterator begin = ConstBegin(); @@ -221,12 +205,14 @@ bool SamSequenceDictionary::IsEmpty(void) const { } /*! \fn void SamSequenceDictionary::Remove(const SamSequence& sequence) - \brief Removes sequence from dictionary, if found. - \param sequence sequence to remove + \brief Removes sequence from dictionary, if found (matches on name). + + This is an overloaded function. + + \param sequence SamSequence to remove (matching on name) */ void SamSequenceDictionary::Remove(const SamSequence& sequence) { - if ( Contains(sequence) ) - m_data.erase( m_data.begin() + IndexOf(sequence) ); + Remove( sequence.Name ); } /*! \fn void SamSequenceDictionary::Remove(const std::string& sequenceName) @@ -282,7 +268,7 @@ int SamSequenceDictionary::Size(void) const { \brief Retrieves the modifiable SamSequence that matches \a sequenceName. NOTE - If the dictionary contains no sequence matching this name, this function inserts - a new one with this name, and returns a reference to it. + a new one with this name (length:0), and returns a reference to it. If you want to avoid this insertion behavior, check the result of Contains() before using this operator. diff --git a/src/api/SamSequenceDictionary.h b/src/api/SamSequenceDictionary.h index fca8b22..1ac7326 100644 --- a/src/api/SamSequenceDictionary.h +++ b/src/api/SamSequenceDictionary.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 20 March 2011 +// Last modified: 18 April 2011 // --------------------------------------------------------------------------- // Provides methods for operating on a collection of SamSequence entries. // *************************************************************************** @@ -76,7 +76,6 @@ class API_EXPORT SamSequenceDictionary { // internal methods private: - int IndexOf(const SamSequence& sequence) const; int IndexOf(const std::string& name) const; // data members @@ -86,5 +85,5 @@ class API_EXPORT SamSequenceDictionary { } // namespace BamTools -#endif // SAM_SEQUENCE_DICTIONARY +#endif // SAM_SEQUENCE_DICTIONARY_H diff --git a/src/api/internal/BamWriter_p.cpp b/src/api/internal/BamWriter_p.cpp index 7147a33..8f6c30d 100644 --- a/src/api/internal/BamWriter_p.cpp +++ b/src/api/internal/BamWriter_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 19 April 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** @@ -70,6 +70,8 @@ void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break; case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break; case (Constants::BAM_CIGAR_PAD_CHAR) : cigarOp = Constants::BAM_CIGAR_PAD; break; + case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break; + case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break; default: fprintf(stderr, "BamWriter ERROR: unknown cigar operation found: %c\n", coIter->Type); exit(1); @@ -241,8 +243,10 @@ void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { if ( m_isBigEndian ) { char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); memcpy(cigarData, packedCigar.data(), packedCigarLength); - for (unsigned int i = 0; i < packedCigarLength; ++i) - if (m_isBigEndian) BamTools::SwapEndian_32p(&cigarData[i]); + if ( m_isBigEndian ) { + for ( unsigned int i = 0; i < packedCigarLength; ++i ) + BamTools::SwapEndian_32p(&cigarData[i]); + } m_stream.Write(cigarData, packedCigarLength); free(cigarData); } @@ -254,7 +258,7 @@ void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { // write the base qualities char* pBaseQualities = (char*)al.Qualities.data(); - for(unsigned int i = 0; i < queryLength; i++) + for ( unsigned int i = 0; i < queryLength; i++ ) pBaseQualities[i] -= 33; // FASTQ conversion m_stream.Write(pBaseQualities, queryLength); @@ -295,11 +299,56 @@ void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { case(Constants::BAM_TAG_TYPE_HEX) : case(Constants::BAM_TAG_TYPE_STRING) : // no endian swapping necessary for hex-string/string data - while (tagData[i]) { ++i; } + while ( tagData[i] ) + ++i; // increment one more for null terminator ++i; break; + case(Constants::BAM_TAG_TYPE_ARRAY) : + + { + // read array type + const char arrayType = tagData[i]; + ++i; + + // swap endian-ness of number of elements in place, then retrieve for loop + BamTools::SwapEndian_32p(&tagData[i]); + int32_t numElements; + memcpy(&numElements, &tagData[i], sizeof(uint32_t)); + i += sizeof(uint32_t); + + // swap endian-ness of array elements + for ( int j = 0; j < numElements; ++j ) { + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + // no endian-swapping necessary + ++i; + break; + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + default: + // error case + fprintf(stderr, + "BamWriter ERROR: unknown binary array type encountered: [%c]\n", + arrayType); + exit(1); + } + } + + break; + } + default : fprintf(stderr, "BamWriter ERROR: invalid tag value type\n"); // shouldn't get here free(tagData); @@ -368,6 +417,6 @@ void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) { m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT); // write the SAM header text - if (samHeaderLen > 0) + if ( samHeaderLen > 0 ) m_stream.Write(samHeaderText.data(), samHeaderLen); } diff --git a/src/api/internal/SamFormatParser_p.cpp b/src/api/internal/SamFormatParser_p.cpp index 02e9889..316f75f 100644 --- a/src/api/internal/SamFormatParser_p.cpp +++ b/src/api/internal/SamFormatParser_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 19 April 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for parsing SAM header text into SamHeader object // *************************************************************************** @@ -39,7 +39,6 @@ void SamFormatParser::Parse(const string& headerText) { string headerLine(""); while ( getline(headerStream, headerLine) ) ParseSamLine(headerLine); - return; } void SamFormatParser::ParseSamLine(const string& line) { @@ -57,8 +56,6 @@ void SamFormatParser::ParseSamLine(const string& line) { else if ( firstToken == Constants::SAM_CO_BEGIN_TOKEN) ParseCOLine(restOfLine); else cerr << "SamFormatParser ERROR: unknown token: " << firstToken << endl; - - return; } void SamFormatParser::ParseHDLine(const string& line) { @@ -77,17 +74,15 @@ void SamFormatParser::ParseHDLine(const string& line) { // set header contents if ( tokenTag == Constants::SAM_HD_VERSION_TAG ) m_header.Version = tokenValue; - else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue; else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG ) m_header.SortOrder = tokenValue; + else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue; else cerr << "SamFormatParser ERROR: unknown HD tag: " << tokenTag << endl; } // if @HD line exists, VN must be provided - if ( !m_header.HasVersion() ) { + if ( !m_header.HasVersion() ) cerr << "SamFormatParser ERROR: @HD line is missing VN tag" << endl; - return; - } } void SamFormatParser::ParseSQLine(const string& line) { @@ -110,27 +105,30 @@ void SamFormatParser::ParseSQLine(const string& line) { if ( tokenTag == Constants::SAM_SQ_NAME_TAG ) seq.Name = tokenValue; else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG ) seq.Length = tokenValue; else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue; - else if ( tokenTag == Constants::SAM_SQ_URI_TAG ) seq.URI = tokenValue; else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG ) seq.Checksum = tokenValue; else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG ) seq.Species = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_URI_TAG ) seq.URI = tokenValue; else cerr << "SamFormatParser ERROR: unknown SQ tag: " << tokenTag << endl; } + bool isMissingRequiredFields = false; + // if @SQ line exists, SN must be provided if ( !seq.HasName() ) { + isMissingRequiredFields = true; cerr << "SamFormatParser ERROR: @SQ line is missing SN tag" << endl; - return; } // if @SQ line exists, LN must be provided if ( !seq.HasLength() ) { + isMissingRequiredFields = true; cerr << "SamFormatParser ERROR: @SQ line is missing LN tag" << endl; - return; } // store SAM sequence entry - m_header.Sequences.Add(seq); + if ( !isMissingRequiredFields ) + m_header.Sequences.Add(seq); } void SamFormatParser::ParseRGLine(const string& line) { @@ -151,36 +149,38 @@ void SamFormatParser::ParseRGLine(const string& line) { // set read group contents if ( tokenTag == Constants::SAM_RG_ID_TAG ) rg.ID = tokenValue; - else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG ) rg.Sample = tokenValue; - else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG ) rg.Library = tokenValue; else if ( tokenTag == Constants::SAM_RG_DESCRIPTION_TAG ) rg.Description = tokenValue; + else if ( tokenTag == Constants::SAM_RG_FLOWORDER_TAG ) rg.FlowOrder = tokenValue; + else if ( tokenTag == Constants::SAM_RG_KEYSEQUENCE_TAG ) rg.KeySequence = tokenValue; + else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG ) rg.Library = tokenValue; else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG ) rg.PlatformUnit = tokenValue; else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize = tokenValue; - else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG ) rg.SequencingCenter = tokenValue; else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG ) rg.ProductionDate = tokenValue; + else if ( tokenTag == Constants::SAM_RG_PROGRAM_TAG ) rg.Program = tokenValue; + else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG ) rg.Sample = tokenValue; + else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG ) rg.SequencingCenter = tokenValue; else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG ) rg.SequencingTechnology = tokenValue; else cerr << "SamFormatParser ERROR: unknown RG tag: " << tokenTag << endl; } + bool isMissingRequiredFields = false; + // if @RG line exists, ID must be provided if ( !rg.HasID() ) { + isMissingRequiredFields = true; cerr << "SamFormatParser ERROR: @RG line is missing ID tag" << endl; - return; - } - - // if @RG line exists, SM must be provided - if ( !rg.HasSample() ) { - cerr << "SamFormatParser ERROR: @RG line is missing SM tag" << endl; - return; } // store SAM read group entry - m_header.ReadGroups.Add(rg); + if ( !isMissingRequiredFields ) + m_header.ReadGroups.Add(rg); } void SamFormatParser::ParsePGLine(const string& line) { + SamProgram pg; + // split string into tokens vector tokens = Split(line, Constants::SAM_TAB); @@ -193,19 +193,27 @@ void SamFormatParser::ParsePGLine(const string& line) { const string tokenTag = (*tokenIter).substr(0,2); const string tokenValue = (*tokenIter).substr(3); - // set header contents - if ( tokenTag == Constants::SAM_PG_NAME_TAG ) m_header.ProgramName = tokenValue; - else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) m_header.ProgramVersion = tokenValue; - else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) m_header.ProgramCommandLine = tokenValue; + // set program record contents + if ( tokenTag == Constants::SAM_PG_ID_TAG ) pg.ID = tokenValue; + else if ( tokenTag == Constants::SAM_PG_NAME_TAG ) pg.Name = tokenValue; + else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) pg.CommandLine = tokenValue; + else if ( tokenTag == Constants::SAM_PG_PREVIOUSPROGRAM_TAG ) pg.PreviousProgramID = tokenValue; + else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) pg.Version = tokenValue; else cerr << "SamFormatParser ERROR: unknown PG tag: " << tokenTag << endl; } + bool isMissingRequiredFields = false; + // if @PG line exists, ID must be provided - if ( !m_header.HasProgramName() ) { - cerr << "SamFormatParser ERROR:- @PG line is missing ID tag" << endl; - return; + if ( !pg.HasID() ) { + isMissingRequiredFields = true; + cerr << "SamFormatParser ERROR: @PG line is missing ID tag" << endl; } + + // store SAM program record + if ( !isMissingRequiredFields ) + m_header.Programs.Add(pg); } void SamFormatParser::ParseCOLine(const string& line) { diff --git a/src/api/internal/SamFormatPrinter_p.cpp b/src/api/internal/SamFormatPrinter_p.cpp index 69c78df..1e670b0 100644 --- a/src/api/internal/SamFormatPrinter_p.cpp +++ b/src/api/internal/SamFormatPrinter_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 19 April 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for printing formatted SAM header to string // *************************************************************************** @@ -88,14 +88,14 @@ void SamFormatPrinter::PrintSQ(std::stringstream& out) const { if ( seq.HasChecksum() ) out << FormatTag(Constants::SAM_SQ_CHECKSUM_TAG, seq.Checksum); - // UR: - if ( seq.HasURI() ) - out << FormatTag(Constants::SAM_SQ_URI_TAG, seq.URI); - // SP: if ( seq.HasSpecies() ) out << FormatTag(Constants::SAM_SQ_SPECIES_TAG, seq.Species); + // UR: + if ( seq.HasURI() ) + out << FormatTag(Constants::SAM_SQ_URI_TAG, seq.URI); + // newline out << endl; } @@ -109,39 +109,54 @@ void SamFormatPrinter::PrintRG(std::stringstream& out) const { for ( ; rgIter != rgEnd; ++rgIter ) { const SamReadGroup& rg = (*rgIter); - // @RG ID: SM: + // @RG ID: out << Constants::SAM_RG_BEGIN_TOKEN - << FormatTag(Constants::SAM_RG_ID_TAG, rg.ID) - << FormatTag(Constants::SAM_RG_SAMPLE_TAG, rg.Sample); + << FormatTag(Constants::SAM_RG_ID_TAG, rg.ID); - // LB: - if ( rg.HasLibrary() ) - out << FormatTag(Constants::SAM_RG_LIBRARY_TAG, rg.Library); + // CN: + if ( rg.HasSequencingCenter() ) + out << FormatTag(Constants::SAM_RG_SEQCENTER_TAG, rg.SequencingCenter); // DS: if ( rg.HasDescription() ) out << FormatTag(Constants::SAM_RG_DESCRIPTION_TAG, rg.Description); - // PU: - if ( rg.HasPlatformUnit() ) - out << FormatTag(Constants::SAM_RG_PLATFORMUNIT_TAG, rg.PlatformUnit); + // DT: + if ( rg.HasProductionDate() ) + out << FormatTag(Constants::SAM_RG_PRODUCTIONDATE_TAG, rg.ProductionDate); + + // FO: + if ( rg.HasFlowOrder() ) + out << FormatTag(Constants::SAM_RG_FLOWORDER_TAG, rg.FlowOrder); + + // KS: + if ( rg.HasKeySequence() ) + out << FormatTag(Constants::SAM_RG_KEYSEQUENCE_TAG, rg.KeySequence); + + // LB: + if ( rg.HasLibrary() ) + out << FormatTag(Constants::SAM_RG_LIBRARY_TAG, rg.Library); + + // PG: + if ( rg.HasProgram() ) + out << FormatTag(Constants::SAM_RG_PROGRAM_TAG, rg.Program); // PI: if ( rg.HasPredictedInsertSize() ) out << FormatTag(Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG, rg.PredictedInsertSize); - // CN: - if ( rg.HasSequencingCenter() ) - out << FormatTag(Constants::SAM_RG_SEQCENTER_TAG, rg.SequencingCenter); - - // DT: - if ( rg.HasProductionDate() ) - out << FormatTag(Constants::SAM_RG_PRODUCTIONDATE_TAG, rg.ProductionDate); - // PL: if ( rg.HasSequencingTechnology() ) out << FormatTag(Constants::SAM_RG_SEQTECHNOLOGY_TAG, rg.SequencingTechnology); + // PU: + if ( rg.HasPlatformUnit() ) + out << FormatTag(Constants::SAM_RG_PLATFORMUNIT_TAG, rg.PlatformUnit); + + // SM: + if ( rg.HasSample() ) + out << FormatTag(Constants::SAM_RG_SAMPLE_TAG, rg.Sample); + // newline out << endl; } @@ -149,20 +164,31 @@ void SamFormatPrinter::PrintRG(std::stringstream& out) const { void SamFormatPrinter::PrintPG(std::stringstream& out) const { - // if header has @PG data - if ( m_header.HasProgramName() ) { + // iterate over program record entries + SamProgramConstIterator pgIter = m_header.Programs.ConstBegin(); + SamProgramConstIterator pgEnd = m_header.Programs.ConstEnd(); + for ( ; pgIter != pgEnd; ++pgIter ) { + const SamProgram& pg = (*pgIter); - // @PG ID: + // @PG ID: out << Constants::SAM_PG_BEGIN_TOKEN - << FormatTag(Constants::SAM_PG_NAME_TAG, m_header.ProgramName); + << FormatTag(Constants::SAM_PG_ID_TAG, pg.ID); + + // PN: + if ( pg.HasName() ) + out << FormatTag(Constants::SAM_PG_NAME_TAG, pg.Name); + + // CL: + if ( pg.HasCommandLine() ) + out << FormatTag(Constants::SAM_PG_COMMANDLINE_TAG, pg.CommandLine); - // VN: - if ( m_header.HasProgramVersion() ) - out << FormatTag(Constants::SAM_PG_VERSION_TAG, m_header.ProgramVersion); + // PP: + if ( pg.HasPreviousProgramID() ) + out << FormatTag(Constants::SAM_PG_PREVIOUSPROGRAM_TAG, pg.PreviousProgramID); - // CL: - if ( m_header.HasProgramCommandLine() ) - out << FormatTag(Constants::SAM_PG_COMMANDLINE_TAG, m_header.ProgramCommandLine); + // VN: + if ( pg.HasVersion() ) + out << FormatTag(Constants::SAM_PG_VERSION_TAG, pg.Version); // newline out << endl; diff --git a/src/api/internal/SamHeaderValidator_p.cpp b/src/api/internal/SamHeaderValidator_p.cpp index 4aa6395..7ecec2c 100644 --- a/src/api/internal/SamHeaderValidator_p.cpp +++ b/src/api/internal/SamHeaderValidator_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 18 April 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for validating SamHeader data // *************************************************************************** @@ -15,11 +15,36 @@ using namespace BamTools; using namespace BamTools::Internal; +#include #include #include #include using namespace std; +namespace BamTools { +namespace Internal { + +bool caseInsensitiveCompare(const string& lhs, const string& rhs) { + + // can omit checking chars if lengths not equal + const int lhsLength = lhs.length(); + const int rhsLength = rhs.length(); + if ( lhsLength != rhsLength ) + return false; + + // do *basic* toupper checks on each string char's + for ( int i = 0; i < lhsLength; ++i ) { + if ( toupper( (int)lhs.at(i)) != toupper( (int)rhs.at(i)) ) + return false; + } + + // otherwise OK + return true; +} + +} // namespace Internal +} // namespace BamTools + // ------------------------------------------------------------------------ // Allow validation rules to vary, as needed, between SAM header versions // @@ -32,7 +57,10 @@ using namespace std; // // use rule introduced with version 2.0 static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0); +static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1,1); +static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1,2); static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3); +static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1,4); // TODO: This functionality is currently unused. // Make validation "version-aware." @@ -56,7 +84,7 @@ bool SamHeaderValidator::Validate(bool verbose) { isValid &= ValidateMetadata(); isValid &= ValidateSequenceDictionary(); isValid &= ValidateReadGroupDictionary(); - isValid &= ValidateProgramData(); + isValid &= ValidateProgramChain(); // report errors if desired if ( verbose ) { @@ -168,8 +196,6 @@ bool SamHeaderValidator::ValidateGroupOrder(void) { bool SamHeaderValidator::ValidateSequenceDictionary(void) { - // TODO: warn/error if no sequences ? - bool isValid = true; // check for unique sequence names @@ -200,9 +226,9 @@ bool SamHeaderValidator::ContainsUniqueSequenceNames(void) { SamSequenceConstIterator seqEnd = sequences.ConstEnd(); for ( ; seqIter != seqEnd; ++seqIter ) { const SamSequence& seq = (*seqIter); - const string& name = seq.Name; // lookup sequence name + const string& name = seq.Name; nameIter = sequenceNames.find(name); // error if found (duplicate entry) @@ -269,8 +295,6 @@ bool SamHeaderValidator::CheckLengthInRange(const string& length) { bool SamHeaderValidator::ValidateReadGroupDictionary(void) { - // TODO: warn/error if no read groups ? - bool isValid = true; // check for unique read group IDs & platform units @@ -367,11 +391,13 @@ bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) { return true; // if technology is valid keyword - if ( Is454(technology) || - IsHelicos(technology) || - IsIllumina(technology) || - IsPacBio(technology) || - IsSolid(technology) + if ( caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID) ) { return true; @@ -382,38 +408,7 @@ bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) { return false; } -bool SamHeaderValidator::Is454(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_454 || - technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_UPPER - ); -} - -bool SamHeaderValidator::IsHelicos(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER - ); -} - -bool SamHeaderValidator::IsIllumina(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER - ); -} - -bool SamHeaderValidator::IsPacBio(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER - ); -} - -bool SamHeaderValidator::IsSolid(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_UPPER - ); -} - -bool SamHeaderValidator::ValidateProgramData(void) { +bool SamHeaderValidator::ValidateProgramChain(void) { bool isValid = true; isValid &= ContainsUniqueProgramIds(); isValid &= ValidatePreviousProgramIds(); @@ -421,17 +416,60 @@ bool SamHeaderValidator::ValidateProgramData(void) { } bool SamHeaderValidator::ContainsUniqueProgramIds(void) { + bool isValid = true; - // TODO: once we have ability to handle multiple @PG entries, - // check here for duplicate ID's - // but for now, just return true + set programIds; + set::iterator pgIdIter; + + // iterate over program records + const SamProgramChain& programs = m_header.Programs; + SamProgramConstIterator pgIter = programs.ConstBegin(); + SamProgramConstIterator pgEnd = programs.ConstEnd(); + for ( ; pgIter != pgEnd; ++pgIter ) { + const SamProgram& pg = (*pgIter); + + // lookup program ID + const string& pgId = pg.ID; + pgIdIter = programIds.find(pgId); + + // error if found (duplicate entry) + if ( pgIdIter != programIds.end() ) { + AddError("Program ID (ID): " + pgId + " is not unique"); + isValid = false; + } + + // otherwise ok, store ID + programIds.insert(pgId); + } + + // return validation state return isValid; } bool SamHeaderValidator::ValidatePreviousProgramIds(void) { + bool isValid = true; - // TODO: check that PP entries are valid later, after we get multiple @PG-entry handling - // just return true for now + + // iterate over program records + const SamProgramChain& programs = m_header.Programs; + SamProgramConstIterator pgIter = programs.ConstBegin(); + SamProgramConstIterator pgEnd = programs.ConstEnd(); + for ( ; pgIter != pgEnd; ++pgIter ) { + const SamProgram& pg = (*pgIter); + + // ignore record for validation if PreviousProgramID is empty + const string& ppId = pg.PreviousProgramID; + if ( ppId.empty() ) + continue; + + // see if program "chain" contains an entry for ppId + if ( !programs.Contains(ppId) ) { + AddError("PreviousProgramID (PP): " + ppId + " is not a known ID"); + isValid = false; + } + } + + // return validation state return isValid; } void SamHeaderValidator::AddError(const string& message) { diff --git a/src/api/internal/SamHeaderValidator_p.h b/src/api/internal/SamHeaderValidator_p.h index ee48f89..06a82ab 100644 --- a/src/api/internal/SamHeaderValidator_p.h +++ b/src/api/internal/SamHeaderValidator_p.h @@ -64,18 +64,13 @@ class SamHeaderValidator { // validate read group dictionary bool ValidateReadGroupDictionary(void); - bool ValidateReadGroup(const SamReadGroup& rg); bool ContainsUniqueIDsAndPlatformUnits(void); + bool ValidateReadGroup(const SamReadGroup& rg); bool CheckReadGroupID(const std::string& id); bool CheckSequencingTechnology(const std::string& technology); - bool Is454(const std::string& technology); - bool IsHelicos(const std::string& technology); - bool IsIllumina(const std::string& technology); - bool IsPacBio(const std::string& technology); - bool IsSolid(const std::string& technology); // validate program data - bool ValidateProgramData(void); + bool ValidateProgramChain(void); bool ContainsUniqueProgramIds(void); bool ValidatePreviousProgramIds(void); diff --git a/src/toolkit/bamtools_resolve.cpp b/src/toolkit/bamtools_resolve.cpp new file mode 100644 index 0000000..4c02640 --- /dev/null +++ b/src/toolkit/bamtools_resolve.cpp @@ -0,0 +1,120 @@ +// *************************************************************************** +// bamtools_resolve.cpp (c) 2011 +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 7 April 2011 +// --------------------------------------------------------------------------- +// Resolves paired-end reads (marking the IsProperPair flag as needed) +// *************************************************************************** + +#include "bamtools_resolve.h" + +#include +#include +#include +#include +using namespace BamTools; + +#include +#include +using namespace std; + +// --------------------------------------------- +// ResolveSettings implementation + +struct ResolveTool::ResolveSettings { + + // flags + bool HasInput; + bool HasOutput; + bool IsForceCompression; + + // filenames + string InputFilename; + string OutputFilename; + + // constructor + ResolveSettings(void) + : HasInput(false) + , HasOutput(false) + , IsForceCompression(false) + , InputFilename(Options::StandardIn()) + , OutputFilename(Options::StandardOut()) + { } +}; + +// --------------------------------------------- +// ResolveToolPrivate implementation + +struct ResolveTool::ResolveToolPrivate { + + // ctor & dtor + public: + ResolveToolPrivate(ResolveTool::ResolveSettings* settings) + : m_settings(settings) + { } + + ~ResolveToolPrivate(void) { } + + // 'public' interface + public: + bool Run(void); + + // internal methods + private: + + // data members + private: + ResolveTool::ResolveSettings* m_settings; +}; + +bool ResolveTool::ResolveToolPrivate::Run(void) { + cerr << "Resoling BAM file..." << endl; + return true; +} + +// --------------------------------------------- +// ResolveTool implementation + +ResolveTool::ResolveTool(void) + : AbstractTool() + , m_settings(new ResolveSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo("bamtools resolve", "resolves paired-end reads (marking the IsProperPair flag as needed)", "[-in -in ...] [-out | [-forceCompression]] [resolveOptions]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFilename, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); + Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts); +} + +ResolveTool::~ResolveTool(void) { + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int ResolveTool::Help(void) { + Options::DisplayHelp(); + return 0; +} + +int ResolveTool::Run(int argc, char* argv[]) { + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize ResolveTool + m_impl = new ResolveToolPrivate(m_settings); + + // run ResolveTool, return success/failure + if ( m_impl->Run() ) return 0; + else return 1; +} diff --git a/src/toolkit/bamtools_resolve.h b/src/toolkit/bamtools_resolve.h new file mode 100644 index 0000000..96bd162 --- /dev/null +++ b/src/toolkit/bamtools_resolve.h @@ -0,0 +1,38 @@ +// *************************************************************************** +// bamtools_resolve.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 7 April 2011 +// --------------------------------------------------------------------------- +// Resolves paired-end reads (marking the IsProperPair flag as needed) +// *************************************************************************** + +#ifndef BAMTOOLS_RESOLVE_H +#define BAMTOOLS_RESOLVE_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class ResolveTool : public AbstractTool { + + public: + ResolveTool(void); + ~ResolveTool(void); + + public: + int Help(void); + int Run(int argc, char* argv[]); + + private: + struct ResolveSettings; + ResolveSettings* m_settings; + + struct ResolveToolPrivate; + ResolveToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_RESOLVE_H