From 2e049ed7f28881bce09653e60f5aea54bfd7afbf Mon Sep 17 00:00:00 2001 From: derek Date: Fri, 7 Oct 2011 15:12:57 -0400 Subject: [PATCH] Removed STDERR pollution by API * Accomplished this by introducing a GetErrorString() on most API objects. When a method returns false, you can ignore it, parse the error string to decide what to do next, prompt the user, make a sandwich, whatever. But nothing should leak out to the console. * Internally the error messages are passed by a new BamException class. This new exception should not cross the library boundary. The exception should be caught "under the hood" and its what() string should be (possibly formatted and) stored as the error string in one of the high- --- src/api/BamAlignment.cpp | 138 ++-- src/api/BamAlignment.h | 224 +++--- src/api/BamAux.h | 15 +- src/api/BamConstants.h | 118 +-- src/api/BamIndex.h | 20 +- src/api/BamMultiReader.cpp | 15 +- src/api/BamMultiReader.h | 13 +- src/api/BamReader.cpp | 13 +- src/api/BamReader.h | 11 +- src/api/BamWriter.cpp | 17 +- src/api/BamWriter.h | 16 +- src/api/CMakeLists.txt | 1 + src/api/SamHeader.cpp | 64 +- src/api/SamHeader.h | 27 +- src/api/internal/BamException_p.cpp | 15 + src/api/internal/BamException_p.h | 51 ++ src/api/internal/BamHeader_p.cpp | 102 ++- src/api/internal/BamHeader_p.h | 14 +- src/api/internal/BamIndexFactory_p.cpp | 12 +- src/api/internal/BamMultiMerger_p.h | 4 + src/api/internal/BamMultiReader_p.cpp | 294 +++++--- src/api/internal/BamMultiReader_p.h | 13 +- .../internal/BamRandomAccessController_p.cpp | 76 +- .../internal/BamRandomAccessController_p.h | 31 +- src/api/internal/BamReader_p.cpp | 269 ++++--- src/api/internal/BamReader_p.h | 13 +- src/api/internal/BamStandardIndex_p.cpp | 670 +++++++++--------- src/api/internal/BamStandardIndex_p.h | 91 +-- src/api/internal/BamToolsIndex_p.cpp | 510 +++++++------ src/api/internal/BamToolsIndex_p.h | 73 +- src/api/internal/BamWriter_p.cpp | 502 +++++++------ src/api/internal/BamWriter_p.h | 13 +- src/api/internal/BgzfStream_p.cpp | 362 +++++----- src/api/internal/BgzfStream_p.h | 48 +- src/api/internal/SamFormatParser_p.cpp | 88 ++- src/api/internal/SamFormatPrinter_p.cpp | 19 +- src/api/internal/SamFormatPrinter_p.h | 3 +- src/api/internal/SamHeaderValidator_p.cpp | 122 ++-- src/api/internal/SamHeaderValidator_p.h | 14 +- 39 files changed, 2310 insertions(+), 1791 deletions(-) create mode 100644 src/api/internal/BamException_p.cpp create mode 100644 src/api/internal/BamException_p.h diff --git a/src/api/BamAlignment.cpp b/src/api/BamAlignment.cpp index 980303a..0eff5c7 100644 --- a/src/api/BamAlignment.cpp +++ b/src/api/BamAlignment.cpp @@ -2,7 +2,7 @@ // BamAlignment.cpp (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 4 October 2011 (DB) +// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** @@ -10,15 +10,6 @@ #include #include using namespace BamTools; - -#include -#include -#include -#include -#include -#include -#include -#include using namespace std; /*! \class BamTools::BamAlignment @@ -162,7 +153,7 @@ bool BamAlignment::BuildCharData(void) { // calculate character lengths/offsets const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE; - const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations * 4); + const unsigned int seqDataOffset = SupportData.QueryNameLength + (SupportData.NumCigarOperations*4); const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2; const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength; const unsigned int tagDataLength = dataLength - tagDataOffset; @@ -185,8 +176,8 @@ bool BamAlignment::BuildCharData(void) { QueryBases.clear(); if ( hasSeqData ) { QueryBases.reserve(SupportData.QuerySequenceLength); - for (unsigned int i = 0; i < SupportData.QuerySequenceLength; ++i) { - char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; + for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) { + const char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; QueryBases.append(1, singleBase); } } @@ -195,8 +186,8 @@ bool BamAlignment::BuildCharData(void) { Qualities.clear(); if ( hasQualData ) { Qualities.reserve(SupportData.QuerySequenceLength); - for (unsigned int i = 0; i < SupportData.QuerySequenceLength; ++i) { - char singleQuality = (char)(qualData[i]+33); + for ( size_t i = 0; i < SupportData.QuerySequenceLength; ++i ) { + const char singleQuality = static_cast(qualData[i]+33); Qualities.append(1, singleQuality); } } @@ -218,7 +209,7 @@ bool BamAlignment::BuildCharData(void) { for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); - switch (op.Type) { + switch ( op.Type ) { // for 'M', 'I', '=', 'X' - write bases case (Constants::BAM_CIGAR_MATCH_CHAR) : @@ -253,11 +244,11 @@ bool BamAlignment::BuildCharData(void) { case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : break; - // shouldn't get here + // invalid CIGAR op-code default: - cerr << "BamAlignment ERROR: invalid CIGAR operation type: " - << op.Type << endl; - exit(1); + const string message = string("invalid CIGAR operation type: ") + op.Type; + SetErrorString("BamAlignment::BuildCharData", message); + return false; } } } @@ -266,8 +257,8 @@ bool BamAlignment::BuildCharData(void) { TagData.clear(); if ( hasTagData ) { if ( IsBigEndian ) { - int i = 0; - while ( (unsigned int)i < tagDataLength ) { + size_t i = 0; + while ( i < tagDataLength ) { i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) const char type = tagData[i]; // get tag type at position i @@ -313,12 +304,12 @@ bool BamAlignment::BuildCharData(void) { // swap endian-ness of number of elements in place, then retrieve for loop BamTools::SwapEndian_32p(&tagData[i]); - int32_t numElements; + uint32_t numElements; memcpy(&numElements, &tagData[i], sizeof(uint32_t)); i += sizeof(uint32_t); // swap endian-ness of array elements - for ( int j = 0; j < numElements; ++j ) { + for ( size_t j = 0; j < numElements; ++j ) { switch (arrayType) { case (Constants::BAM_TAG_TYPE_INT8) : case (Constants::BAM_TAG_TYPE_UINT8) : @@ -337,9 +328,8 @@ bool BamAlignment::BuildCharData(void) { i += sizeof(uint32_t); break; default: - // error case - cerr << "BamAlignment ERROR: unknown binary array type encountered: " - << arrayType << endl; + const string message = string("invalid binary array type: ") + arrayType; + SetErrorString("BamAlignment::BuildCharData", message); return false; } } @@ -347,18 +337,18 @@ bool BamAlignment::BuildCharData(void) { break; } - // shouldn't get here + // invalid tag type-code default : - cerr << "BamAlignment ERROR: invalid tag value type: " - << type << endl; - exit(1); + const string message = string("invalid tag type: ") + type; + SetErrorString("BamAlignment::BuildCharData", message); + return false; } } } // store tagData in alignment TagData.resize(tagDataLength); - memcpy((char*)TagData.data(), tagData, tagDataLength); + memcpy((char*)(TagData.data()), tagData, tagDataLength); } // clear the core-only flag @@ -414,7 +404,7 @@ bool BamAlignment::BuildCharData(void) { bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, - unsigned int& numBytesParsed) + unsigned int& numBytesParsed) const { while ( numBytesParsed < tagDataLength ) { @@ -468,6 +458,8 @@ bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { */ int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { + // TODO: Come back to this for coordinate issues !!! + // initialize alignment end to starting position int alignEnd = Position; @@ -493,6 +485,18 @@ int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { return alignEnd; } +/*! \fn std::string BamAlignment::GetErrorString(void) const + \brief Returns a description of the last error that occurred + + This method allows elimnation of STDERR pollution. Developers of client code + may choose how the messages are displayed to the user, if at all. + + \return description of last error that occurred +*/ +std::string BamAlignment::GetErrorString(void) const { + return ErrorString; +} + /*! \fn bool BamAlignment::GetReadGroup(std::string& readGroup) const \brief Retrieves value of read group tag ("RG"). @@ -540,9 +544,17 @@ bool BamAlignment::GetReadGroup(std::string& readGroup) const { */ bool BamAlignment::GetTagType(const std::string& tag, char& type) const { - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) + // skip if alignment is core-only + if ( SupportData.HasCoreOnly ) { + // TODO: set error string? + return false; + } + + // skip if no tags present + if ( TagData.empty() ) { + // TODO: set error string? return false; + } // localize the tag data char* pTagData = (char*)TagData.data(); @@ -550,8 +562,10 @@ bool BamAlignment::GetTagType(const std::string& tag, char& type) const { unsigned int numBytesParsed = 0; // if tag not found, return failure - if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ){ + // TODO: set error string? return false; + } // otherwise, retrieve & validate tag type code type = *(pTagData - 1); @@ -571,8 +585,8 @@ bool BamAlignment::GetTagType(const std::string& tag, char& type) const { // unknown tag type default: - cerr << "BamAlignment ERROR: unknown tag type encountered: " - << type << endl; + const string message = string("invalid tag type: ") + type; + SetErrorString("BamAlignment::GetTagType", message); return false; } } @@ -686,17 +700,15 @@ bool BamAlignment::IsSecondMate(void) const { \return \c true if both \a tag and \a type are correct sizes */ -bool BamAlignment::IsValidSize(const string& tag, const string& type) { +bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const { return (tag.size() == Constants::BAM_TAG_TAGSIZE) && (type.size() == Constants::BAM_TAG_TYPESIZE); } -/*! \fn bool BamAlignment::RemoveTag(const std::string& tag) +/*! \fn void BamAlignment::RemoveTag(const std::string& tag) \brief Removes field from BAM tags. - - \return \c true if tag was removed successfully (or didn't exist before) */ -bool BamAlignment::RemoveTag(const std::string& tag) { +void BamAlignment::RemoveTag(const std::string& tag) { // if char data not populated, do that first if ( SupportData.HasCoreOnly ) @@ -704,7 +716,7 @@ bool BamAlignment::RemoveTag(const std::string& tag) { // skip if no tags available if ( TagData.empty() ) - return false; + return; // localize the tag data char* pOriginalTagData = (char*)TagData.data(); @@ -713,19 +725,19 @@ bool BamAlignment::RemoveTag(const std::string& tag) { unsigned int newTagDataLength = 0; unsigned int numBytesParsed = 0; - // if tag not found, simply return true + // skip if tag not found if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) - return true; + return; // otherwise, remove it - char* newTagData = new char[originalTagDataLength]; + RaiiBuffer newTagData(originalTagDataLength); // copy original tag data up til desired tag pTagData -= 3; numBytesParsed -= 3; const unsigned int beginningTagDataLength = numBytesParsed; newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); + memcpy(newTagData.Buffer, pOriginalTagData, numBytesParsed); // attemp to skip to next tag const char* pTagStorageType = pTagData + 2; @@ -736,15 +748,21 @@ bool BamAlignment::RemoveTag(const std::string& tag) { // squeeze remaining tag data const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); + memcpy(newTagData.Buffer + beginningTagDataLength, pTagData, endTagDataLength ); // save modified tag data in alignment - TagData.assign(newTagData, beginningTagDataLength + endTagDataLength); + TagData.assign(newTagData.Buffer, beginningTagDataLength + endTagDataLength); } +} - // clean up & return success - delete[] newTagData; - return true; +/*! \fn void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const + \internal + + Sets a formatted error string for this alignment. +*/ +void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const { + static const string SEPARATOR = ": "; + ErrorString = where + SEPARATOR + what; } /*! \fn void BamAlignment::SetIsDuplicate(bool ok) @@ -877,7 +895,7 @@ void BamAlignment::SetIsUnmapped(bool ok) { */ bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, - unsigned int& numBytesParsed) + unsigned int& numBytesParsed) const { switch (storageType) { @@ -922,7 +940,7 @@ bool BamAlignment::SkipToNextTag(const char storageType, // read number of elements int32_t numElements; - memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped if necessary + memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped, if needed numBytesParsed += sizeof(uint32_t); pTagData += sizeof(uint32_t); @@ -943,8 +961,8 @@ bool BamAlignment::SkipToNextTag(const char storageType, bytesToSkip = numElements*sizeof(uint32_t); break; default: - cerr << "BamAlignment ERROR: unknown binary array type encountered: " - << arrayType << endl; + const string message = string("invalid binary array type: ") + arrayType; + SetErrorString("BamAlignment::SkipToNextTag", message); return false; } @@ -955,11 +973,11 @@ bool BamAlignment::SkipToNextTag(const char storageType, } default: - cerr << "BamAlignment ERROR: unknown tag type encountered" - << storageType << endl; + const string message = string("invalid tag type: ") + storageType; + SetErrorString("BamAlignment::SkipToNextTag", message); return false; } - // return success + // if we get here, tag skipped OK - return success return true; } diff --git a/src/api/BamAlignment.h b/src/api/BamAlignment.h index 33175b5..b096499 100644 --- a/src/api/BamAlignment.h +++ b/src/api/BamAlignment.h @@ -2,7 +2,7 @@ // BamAlignment.h (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 4 October 2011 (DB) +// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the BamAlignment data structure // *************************************************************************** @@ -13,11 +13,8 @@ #include #include #include -#include -#include #include #include -#include #include #include @@ -75,24 +72,16 @@ struct API_EXPORT BamAlignment { public: // add a new tag - template bool AddTag(const std::string& tag, - const std::string& type, - const T& value); - template bool AddTag(const std::string& tag, - const std::vector& values); + template bool AddTag(const std::string& tag, const std::string& type, const T& value); + template bool AddTag(const std::string& tag, const std::vector& values); // edit (or append) tag - template bool EditTag(const std::string& tag, - const std::string& type, - const T& value); - template bool EditTag(const std::string& tag, - const std::vector& values); + template bool EditTag(const std::string& tag, const std::string& type, const T& value); + template bool EditTag(const std::string& tag, const std::vector& values); // retrieves tag data - template bool GetTag(const std::string& tag, - T& destination) const; - template bool GetTag(const std::string& tag, - std::vector& destination) const; + template bool GetTag(const std::string& tag, T& destination) const; + template bool GetTag(const std::string& tag, std::vector& destination) const; // retrieves the BAM type-code for requested tag // (returns whether or not tag exists, and type-code is valid) @@ -101,12 +90,12 @@ struct API_EXPORT BamAlignment { // legacy methods (consider deprecated, but still available) bool GetEditDistance(uint32_t& editDistance) const; // retrieves value of "NM" tag bool GetReadGroup(std::string& readGroup) const; // retrieves value of "RG" tag - + // returns true if alignment has a record for this tag name bool HasTag(const std::string& tag) const; // removes a tag - bool RemoveTag(const std::string& tag); + void RemoveTag(const std::string& tag); // additional methods public: @@ -116,6 +105,9 @@ struct API_EXPORT BamAlignment { // calculates alignment end position int GetEndPosition(bool usePadded = false, bool zeroBased = true) const; + // returns a description of the last error that occurred + std::string GetErrorString(void) const; + // public data fields public: std::string Name; // read name @@ -138,15 +130,17 @@ struct API_EXPORT BamAlignment { //! \cond // internal utility methods private: - static bool FindTag(const std::string& tag, - char*& pTagData, - const unsigned int& tagDataLength, - unsigned int& numBytesParsed); - static bool IsValidSize(const std::string& tag, - const std::string& type); - static bool SkipToNextTag(const char storageType, + bool FindTag(const std::string& tag, + char*& pTagData, + const unsigned int& tagDataLength, + unsigned int& numBytesParsed) const; + bool IsValidSize(const std::string& tag, + const std::string& type) const; + void SetErrorString(const std::string& where, + const std::string& what) const; + bool SkipToNextTag(const char storageType, char*& pTagData, - unsigned int& numBytesParsed); + unsigned int& numBytesParsed) const; // internal data private: @@ -173,6 +167,8 @@ struct API_EXPORT BamAlignment { BamAlignmentSupportData SupportData; friend class Internal::BamReaderPrivate; friend class Internal::BamWriterPrivate; + + mutable std::string ErrorString; // mutable to allow updates even in logically const methods //! \endcond }; @@ -188,10 +184,17 @@ inline bool BamAlignment::AddTag(const std::string& tag, if ( SupportData.HasCoreOnly ) BuildCharData(); - // validate tag/type size & that storage type code is OK for T - if ( !IsValidSize(tag, type) ) return false; - if ( !TagTypeHelper::CanConvertTo(type.at(0)) ) + // check tag/type size + if ( !IsValidSize(tag, type) ) { + // TODO: set error string? + return false; + } + + // check that storage type code is OK for T + if ( !TagTypeHelper::CanConvertTo(type.at(0)) ) { + // TODO: set error string? return false; + } // localize the tag data char* pTagData = (char*)TagData.data(); @@ -200,8 +203,10 @@ inline bool BamAlignment::AddTag(const std::string& tag, // if tag already exists, return false // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + // TODO: set error string? return false; + } // otherwise, convert value to string union { T value; char valueBuffer[sizeof(T)]; } un; @@ -209,20 +214,17 @@ inline bool BamAlignment::AddTag(const std::string& tag, // copy original tag data to temp buffer const std::string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + sizeof(T); // leave room for new T - char* originalTagData = new char[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + const size_t newTagDataLength = tagDataLength + newTag.size() + sizeof(T); // leave room for new T + RaiiBuffer originalTagData(newTagDataLength); + memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T)); + strcat(originalTagData.Buffer + tagDataLength, newTag.data()); + memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T)); // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; + const char* newTagData = (const char*)originalTagData.Buffer; TagData.assign(newTagData, newTagDataLength); - - // clean up & return success - delete[] originalTagData; return true; } @@ -235,10 +237,17 @@ inline bool BamAlignment::AddTag(const std::string& tag, if ( SupportData.HasCoreOnly ) BuildCharData(); - // validate tag/type size & that storage type code is OK for string - if ( !IsValidSize(tag, type) ) return false; - if ( !TagTypeHelper::CanConvertTo(type.at(0)) ) + // check tag/type size + if ( !IsValidSize(tag, type) ) { + // TODO: set error string? + return false; + } + + // check that storage type code is OK for string + if ( !TagTypeHelper::CanConvertTo(type.at(0)) ) { + // TODO: set error string? return false; + } // localize the tag data char* pTagData = (char*)TagData.data(); @@ -247,24 +256,23 @@ inline bool BamAlignment::AddTag(const std::string& tag, // if tag already exists, return false // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + // TODO: set error string? return false; + } // otherwise, copy tag data to temp buffer const std::string newTag = tag + type + value; - const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term - char* originalTagData = new char[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + const size_t newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term + RaiiBuffer originalTagData(newTagDataLength); + memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term + // append newTag (removes original null-term, then appends newTag + null-term) + strcat(originalTagData.Buffer + tagDataLength, newTag.data()); // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; + const char* newTagData = (const char*)originalTagData.Buffer; TagData.assign(newTagData, newTagDataLength); - - // clean up & return success - delete[] originalTagData; return true; } @@ -287,8 +295,10 @@ inline bool BamAlignment::AddTag(const std::string& tag, // if tag already exists, return false // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + // TODO: set error string? return false; + } // build new tag's base information char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; @@ -301,28 +311,25 @@ inline bool BamAlignment::AddTag(const std::string& tag, memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); // copy current TagData string to temp buffer, leaving room for new tag's contents - const int newTagDataLength = tagDataLength + - Constants::BAM_TAG_ARRAYBASE_SIZE + - numElements*sizeof(T); - char* originalTagData = new char[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term + const size_t newTagDataLength = tagDataLength + + Constants::BAM_TAG_ARRAYBASE_SIZE + + numElements*sizeof(T); + RaiiBuffer originalTagData(newTagDataLength); + memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term // write newTagBase (removes old null term) - strcat(originalTagData + tagDataLength, (const char*)newTagBase); + strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase); // add vector elements to tag int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; for ( int i = 0 ; i < numElements; ++i ) { const T& value = values.at(i); - memcpy(originalTagData + elementsBeginOffset + i*sizeof(T), &value, sizeof(T)); + memcpy(originalTagData.Buffer + elementsBeginOffset + i*sizeof(T), &value, sizeof(T)); } // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; + const char* newTagData = (const char*)originalTagData.Buffer; TagData.assign(newTagData, newTagDataLength); - - // cleanup & return success - delete[] originalTagData; return true; } @@ -359,9 +366,17 @@ template inline bool BamAlignment::GetTag(const std::string& tag, T& destination) const { - // skip if core-only or no tags present - if ( SupportData.HasCoreOnly || TagData.empty() ) + // skip if alignment is core-only + if ( SupportData.HasCoreOnly ) { + // TODO: set error string? return false; + } + + // skip if no tags present + if ( TagData.empty() ) { + // TODO: set error string? + return false; + } // localize the tag data char* pTagData = (char*)TagData.data(); @@ -369,13 +384,19 @@ inline bool BamAlignment::GetTag(const std::string& tag, unsigned int numBytesParsed = 0; // return failure if tag not found - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + // TODO: set error string? return false; + } - // otherwise try to copy data into destination + // fetch data type const char type = *(pTagData - 1); - if ( !TagTypeHelper::CanConvertFrom(type) ) + if ( !TagTypeHelper::CanConvertFrom(type) ) { + // TODO: set error string ? return false; + } + + // determine data length int destinationLength = 0; switch ( type ) { @@ -403,18 +424,18 @@ inline bool BamAlignment::GetTag(const std::string& tag, case (Constants::BAM_TAG_TYPE_STRING) : case (Constants::BAM_TAG_TYPE_HEX) : case (Constants::BAM_TAG_TYPE_ARRAY) : - std::cerr << "BamAlignment ERROR: cannot store tag of type " << type - << " in integer destination" << std::endl; + SetErrorString("BamAlignment::GetTag", + "cannot store variable length tag data into a numeric destination"); return false; // unrecognized tag type default: - std::cerr << "BamAlignment ERROR: unknown tag type encountered: " - << type << std::endl; + const std::string message = std::string("invalid tag type: ") + type; + SetErrorString("BamAlignment::GetTag", message); return false; } - // store in destination + // store data in destination destination = 0; memcpy(&destination, pTagData, destinationLength); @@ -426,9 +447,17 @@ template<> inline bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const { - // skip if core-only or no tags present - if ( SupportData.HasCoreOnly || TagData.empty() ) + // skip if alignment is core-only + if ( SupportData.HasCoreOnly ) { + // TODO: set error string? return false; + } + + // skip if no tags present + if ( TagData.empty() ) { + // TODO: set error string? + return false; + } // localize the tag data char* pTagData = (char*)TagData.data(); @@ -436,8 +465,10 @@ inline bool BamAlignment::GetTag(const std::string& tag, unsigned int numBytesParsed = 0; // return failure if tag not found - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + // TODO: set error string? return false; + } // otherwise copy data into destination const unsigned int dataLength = strlen(pTagData); @@ -454,9 +485,17 @@ template inline bool BamAlignment::GetTag(const std::string& tag, std::vector& destination) const { - // skip if core-only or no tags present - if ( SupportData.HasCoreOnly || TagData.empty() ) + // skip if alignment is core-only + if ( SupportData.HasCoreOnly ) { + // TODO: set error string? return false; + } + + // skip if no tags present + if ( TagData.empty() ) { + // TODO: set error string? + return false; + } // localize the tag data char* pTagData = (char*)TagData.data(); @@ -464,22 +503,27 @@ inline bool BamAlignment::GetTag(const std::string& tag, unsigned int numBytesParsed = 0; // return false if tag not found - if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + // TODO: set error string? return false; + } // check that tag is array type const char tagType = *(pTagData - 1); if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) { - std::cerr << "BamAlignment ERROR: Cannot store non-array data from tag: " - << tag << " in array destination" << std::endl; + SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination"); return false; } - // calculate length of each element in tag's array + // fetch element type const char elementType = *pTagData; - if ( !TagTypeHelper::CanConvertFrom(elementType) ) + if ( !TagTypeHelper::CanConvertFrom(elementType) ) { + // TODO: set error string ? return false; + } ++pTagData; + + // calculate length of each element in tag's array int elementLength = 0; switch ( elementType ) { case (Constants::BAM_TAG_TYPE_ASCII) : @@ -503,14 +547,14 @@ inline bool BamAlignment::GetTag(const std::string& tag, case (Constants::BAM_TAG_TYPE_STRING) : case (Constants::BAM_TAG_TYPE_HEX) : case (Constants::BAM_TAG_TYPE_ARRAY) : - std::cerr << "BamAlignment ERROR: array element type: " << elementType - << " cannot be stored in integer value" << std::endl; + SetErrorString("BamAlignment::GetTag", + "invalid array data, variable-length elements are not allowed"); return false; // unknown tag type default: - std::cerr << "BamAlignment ERROR: unknown element type encountered: " - << elementType << std::endl; + const std::string message = std::string("invalid array element type: ") + elementType; + SetErrorString("BamAlignment::GetTag", message); return false; } diff --git a/src/api/BamAux.h b/src/api/BamAux.h index 4c5a99a..14e6547 100644 --- a/src/api/BamAux.h +++ b/src/api/BamAux.h @@ -2,7 +2,7 @@ // BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 4 March 2011 (DB) +// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Provides data structures & utility methods that are used throughout the API. // *************************************************************************** @@ -451,6 +451,19 @@ API_EXPORT inline unsigned short UnpackUnsignedShort(char* buffer) { return UnpackUnsignedShort( (const char*)buffer ); } +// ---------------------------------------------------------------- +// 'internal' helper structs + +struct RaiiBuffer { + RaiiBuffer(const unsigned int n) + : Buffer( new char[n]() ) + { } + ~RaiiBuffer(void) { + delete[] Buffer; + } + char* Buffer; +}; + } // namespace BamTools #endif // BAMAUX_H diff --git a/src/api/BamConstants.h b/src/api/BamConstants.h index 3e674ee..4a35d8f 100644 --- a/src/api/BamConstants.h +++ b/src/api/BamConstants.h @@ -2,7 +2,7 @@ // BamConstants.h (c) 2011 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 4 October 2011 (DB) +// Last modified: 5 October 2011 (DB) // --------------------------------------------------------------------------- // Provides basic constants for handling BAM files. // *************************************************************************** @@ -21,15 +21,15 @@ namespace BamTools { namespace Constants { -const int BAM_SIZEOF_INT = 4; +const uint8_t BAM_SIZEOF_INT = 4; // header magic number -const char* const BAM_HEADER_MAGIC = "BAM\1"; -const unsigned int BAM_HEADER_MAGIC_LENGTH = 4; +const char* const BAM_HEADER_MAGIC = "BAM\1"; +const uint8_t BAM_HEADER_MAGIC_LENGTH = 4; // BAM alignment core size -const int BAM_CORE_SIZE = 32; -const int BAM_CORE_BUFFER_SIZE = 8; +const uint8_t BAM_CORE_SIZE = 32; +const uint8_t BAM_CORE_BUFFER_SIZE = 8; // BAM alignment flags const int BAM_ALIGNMENT_PAIRED = 0x0001; @@ -46,15 +46,15 @@ const int BAM_ALIGNMENT_DUPLICATE = 0x0400; // CIGAR constants const char* const BAM_CIGAR_LOOKUP = "MIDNSHP=X"; -const int BAM_CIGAR_MATCH = 0; -const int BAM_CIGAR_INS = 1; -const int BAM_CIGAR_DEL = 2; -const int BAM_CIGAR_REFSKIP = 3; -const int BAM_CIGAR_SOFTCLIP = 4; -const int BAM_CIGAR_HARDCLIP = 5; -const int BAM_CIGAR_PAD = 6; -const int BAM_CIGAR_SEQMATCH = 7; -const int BAM_CIGAR_MISMATCH = 8; +const uint8_t BAM_CIGAR_MATCH = 0; +const uint8_t BAM_CIGAR_INS = 1; +const uint8_t BAM_CIGAR_DEL = 2; +const uint8_t BAM_CIGAR_REFSKIP = 3; +const uint8_t BAM_CIGAR_SOFTCLIP = 4; +const uint8_t BAM_CIGAR_HARDCLIP = 5; +const uint8_t BAM_CIGAR_PAD = 6; +const uint8_t BAM_CIGAR_SEQMATCH = 7; +const uint8_t BAM_CIGAR_MISMATCH = 8; const char BAM_CIGAR_MATCH_CHAR = 'M'; const char BAM_CIGAR_INS_CHAR = 'I'; @@ -66,8 +66,8 @@ const char BAM_CIGAR_PAD_CHAR = 'P'; const char BAM_CIGAR_SEQMATCH_CHAR = '='; const char BAM_CIGAR_MISMATCH_CHAR = 'X'; -const int BAM_CIGAR_SHIFT = 4; -const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); +const int BAM_CIGAR_SHIFT = 4; +const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); // BAM tag types const char BAM_TAG_TYPE_ASCII = 'A'; @@ -82,46 +82,66 @@ const char BAM_TAG_TYPE_STRING = 'Z'; const char BAM_TAG_TYPE_HEX = 'H'; const char BAM_TAG_TYPE_ARRAY = 'B'; -const size_t BAM_TAG_TAGSIZE = 2; -const size_t BAM_TAG_TYPESIZE = 1; -const int BAM_TAG_ARRAYBASE_SIZE = 8; +const uint8_t BAM_TAG_TAGSIZE = 2; +const uint8_t BAM_TAG_TYPESIZE = 1; +const uint8_t BAM_TAG_ARRAYBASE_SIZE = 8; // DNA bases const char* const BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; -const unsigned char BAM_BASECODE_EQUAL = 0; -const unsigned char BAM_BASECODE_A = 1; -const unsigned char BAM_BASECODE_C = 2; -const unsigned char BAM_BASECODE_G = 4; -const unsigned char BAM_BASECODE_T = 8; -const unsigned char BAM_BASECODE_N = 15; +const uint8_t BAM_BASECODE_EQUAL = 0; +const uint8_t BAM_BASECODE_A = 1; +const uint8_t BAM_BASECODE_C = 2; +const uint8_t BAM_BASECODE_M = 3; +const uint8_t BAM_BASECODE_G = 4; +const uint8_t BAM_BASECODE_R = 5; +const uint8_t BAM_BASECODE_S = 6; +const uint8_t BAM_BASECODE_V = 7; +const uint8_t BAM_BASECODE_T = 8; +const uint8_t BAM_BASECODE_W = 9; +const uint8_t BAM_BASECODE_Y = 10; +const uint8_t BAM_BASECODE_H = 11; +const uint8_t BAM_BASECODE_K = 12; +const uint8_t BAM_BASECODE_D = 13; +const uint8_t BAM_BASECODE_B = 14; +const uint8_t BAM_BASECODE_N = 15; -const char BAM_DNA_EQUAL = '='; -const char BAM_DNA_A = 'A'; -const char BAM_DNA_C = 'C'; -const char BAM_DNA_G = 'G'; -const char BAM_DNA_T = 'T'; -const char BAM_DNA_N = 'N'; -const char BAM_DNA_DEL = '-'; -const char BAM_DNA_PAD = '*'; +const char BAM_DNA_EQUAL = '='; +const char BAM_DNA_A = 'A'; +const char BAM_DNA_C = 'C'; +const char BAM_DNA_M = 'M'; +const char BAM_DNA_G = 'G'; +const char BAM_DNA_R = 'R'; +const char BAM_DNA_S = 'S'; +const char BAM_DNA_V = 'V'; +const char BAM_DNA_T = 'T'; +const char BAM_DNA_W = 'W'; +const char BAM_DNA_Y = 'Y'; +const char BAM_DNA_H = 'H'; +const char BAM_DNA_K = 'K'; +const char BAM_DNA_D = 'D'; +const char BAM_DNA_B = 'B'; +const char BAM_DNA_N = 'N'; +const char BAM_DNA_DEL = '-'; +const char BAM_DNA_PAD = '*'; // zlib constants -const int GZIP_ID1 = 31; -const int GZIP_ID2 = 139; -const int CM_DEFLATE = 8; -const int FLG_FEXTRA = 4; -const int OS_UNKNOWN = 255; -const int BGZF_XLEN = 6; -const int BGZF_ID1 = 66; -const int BGZF_ID2 = 67; -const int BGZF_LEN = 2; -const int GZIP_WINDOW_BITS = -15; -const int Z_DEFAULT_MEM_LEVEL = 8; +const char GZIP_ID1 = 31; +const char GZIP_ID2 = 139; +const char CM_DEFLATE = 8; +const char FLG_FEXTRA = 4; +const char OS_UNKNOWN = 255; +const char BGZF_XLEN = 6; +const char BGZF_ID1 = 66; +const char BGZF_ID2 = 67; +const char BGZF_LEN = 2; +const int8_t GZIP_WINDOW_BITS = -15; +const int8_t Z_DEFAULT_MEM_LEVEL = 8; // BZGF constants -const int BGZF_BLOCK_HEADER_LENGTH = 18; -const int BGZF_BLOCK_FOOTER_LENGTH = 8; -const int BGZF_MAX_BLOCK_SIZE = 65536; -const int BGZF_DEFAULT_BLOCK_SIZE = 65536; +const uint8_t BGZF_BLOCK_HEADER_LENGTH = 18; +const uint8_t BGZF_BLOCK_FOOTER_LENGTH = 8; +const uint32_t BGZF_MAX_BLOCK_SIZE = 65536; +const uint32_t BGZF_DEFAULT_BLOCK_SIZE = 65536; } // namespace Constants diff --git a/src/api/BamIndex.h b/src/api/BamIndex.h index a52922e..b202e92 100644 --- a/src/api/BamIndex.h +++ b/src/api/BamIndex.h @@ -2,7 +2,7 @@ // BamIndex.h (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 5 April 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides basic BAM index interface // *************************************************************************** @@ -56,22 +56,36 @@ class API_EXPORT BamIndex { // index interface public: // builds index from associated BAM file & writes out to index file - virtual bool Create(void) =0; // creates index file from BAM file + virtual bool Create(void) =0; + + // returns a human-readable description of the last error encountered + std::string GetErrorString(void) { return m_errorString; } + // returns whether reference has alignments or no virtual bool HasAlignments(const int& referenceID) const =0; + // attempts to use index data to jump to @region, returns success/fail // a "successful" jump indicates no error, but not whether this region has data // * thus, the method sets a flag to indicate whether there are alignments // available after the jump position virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) =0; + // loads existing data from file into memory virtual bool Load(const std::string& filename) =0; + // change the index caching behavior virtual void SetCacheMode(const BamIndex::IndexCacheMode& mode) =0; + // internal methods + protected: + void SetErrorString(const std::string& where, const std::string& what) { + m_errorString = where + ": " + what; + } + // data members protected: - Internal::BamReaderPrivate* m_reader; // copy, not ownedprivate: + Internal::BamReaderPrivate* m_reader; // copy, not owned + std::string m_errorString; }; } // namespace BamTools diff --git a/src/api/BamMultiReader.cpp b/src/api/BamMultiReader.cpp index 0079871..562b899 100644 --- a/src/api/BamMultiReader.cpp +++ b/src/api/BamMultiReader.cpp @@ -2,7 +2,7 @@ // BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 1 October 2011 (DB) +// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Convenience class for reading multiple BAM files. // @@ -46,8 +46,8 @@ BamMultiReader::~BamMultiReader(void) { \sa CloseFile(), IsOpen(), Open(), BamReader::Close() */ -void BamMultiReader::Close(void) { - d->Close(); +bool BamMultiReader::Close(void) { + return d->Close(); } /*! \fn void BamMultiReader::CloseFile(const std::string& filename) @@ -57,8 +57,8 @@ void BamMultiReader::Close(void) { \sa Close(), IsOpen(), Open(), BamReader::Close() */ -void BamMultiReader::CloseFile(const std::string& filename) { - d->CloseFile(filename); +bool BamMultiReader::CloseFile(const std::string& filename) { + return d->CloseFile(filename); } /*! \fn bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type) @@ -86,6 +86,11 @@ const std::vector BamMultiReader::Filenames(void) const { return d->Filenames(); } +// returns a description of the last error that occurred +std::string BamMultiReader::GetErrorString(void) const { + return d->GetErrorString(); +} + /*! \fn SamHeader BamMultiReader::GetHeader(void) const \brief Returns unified SAM-format header for all files diff --git a/src/api/BamMultiReader.h b/src/api/BamMultiReader.h index 959048c..1b1c270 100644 --- a/src/api/BamMultiReader.h +++ b/src/api/BamMultiReader.h @@ -2,7 +2,7 @@ // BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 1 October 2011 (DB) +// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Convenience class for reading multiple BAM files. // *************************************************************************** @@ -38,9 +38,9 @@ class API_EXPORT BamMultiReader { // ---------------------- // closes all open BAM files - void Close(void); + bool Close(void); // close only the requested BAM file - void CloseFile(const std::string& filename); + bool CloseFile(const std::string& filename); // returns list of filenames for all open BAM files const std::vector Filenames(void) const; // returns true if multireader has any open BAM files @@ -100,6 +100,13 @@ class API_EXPORT BamMultiReader { // changes the caching behavior of the index data void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); + // ---------------------- + // error handling + // ---------------------- + + // returns a human-readable description of the last error that occurred + std::string GetErrorString(void) const; + // private implementation private: Internal::BamMultiReaderPrivate* d; diff --git a/src/api/BamReader.cpp b/src/api/BamReader.cpp index 4b25b15..19dd135 100644 --- a/src/api/BamReader.cpp +++ b/src/api/BamReader.cpp @@ -2,7 +2,7 @@ // BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 4 March 2011 (DB) +// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Provides read access to BAM files. // *************************************************************************** @@ -38,15 +38,16 @@ BamReader::~BamReader(void) { d = 0; } -/*! \fn void BamReader::Close(void) +/*! \fn bool BamReader::Close(void) \brief Closes the current BAM file. Also clears out all header and reference data. + \return \c true if file closed OK \sa IsOpen(), Open() */ -void BamReader::Close(void) { - d->Close(); +bool BamReader::Close(void) { + return d->Close(); } /*! \fn bool BamReader::CreateIndex(const BamIndex::IndexType& type) @@ -60,6 +61,10 @@ bool BamReader::CreateIndex(const BamIndex::IndexType& type) { return d->CreateIndex(type); } +string BamReader::GetErrorString(void) const { + return d->GetErrorString(); +} + /*! \fn const std::string BamReader::GetFilename(void) const \brief Returns name of current BAM file. diff --git a/src/api/BamReader.h b/src/api/BamReader.h index 836fbf5..ce04af6 100644 --- a/src/api/BamReader.h +++ b/src/api/BamReader.h @@ -2,7 +2,7 @@ // BamReader.h (c) 2009 Derek Barnett, Michael Str�mberg // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 4 March 2011 (DB) +// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Provides read access to BAM files. // *************************************************************************** @@ -37,7 +37,7 @@ class API_EXPORT BamReader { // ---------------------- // closes the current BAM file - void Close(void); + bool Close(void); // returns filename of current BAM file const std::string GetFilename(void) const; // returns true if a BAM file is open for reading @@ -102,6 +102,13 @@ class API_EXPORT BamReader { // changes the caching behavior of the index data void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); + // ---------------------- + // error handling + // ---------------------- + + // returns a human-readable description of the last error that occurred + std::string GetErrorString(void) const; + // deprecated methods public: // returns true if index data is available diff --git a/src/api/BamWriter.cpp b/src/api/BamWriter.cpp index 8bf6f18..3912245 100644 --- a/src/api/BamWriter.cpp +++ b/src/api/BamWriter.cpp @@ -2,7 +2,7 @@ // BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 4 March 2011 (DB) +// Last modified: 4 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** @@ -13,8 +13,6 @@ #include using namespace BamTools; using namespace BamTools::Internal; - -#include using namespace std; /*! \class BamTools::BamWriter @@ -57,6 +55,11 @@ void BamWriter::Close(void) { d->Close(); } +// returns a human-readable description of the last error that occurred +std::string BamWriter::GetErrorString(void) const { + return d->GetErrorString(); +} + /*! \fn bool BamWriter::IsOpen(void) const \brief Returns \c true if BAM file is open for writing. \sa Open() @@ -115,11 +118,11 @@ bool BamWriter::Open(const std::string& filename, \param alignment BamAlignment record to save \sa BamReader::GetNextAlignment(), BamReader::GetNextAlignmentCore() */ -void BamWriter::SaveAlignment(const BamAlignment& alignment) { - d->SaveAlignment(alignment); +bool BamWriter::SaveAlignment(const BamAlignment& alignment) { + return d->SaveAlignment(alignment); } -/*! \fn void BamWriter::SetCompressionMode(const CompressionMode& compressionMode) +/*! \fn void BamWriter::SetCompressionMode(const BamWriter::CompressionMode& compressionMode) \brief Sets the output compression mode. Default mode is BamWriter::Compressed. @@ -137,6 +140,6 @@ void BamWriter::SaveAlignment(const BamAlignment& alignment) { \param compressionMode desired output compression behavior \sa IsOpen(), Open() */ -void BamWriter::SetCompressionMode(const CompressionMode& compressionMode) { +void BamWriter::SetCompressionMode(const BamWriter::CompressionMode& compressionMode) { d->SetWriteCompressed( compressionMode == BamWriter::Compressed ); } diff --git a/src/api/BamWriter.h b/src/api/BamWriter.h index 56ac301..5e8d21f 100644 --- a/src/api/BamWriter.h +++ b/src/api/BamWriter.h @@ -2,7 +2,7 @@ // BamWriter.h (c) 2009 Michael Str�mberg, Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 4 March 2011 (DB) +// Last modified: 5 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** @@ -25,9 +25,11 @@ namespace Internal { class API_EXPORT BamWriter { - public: enum CompressionMode { Compressed = 0 - , Uncompressed - }; + // enums + public: + enum CompressionMode { Compressed = 0 + , Uncompressed + }; // ctor & dtor public: @@ -38,6 +40,8 @@ class API_EXPORT BamWriter { public: // closes the current BAM file void Close(void); + // returns a human-readable description of the last error that occurred + std::string GetErrorString(void) const; // returns true if BAM file is open for writing bool IsOpen(void) const; // opens a BAM file for writing @@ -49,9 +53,9 @@ class API_EXPORT BamWriter { const SamHeader& samHeader, const RefVector& referenceSequences); // saves the alignment to the alignment archive - void SaveAlignment(const BamAlignment& alignment); + bool SaveAlignment(const BamAlignment& alignment); // sets the output compression mode - void SetCompressionMode(const CompressionMode& compressionMode); + void SetCompressionMode(const BamWriter::CompressionMode& compressionMode); // private implementation private: diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index 7306f56..62d2e7f 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -25,6 +25,7 @@ set( BamToolsAPISources SamReadGroupDictionary.cpp SamSequence.cpp SamSequenceDictionary.cpp + internal/BamException_p.cpp internal/BamHeader_p.cpp internal/BamIndexFactory_p.cpp internal/BamMultiReader_p.cpp diff --git a/src/api/SamHeader.cpp b/src/api/SamHeader.cpp index 77e1c6d..363fb5c 100644 --- a/src/api/SamHeader.cpp +++ b/src/api/SamHeader.cpp @@ -2,13 +2,14 @@ // SamHeader.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 19 April 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides direct read/write access to the SAM header data fields. // *************************************************************************** #include #include +#include #include #include #include @@ -63,8 +64,7 @@ SamHeader::SamHeader(const std::string& headerText) , SortOrder(Constants::SAM_HD_SORTORDER_UNKNOWN) , GroupOrder("") { - SamFormatParser parser(*this); - parser.Parse(headerText); + SetHeaderText(headerText); } /*! \fn SamHeader::SamHeader(const SamHeader& other) @@ -88,6 +88,8 @@ SamHeader::~SamHeader(void) { } \brief Clears all header contents. */ void SamHeader::Clear(void) { + + // clear SAM header components Version.clear(); SortOrder.clear(); GroupOrder.clear(); @@ -95,6 +97,23 @@ void SamHeader::Clear(void) { ReadGroups.Clear(); Programs.Clear(); Comments.clear(); + + // clear error string + m_errorString.clear(); +} + +/*! \fn std::string SamHeader::GetErrorString(void) const + \brief Returns human-readable description of last error encountered +*/ +std::string SamHeader::GetErrorString(void) const { + return m_errorString; +} + +/*! \fn bool SamHeader::HasError(void) const + \brief Returns \c true if header encountered an error +*/ +bool SamHeader::HasError(void) const { + return (!m_errorString.empty()); } /*! \fn bool SamHeader::HasVersion(void) const @@ -149,12 +168,32 @@ bool SamHeader::HasComments(void) const { /*! \fn bool SamHeader::IsValid(bool verbose = false) const \brief Checks header contents for required data and proper formatting. \param verbose If set to true, validation errors & warnings will be printed to stderr. - Otherwise, output is suppressed and only validation check occurs. + Otherwise, messages are available through SamHeader::GetErrorString(). \return \c true if SAM header is well-formed */ bool SamHeader::IsValid(bool verbose) const { + SamHeaderValidator validator(*this); - return validator.Validate(verbose); + + // if SAM header is valid, return success + if ( validator.Validate() ) + return true; + + // otherwiser + else { + + // print messages to stderr + if ( verbose ) + validator.PrintMessages(std::cerr); + + // or catch in local error string + else { + stringstream errorStream(""); + validator.PrintMessages(errorStream); + m_errorString = errorStream.str(); + } + return false; + } } /*! \fn void SamHeader::SetHeaderText(const std::string& headerText) @@ -166,9 +205,18 @@ void SamHeader::SetHeaderText(const std::string& headerText) { // clear prior data Clear(); - // parse header text into data - SamFormatParser parser(*this); - parser.Parse(headerText); + try { + SamFormatParser parser(*this); + parser.Parse(headerText); + } catch ( BamException& e ) { + + // clear anything parsed so far + // no telling what's valid and what's partially parsed + Clear(); + + // set error string + m_errorString = e.what(); + } } /*! \fn std::string SamHeader::ToString(void) const diff --git a/src/api/SamHeader.h b/src/api/SamHeader.h index 3b67621..707edbe 100644 --- a/src/api/SamHeader.h +++ b/src/api/SamHeader.h @@ -2,7 +2,7 @@ // SamHeader.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 18 April 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides direct read/write access to the SAM header data fields. // *************************************************************************** @@ -28,27 +28,29 @@ struct API_EXPORT SamHeader { // query/modify entire SamHeader void Clear(void); // clears all header contents + std::string GetErrorString(void) const; + bool HasError(void) const; bool IsValid(bool verbose = false) const; // returns true if SAM header is well-formed void SetHeaderText(const std::string& headerText); // replaces data fields with contents of SAM-formatted text std::string ToString(void) const; // returns the printable, SAM-formatted header text // convenience query methods - bool HasVersion(void) const; // returns true if header contains format version entry - bool HasSortOrder(void) const; // returns true if header contains sort order entry - bool HasGroupOrder(void) const; // returns true if header contains group order entry - bool HasSequences(void) const; // returns true if header contains any sequence entries - bool HasReadGroups(void) const; // returns true if header contains any read group entries - bool HasPrograms(void) const; // returns true if header contains any program record entries - bool HasComments(void) const; // returns true if header contains comments + bool HasVersion(void) const; // returns true if header contains format version entry + bool HasSortOrder(void) const; // returns true if header contains sort order entry + bool HasGroupOrder(void) const; // returns true if header contains group order entry + bool HasSequences(void) const; // returns true if header contains any sequence entries + bool HasReadGroups(void) const; // returns true if header contains any read group entries + bool HasPrograms(void) const; // returns true if header contains any program record entries + bool HasComments(void) const; // returns true if header contains comments // -------------- // data members // -------------- // header metadata (@HD line) - std::string Version; // VN: *Required for valid SAM header, if @HD record is present* - std::string SortOrder; // SO: - std::string GroupOrder; // GO: + std::string Version; // VN: *Required, if @HD record is present* + std::string SortOrder; // SO: + std::string GroupOrder; // GO: // header sequences (@SQ entries) SamSequenceDictionary Sequences; @@ -61,6 +63,9 @@ struct API_EXPORT SamHeader { // header comments (@CO entries) std::vector Comments; + + private: + mutable std::string m_errorString; }; } // namespace BamTools diff --git a/src/api/internal/BamException_p.cpp b/src/api/internal/BamException_p.cpp new file mode 100644 index 0000000..38241d8 --- /dev/null +++ b/src/api/internal/BamException_p.cpp @@ -0,0 +1,15 @@ +// *************************************************************************** +// BamException_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 5 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides a basic exception class for BamTools internals +// *************************************************************************** + +#include +using namespace BamTools; +using namespace BamTools::Internal; +using namespace std; + +const string BamException::SEPARATOR = ": "; diff --git a/src/api/internal/BamException_p.h b/src/api/internal/BamException_p.h new file mode 100644 index 0000000..5199737 --- /dev/null +++ b/src/api/internal/BamException_p.h @@ -0,0 +1,51 @@ +// *************************************************************************** +// BamException_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 6 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides a basic exception class for BamTools internals +// *************************************************************************** + +#ifndef BAMEXCEPTION_P_H +#define BAMEXCEPTION_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include +#include + +namespace BamTools { +namespace Internal { + +class BamException : public std::exception { + + public: + inline BamException(const std::string& where, const std::string& message) + : std::exception() + , m_errorString(where + SEPARATOR + message) + { } + + inline ~BamException(void) throw() { } + + inline const char* what(void) const throw() { + return m_errorString.c_str(); + } + + private: + std::string m_errorString; + static const std::string SEPARATOR; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMEXCEPTION_P_H diff --git a/src/api/internal/BamHeader_p.cpp b/src/api/internal/BamHeader_p.cpp index 48b8c09..6ff2f4b 100644 --- a/src/api/internal/BamHeader_p.cpp +++ b/src/api/internal/BamHeader_p.cpp @@ -2,24 +2,37 @@ // BamHeader_p.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for handling BAM headers. // *************************************************************************** #include #include +#include #include #include using namespace BamTools; using namespace BamTools::Internal; -#include #include #include -#include using namespace std; +// ------------------------ +// static utility methods +// ------------------------ + +static inline +bool isValidMagicNumber(const char* buffer) { + return ( strncmp(buffer, Constants::BAM_HEADER_MAGIC, + Constants::BAM_HEADER_MAGIC_LENGTH) == 0 ); +} + +// -------------------------- +// BamHeader implementation +// -------------------------- + // ctor BamHeader::BamHeader(void) { } @@ -27,23 +40,17 @@ BamHeader::BamHeader(void) { } BamHeader::~BamHeader(void) { } // reads magic number from BGZF stream, returns true if valid -bool BamHeader::CheckMagicNumber(BgzfStream* stream) { +void BamHeader::CheckMagicNumber(BgzfStream* stream) { // try to read magic number char buffer[Constants::BAM_HEADER_MAGIC_LENGTH]; - if ( stream->Read(buffer, Constants::BAM_HEADER_MAGIC_LENGTH) != (int)Constants::BAM_HEADER_MAGIC_LENGTH ) { - fprintf(stderr, "BamHeader ERROR: could not read magic number\n"); - return false; - } + const size_t numBytesRead = stream->Read(buffer, Constants::BAM_HEADER_MAGIC_LENGTH); + if ( numBytesRead != (int)Constants::BAM_HEADER_MAGIC_LENGTH ) + throw BamException("BamHeader::CheckMagicNumber", "could not read magic number"); // validate magic number - if ( strncmp(buffer, Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH) != 0 ) { - fprintf(stderr, "BamHeader ERROR: invalid magic number\n"); - return false; - } - - // all checks out - return true; + if ( !isValidMagicNumber(buffer) ) + throw BamException("BamHeader::CheckMagicNumber", "invalid magic number"); } // clear SamHeader data @@ -57,68 +64,49 @@ bool BamHeader::IsValid(void) const { } // load BAM header ('magic number' and SAM header text) from BGZF stream -// returns true if all OK -bool BamHeader::Load(BgzfStream* stream) { +void BamHeader::Load(BgzfStream* stream) { - // cannot load if invalid stream - if ( stream == 0 ) - return false; + // read & check magic number + CheckMagicNumber(stream); - // cannot load if magic number is invalid - if ( !CheckMagicNumber(stream) ) - return false; - - // cannot load header if cannot read header length + // read header (length, then actual text) uint32_t length(0); - if ( !ReadHeaderLength(stream, length) ) - return false; - - // cannot load header if cannot read header text - if ( !ReadHeaderText(stream, length) ) - return false; - - // otherwise, everything OK - return true; + ReadHeaderLength(stream, length); + ReadHeaderText(stream, length); } // reads SAM header text length from BGZF stream, stores it in @length -// returns read success/fail status -bool BamHeader::ReadHeaderLength(BgzfStream* stream, uint32_t& length) { +void BamHeader::ReadHeaderLength(BgzfStream* stream, uint32_t& length) { - // attempt to read BAM header text length + // read BAM header text length char buffer[sizeof(uint32_t)]; - if ( stream->Read(buffer, sizeof(uint32_t)) != sizeof(uint32_t) ) { - fprintf(stderr, "BamHeader ERROR: could not read header length\n"); - return false; - } + const size_t numBytesRead = stream->Read(buffer, sizeof(uint32_t)); + if ( numBytesRead != sizeof(uint32_t) ) + throw BamException("BamHeader::ReadHeaderLength", "could not read header length"); - // convert char buffer to length, return success + // convert char buffer to length length = BamTools::UnpackUnsignedInt(buffer); if ( BamTools::SystemIsBigEndian() ) BamTools::SwapEndian_32(length); - return true; } // reads SAM header text from BGZF stream, stores in SamHeader object -// returns read success/fail status -bool BamHeader::ReadHeaderText(BgzfStream* stream, const uint32_t& length) { +void BamHeader::ReadHeaderText(BgzfStream* stream, const uint32_t& length) { - // set up destination buffer + // read header text char* headerText = (char*)calloc(length + 1, 1); + const size_t bytesRead = stream->Read(headerText, length); - // attempt to read header text - const unsigned bytesRead = stream->Read(headerText, length); - const bool readOk = ( bytesRead == length ); - if ( readOk ) - m_header.SetHeaderText( (string)((const char*)headerText) ); - else - fprintf(stderr, "BamHeader ERROR: could not read header text\n"); + // if error reading, clean up buffer & throw + if ( bytesRead != length ) { + free(headerText); + throw BamException("BamHeader::ReadHeaderText", "could not read header text"); + } - // clean up calloc-ed temp variable (on success or fail) + // otherwise, text was read OK + // store & cleanup + m_header.SetHeaderText( (string)((const char*)headerText) ); free(headerText); - - // return read success - return readOk; } // returns *copy* of SamHeader data object diff --git a/src/api/internal/BamHeader_p.h b/src/api/internal/BamHeader_p.h index 3738c60..42af68c 100644 --- a/src/api/internal/BamHeader_p.h +++ b/src/api/internal/BamHeader_p.h @@ -2,7 +2,7 @@ // BamHeader_p.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 26 January 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for handling BAM headers. // *************************************************************************** @@ -43,7 +43,7 @@ class BamHeader { bool IsValid(void) const; // load BAM header ('magic number' and SAM header text) from BGZF stream // returns true if all OK - bool Load(BgzfStream* stream); + void Load(BgzfStream* stream); // returns (editable) copy of SamHeader data object SamHeader ToSamHeader(void) const; // returns SAM-formatted string of header data @@ -51,14 +51,12 @@ class BamHeader { // internal methods private: - // reads magic number from BGZF stream, returns true if valid - bool CheckMagicNumber(BgzfStream* stream); + // reads magic number from BGZF stream + void CheckMagicNumber(BgzfStream* stream); // reads SAM header length from BGZF stream, stores it in @length - // returns read success/fail status - bool ReadHeaderLength(BgzfStream* stream, uint32_t& length); + void ReadHeaderLength(BgzfStream* stream, uint32_t& length); // reads SAM header text from BGZF stream, stores in SamHeader object - // returns read success/fail status - bool ReadHeaderText(BgzfStream* stream, const uint32_t& length); + void ReadHeaderText(BgzfStream* stream, const uint32_t& length); // data members private: diff --git a/src/api/internal/BamIndexFactory_p.cpp b/src/api/internal/BamIndexFactory_p.cpp index c91f433..4e9d1f2 100644 --- a/src/api/internal/BamIndexFactory_p.cpp +++ b/src/api/internal/BamIndexFactory_p.cpp @@ -2,7 +2,7 @@ // BamIndexFactory_p.cpp (c) 2011 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 5 April 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides interface for generating BamIndex implementations // *************************************************************************** @@ -13,8 +13,6 @@ #include using namespace BamTools; using namespace BamTools::Internal; - -#include using namespace std; // generates index filename from BAM filename (depending on requested type) @@ -26,7 +24,6 @@ const string BamIndexFactory::CreateIndexFilename(const string& bamFilename, case ( BamIndex::STANDARD ) : return ( bamFilename + BamStandardIndex::Extension() ); case ( BamIndex::BAMTOOLS ) : return ( bamFilename + BamToolsIndex::Extension() ); default : - cerr << "BamIndexFactory ERROR: unknown index type" << type << endl; return string(); } } @@ -59,7 +56,6 @@ BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type, case ( BamIndex::STANDARD ) : return new BamStandardIndex(reader); case ( BamIndex::BAMTOOLS ) : return new BamToolsIndex(reader); default : - cerr << "BamIndexFactory ERROR: unknown index type " << type << endl; return 0; } } @@ -72,7 +68,7 @@ const string BamIndexFactory::FileExtension(const string& filename) { return string(); // look for last dot in filename - size_t lastDotPosition = filename.find_last_of('.'); + const size_t lastDotPosition = filename.find_last_of('.'); // if none found, return empty string if ( lastDotPosition == string::npos ) @@ -88,6 +84,10 @@ const string BamIndexFactory::FileExtension(const string& filename) { const string BamIndexFactory::FindIndexFilename(const string& bamFilename, const BamIndex::IndexType& preferredType) { + // skip if BAM filename provided is empty + if ( bamFilename.empty() ) + return string(); + // try to find index of preferred type first // return index filename if found string indexFilename = CreateIndexFilename(bamFilename, preferredType); diff --git a/src/api/internal/BamMultiMerger_p.h b/src/api/internal/BamMultiMerger_p.h index 9248df0..2b7b110 100644 --- a/src/api/internal/BamMultiMerger_p.h +++ b/src/api/internal/BamMultiMerger_p.h @@ -121,6 +121,10 @@ class MultiMerger : public IMultiMerger { template inline void MultiMerger::Add(MergeItem item) { + + // N.B. - any future custom Compare types must define this method + // see algorithms/Sort.h + if ( CompareType::UsesCharData() ) item.Alignment->BuildCharData(); m_data.insert(item); diff --git a/src/api/internal/BamMultiReader_p.cpp b/src/api/internal/BamMultiReader_p.cpp index 292c287..f92258d 100644 --- a/src/api/internal/BamMultiReader_p.cpp +++ b/src/api/internal/BamMultiReader_p.cpp @@ -2,7 +2,7 @@ // BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 3 October 2011 (DB) +// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // ************************************************************************* @@ -29,24 +29,45 @@ BamMultiReaderPrivate::BamMultiReaderPrivate(void) // dtor BamMultiReaderPrivate::~BamMultiReaderPrivate(void) { - - // close all open BAM readers (& clean up cache) Close(); } // close all BAM files -void BamMultiReaderPrivate::Close(void) { - CloseFiles( Filenames() ); +bool BamMultiReaderPrivate::Close(void) { + + m_errorString.clear(); + + if ( CloseFiles(Filenames()) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("error encountered while closing all files: \n\t") + currentError; + SetErrorString("BamMultiReader::Close", message); + return false; + } } // close requested BAM file -void BamMultiReaderPrivate::CloseFile(const string& filename) { +bool BamMultiReaderPrivate::CloseFile(const string& filename) { + + m_errorString.clear(); + vector filenames(1, filename); - CloseFiles(filenames); + if ( CloseFiles(filenames) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("error while closing file: ") + filename + "\n" + currentError; + SetErrorString("BamMultiReader::CloseFile", message); + return false; + } } // close requested BAM files -void BamMultiReaderPrivate::CloseFiles(const vector& filenames) { +bool BamMultiReaderPrivate::CloseFiles(const vector& filenames) { + + bool errorsEncountered = false; + m_errorString.clear(); // iterate over filenames vector::const_iterator filesIter = filenames.begin(); @@ -70,7 +91,12 @@ void BamMultiReaderPrivate::CloseFiles(const vector& filenames) { m_alignmentCache->Remove(reader); // clean up reader & its alignment - reader->Close(); + if ( !reader->Close() ) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } delete reader; reader = 0; @@ -95,12 +121,16 @@ void BamMultiReaderPrivate::CloseFiles(const vector& filenames) { delete m_alignmentCache; m_alignmentCache = 0; } + + // return whether all readers closed OK + return !errorsEncountered; } // creates index files for BAM files that don't have them bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) { - bool result = true; + bool errorsEncountered = false; + m_errorString.clear(); // iterate over readers vector::iterator itemIter = m_readers.begin(); @@ -111,11 +141,24 @@ bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) { if ( reader == 0 ) continue; // if reader doesn't have an index, create one - if ( !reader->HasIndex() ) - result &= reader->CreateIndex(type); + if ( !reader->HasIndex() ) { + if ( !reader->CreateIndex(type) ) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } } - return result; + // check for errors encountered before returning success/fail + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("error while creating index files: ") + "\n" + currentError; + SetErrorString("BamMultiReader::CreateIndexes", message); + return false; + } else + return true; } IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) const { @@ -159,6 +202,10 @@ const vector BamMultiReaderPrivate::Filenames(void) const { return filenames; } +string BamMultiReaderPrivate::GetErrorString(void) const { + return m_errorString; +} + SamHeader BamMultiReaderPrivate::GetHeader(void) const { const string& text = GetHeaderText(); return SamHeader(text); @@ -225,48 +272,42 @@ bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) { int BamMultiReaderPrivate::GetReferenceCount(void) const { // handle empty multireader - if ( m_readers.empty() ) - return 0; + if ( m_readers.empty() ) return 0; // return reference count from first reader const MergeItem& item = m_readers.front(); const BamReader* reader = item.Reader; - if ( reader ) return reader->GetReferenceCount(); - - // invalid reader - return 0; + if ( reader == 0 ) return 0; + else + return reader->GetReferenceCount(); } // returns vector of reference objects const RefVector BamMultiReaderPrivate::GetReferenceData(void) const { // handle empty multireader - if ( m_readers.empty() ) - return RefVector(); + if ( m_readers.empty() ) return RefVector(); // return reference data from first BamReader const MergeItem& item = m_readers.front(); const BamReader* reader = item.Reader; - if ( reader ) return reader->GetReferenceData(); - - // invalid reader - return RefVector(); + if ( reader == 0 ) return RefVector(); + else + return reader->GetReferenceData(); } // returns refID from reference name int BamMultiReaderPrivate::GetReferenceID(const string& refName) const { // handle empty multireader - if ( m_readers.empty() ) - return -1; + if ( m_readers.empty() ) return -1; // return reference ID from first BamReader const MergeItem& item = m_readers.front(); const BamReader* reader = item.Reader; - if ( reader ) return reader->GetReferenceID(refName); - - // invalid reader - return -1; + if ( reader == 0 ) return -1; + else + return reader->GetReferenceID(refName); } // --------------------------------------------------------------------------------------- @@ -330,11 +371,8 @@ bool BamMultiReaderPrivate::Jump(int refID, int position) { BamReader* reader = item.Reader; if ( reader == 0 ) continue; - // attempt jump() on each - if ( !reader->Jump(refID, position) ) { - cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename() - << " to " << refID << ":" << position << endl; - } + // jump in each BamReader to position of interest + reader->Jump(refID, position); } // returns status of cache update @@ -344,7 +382,8 @@ bool BamMultiReaderPrivate::Jump(int refID, int position) { // locate (& load) index files for BAM readers that don't already have one loaded bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) { - bool result = true; + bool errorsEncountered = false; + m_errorString.clear(); // iterate over readers vector::iterator readerIter = m_readers.begin(); @@ -355,25 +394,41 @@ bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredTy if ( reader == 0 ) continue; // if reader has no index, try to locate one - if ( !reader->HasIndex() ) - result &= reader->LocateIndex(preferredType); + if ( !reader->HasIndex() ) { + if ( !reader->LocateIndex(preferredType) ) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } } - return result; + // check for errors encountered before returning success/fail + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("error while locating index files: ") + "\n" + currentError; + SetErrorString("BamMultiReader::LocatingIndexes", message); + return false; + } else + return true; } // opens BAM files bool BamMultiReaderPrivate::Open(const vector& filenames) { - bool openedOk = true; - - // put all current readers back at beginning - openedOk &= Rewind(); + m_errorString.clear(); // put all current readers back at beginning (refreshes alignment cache) - Rewind(); + if ( !Rewind() ) { + const string currentError = m_errorString; + const string message = string("unable to rewind existing readers: \n\t") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } // iterate over filenames + bool errorsEncountered = false; vector::const_iterator filenameIter = filenames.begin(); vector::const_iterator filenameEnd = filenames.end(); for ( ; filenameIter != filenameEnd; ++filenameIter ) { @@ -388,27 +443,48 @@ bool BamMultiReaderPrivate::Open(const vector& filenames) { if ( readerOpened ) m_readers.push_back( MergeItem(reader, new BamAlignment) ); - // otherwise clean up invalid reader - else delete reader; + // otherwise store error & clean up invalid reader + else { + m_errorString.append(1, '\t'); + m_errorString += string("unable to open file: ") + filename; + m_errorString.append(1, '\n'); + errorsEncountered = true; - // update method return status - openedOk &= readerOpened; + delete reader; + reader = 0; + } } - // if more than one reader open, check for consistency - if ( m_readers.size() > 1 ) - openedOk &= ValidateReaders(); + // check for errors while opening + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("unable to open all files: \t\n") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } - // update alignment cache - openedOk &= UpdateAlignmentCache(); + // check for BAM file consistency + if ( !ValidateReaders() ) { + const string currentError = m_errorString; + const string message = string("unable to open inconsistent files: \t\n") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } - // return success - return openedOk; + // update alignment cache + return UpdateAlignmentCache(); } bool BamMultiReaderPrivate::OpenFile(const std::string& filename) { vector filenames(1, filename); - return Open(filenames); + if ( Open(filenames) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("could not open file: ") + filename + "\n\t" + currentError; + SetErrorString("BamMultiReader::OpenFile", message); + return false; + } } bool BamMultiReaderPrivate::OpenIndexes(const vector& indexFilenames) { @@ -418,11 +494,14 @@ bool BamMultiReaderPrivate::OpenIndexes(const vector& indexFilenames) { // first reader without an index? // make sure same number of index filenames as readers - if ( m_readers.size() != indexFilenames.size() ) + if ( m_readers.size() != indexFilenames.size() ) { + const string message("size of index file list does not match current BAM file count"); + SetErrorString("BamMultiReader::OpenIndexes", message); return false; + } - // init result flag - bool result = true; + bool errorsEncountered = false; + m_errorString.clear(); // iterate over BamReaders vector::const_iterator indexFilenameIter = indexFilenames.begin(); @@ -436,7 +515,12 @@ bool BamMultiReaderPrivate::OpenIndexes(const vector& indexFilenames) { // open index filename on reader if ( reader ) { const string& indexFilename = (*indexFilenameIter); - result &= reader->OpenIndex(indexFilename); + if ( !reader->OpenIndex(indexFilename) ) { + m_errorString.append(1, '\t'); + m_errorString += reader->GetErrorString(); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } } // increment filename iterator, skip if no more index files to open @@ -444,13 +528,16 @@ bool BamMultiReaderPrivate::OpenIndexes(const vector& indexFilenames) { break; } - // TODO: any validation needed here?? - // return success/fail - return result; + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("could not open all index files: \n\t") + currentError; + SetErrorString("BamMultiReader::OpenIndexes", message); + return false; + } else + return true; } - bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) { // skip if no alignments available @@ -475,8 +562,6 @@ bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool // load next alignment from reader & store in cache SaveNextAlignment(reader, alignment); - - // return success return true; } @@ -485,7 +570,9 @@ bool BamMultiReaderPrivate::Rewind(void) { // attempt to rewind files if ( !RewindReaders() ) { - cerr << "BamMultiReader ERROR: could not rewind file(s) successfully"; + const string currentError = m_errorString; + const string message = string("could not rewind readers: \n\t") + currentError; + SetErrorString("BamMultiReader::Rewind", message); return false; } @@ -496,7 +583,8 @@ bool BamMultiReaderPrivate::Rewind(void) { // returns BAM file pointers to beginning of alignment data bool BamMultiReaderPrivate::RewindReaders(void) { - bool result = true; + m_errorString.clear(); + bool errorsEncountered = false; // iterate over readers vector::iterator readerIter = m_readers.begin(); @@ -507,19 +595,32 @@ bool BamMultiReaderPrivate::RewindReaders(void) { if ( reader == 0 ) continue; // attempt rewind on BamReader - result &= reader->Rewind(); + if ( !reader->Rewind() ) { + m_errorString.append(1, '\t'); + m_errorString.append( reader->GetErrorString() ); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } } - return result; + return !errorsEncountered; } void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) { // if can read alignment from reader, store in cache - // N.B. - lazy building of alignment's char data, - // only populated on demand by sorting merger or client call to GetNextAlignment() + // + // N.B. - lazy building of alignment's char data - populated only: + // automatically by alignment cache to maintain its sorting OR + // on demand from client call to future call to GetNextAlignment() + if ( reader->GetNextAlignmentCore(*alignment) ) - m_alignmentCache->Add(MergeItem(reader, alignment)); + m_alignmentCache->Add( MergeItem(reader, alignment) ); +} + +void BamMultiReaderPrivate::SetErrorString(const string& where, const string& what) const { + static const string SEPARATOR = ": "; + m_errorString = where + SEPARATOR + what; } // sets the index caching mode on the readers @@ -553,12 +654,8 @@ bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) { BamReader* reader = item.Reader; if ( reader == 0 ) continue; - // attempt to set BamReader's region of interest - if ( !reader->SetRegion(region) ) { - cerr << "BamMultiReader WARNING: could not jump " << reader->GetFilename() << " to " - << region.LeftRefID << ":" << region.LeftPosition << ".." - << region.RightRefID << ":" << region.RightPosition << endl; - } + // set region of interest + reader->SetRegion(region); } // return status of cache update @@ -572,7 +669,7 @@ bool BamMultiReaderPrivate::UpdateAlignmentCache(void) { if ( m_alignmentCache == 0 ) { m_alignmentCache = CreateAlignmentCache(); if ( m_alignmentCache == 0 ) { - // set error string + SetErrorString("BamMultiReader::UpdateAlignmentCache", "unable to create new alignment cache"); return false; } } @@ -603,8 +700,10 @@ bool BamMultiReaderPrivate::UpdateAlignmentCache(void) { // the multireader is undefined, so we force program exit. bool BamMultiReaderPrivate::ValidateReaders(void) const { - // skip if no readers opened - if ( m_readers.empty() ) + m_errorString.clear(); + + // skip if 0 or 1 readers opened + if ( m_readers.empty() || (m_readers.size() == 1) ) return true; // retrieve first reader @@ -635,10 +734,10 @@ bool BamMultiReaderPrivate::ValidateReaders(void) const { // check compatible sort order if ( currentReaderSortOrder != firstReaderSortOrder ) { - // error string - cerr << "BamMultiReader ERROR: mismatched sort order in " << reader->GetFilename() - << ", expected " << firstReaderSortOrder - << ", but found " << currentReaderSortOrder << endl; + const string message = string("mismatched sort order in ") + reader->GetFilename() + + ", expected " + firstReaderSortOrder + + ", but found " + currentReaderSortOrder; + SetErrorString("BamMultiReader::ValidateReaders", message); return false; } @@ -656,9 +755,11 @@ bool BamMultiReaderPrivate::ValidateReaders(void) const { if ( (currentReaderRefCount != firstReaderRefCount) || (firstReaderRefSize != currentReaderRefSize) ) { - cerr << "BamMultiReader ERROR: mismatched number of references in " << reader->GetFilename() - << " expected " << firstReaderRefCount - << " reference sequences but only found " << currentReaderRefCount << endl; + stringstream s(""); + s << "mismatched reference count in " << reader->GetFilename() + << ", expected " << firstReaderRefCount + << ", but found " << currentReaderRefCount; + SetErrorString("BamMultiReader::ValidateReaders", s.str()); return false; } @@ -672,27 +773,30 @@ bool BamMultiReaderPrivate::ValidateReaders(void) const { if ( (firstRef.RefName != currentRef.RefName) || (firstRef.RefLength != currentRef.RefLength) ) { - cerr << "BamMultiReader ERROR: mismatched references found in " << reader->GetFilename() - << " expected: " << endl; + stringstream s(""); + s << "mismatched references found in" << reader->GetFilename() + << "expected: " << endl; // print first reader's reference data RefVector::const_iterator refIter = firstReaderRefData.begin(); RefVector::const_iterator refEnd = firstReaderRefData.end(); for ( ; refIter != refEnd; ++refIter ) { const RefData& entry = (*refIter); - cerr << entry.RefName << " " << entry.RefLength << endl; + stringstream s(""); + s << entry.RefName << " " << endl; } - cerr << "but found: " << endl; + s << "but found: " << endl; // print current reader's reference data refIter = currentReaderRefData.begin(); refEnd = currentReaderRefData.end(); for ( ; refIter != refEnd; ++refIter ) { const RefData& entry = (*refIter); - cerr << entry.RefName << " " << entry.RefLength << endl; + s << entry.RefName << " " << entry.RefLength << endl; } + SetErrorString("BamMultiReader::ValidateReaders", s.str()); return false; } diff --git a/src/api/internal/BamMultiReader_p.h b/src/api/internal/BamMultiReader_p.h index 4c4e5ea..bb94db6 100644 --- a/src/api/internal/BamMultiReader_p.h +++ b/src/api/internal/BamMultiReader_p.h @@ -2,7 +2,7 @@ // BamMultiReader_p.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 3 October 2011 (DB) +// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // ************************************************************************* @@ -44,9 +44,8 @@ class BamMultiReaderPrivate { public: // file operations - void Close(void); - void CloseFile(const std::string& filename); - void CloseFiles(const std::vector& filenames); + bool Close(void); + bool CloseFile(const std::string& filename); const std::vector Filenames(void) const; bool Jump(int refID, int position = 0); bool Open(const std::vector& filenames); @@ -73,13 +72,18 @@ class BamMultiReaderPrivate { bool OpenIndexes(const std::vector& indexFilenames); void SetIndexCacheMode(const BamIndex::IndexCacheMode mode); + // error handling + std::string GetErrorString(void) const; + // 'internal' methods public: + bool CloseFiles(const std::vector& filenames); IMultiMerger* CreateAlignmentCache(void) const; bool PopNextCachedAlignment(BamAlignment& al, const bool needCharData); bool RewindReaders(void); void SaveNextAlignment(BamReader* reader, BamAlignment* alignment); + void SetErrorString(const std::string& where, const std::string& what) const; // bool UpdateAlignmentCache(void); bool ValidateReaders(void) const; @@ -87,6 +91,7 @@ class BamMultiReaderPrivate { public: std::vector m_readers; IMultiMerger* m_alignmentCache; + mutable std::string m_errorString; }; } // namespace Internal diff --git a/src/api/internal/BamRandomAccessController_p.cpp b/src/api/internal/BamRandomAccessController_p.cpp index a599f7c..7f5c350 100644 --- a/src/api/internal/BamRandomAccessController_p.cpp +++ b/src/api/internal/BamRandomAccessController_p.cpp @@ -2,19 +2,21 @@ // BamRandomAccessController_p.cpp (c) 2011 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 5 April 2011(DB) +// Last modified: 6 October 2011(DB) // --------------------------------------------------------------------------- // Manages random access operations in a BAM file // ************************************************************************** #include +#include #include #include #include using namespace BamTools; using namespace BamTools::Internal; -#include +#include +#include using namespace std; BamRandomAccessController::BamRandomAccessController(void) @@ -133,8 +135,10 @@ void BamRandomAccessController::Close(void) { } void BamRandomAccessController::ClearIndex(void) { - delete m_index; - m_index = 0; + if ( m_index ) { + delete m_index; + m_index = 0; + } } void BamRandomAccessController::ClearRegion(void) { @@ -143,23 +147,30 @@ void BamRandomAccessController::ClearRegion(void) { } bool BamRandomAccessController::CreateIndex(BamReaderPrivate* reader, - const BamIndex::IndexType& type) { - + const BamIndex::IndexType& type) +{ // skip if reader is invalid - if ( reader == 0 ) + assert(reader); + if ( !reader->IsOpen() ) { + SetErrorString("BamRandomAccessController::CreateIndex", + "cannot create index for unopened reader"); return false; + } // create new index of requested type BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type, reader); if ( newIndex == 0 ) { - cerr << "BamRandomAccessController ERROR: could not create index of type " << type << endl; + stringstream s(""); + s << "could not create index of type: " << type; + SetErrorString("BamRandomAccessController::CreateIndex", s.str()); return false; } // attempt to build index from current BamReader file if ( !newIndex->Create() ) { - cerr << "BamRandomAccessController ERROR: could not create index for BAM file: " - << reader->Filename() << endl; + const string indexError = newIndex->GetErrorString(); + const string message = "could not create index: \n\t" + indexError; + SetErrorString("BamRandomAccessController::CreateIndex", message); return false; } @@ -171,6 +182,10 @@ bool BamRandomAccessController::CreateIndex(BamReaderPrivate* reader, return true; } +string BamRandomAccessController::GetErrorString(void) const { + return m_errorString; +} + bool BamRandomAccessController::HasIndex(void) const { return ( m_index != 0 ); } @@ -187,13 +202,13 @@ bool BamRandomAccessController::LocateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& preferredType) { // look up index filename, deferring to preferredType if possible + assert(reader); const string& indexFilename = BamIndexFactory::FindIndexFilename(reader->Filename(), preferredType); // if no index file found (of any type) if ( indexFilename.empty() ) { - cerr << "BamRandomAccessController WARNING: " - << "could not find index file for BAM: " - << reader->Filename() << endl; + const string message = string("could not find index file for:") + reader->Filename(); + SetErrorString("BamRandomAccessController::LocateIndex", message); return false; } @@ -206,7 +221,8 @@ bool BamRandomAccessController::OpenIndex(const string& indexFilename, BamReader // attempt create new index of type based on filename BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename, reader); if ( index == 0 ) { - cerr << "BamRandomAccessController ERROR: could not create index for file: " << indexFilename << endl; + const string message = string("could not open index file: ") + indexFilename; + SetErrorString("BamRandomAccessController::OpenIndex", message); return false; } @@ -215,7 +231,10 @@ bool BamRandomAccessController::OpenIndex(const string& indexFilename, BamReader // attempt to load data from index file if ( !index->Load(indexFilename) ) { - cerr << "BamRandomAccessController ERROR: could not load index data from file: " << indexFilename << endl; + const string indexError = index->GetErrorString(); + const string message = string("could not load index data from file: ") + indexFilename + + "\n\t" + indexError; + SetErrorString("BamRandomAccessController::OpenIndex", message); return false; } @@ -228,6 +247,10 @@ bool BamRandomAccessController::RegionHasAlignments(void) const { return m_hasAlignmentsInRegion; } +void BamRandomAccessController::SetErrorString(const string& where, const string& what) { + m_errorString = where + ": " + what; +} + void BamRandomAccessController::SetIndex(BamIndex* index) { if ( m_index ) ClearIndex(); @@ -240,16 +263,16 @@ void BamRandomAccessController::SetIndexCacheMode(const BamIndex::IndexCacheMode m_index->SetCacheMode(mode); } -bool BamRandomAccessController::SetRegion(BamReaderPrivate* reader, - const BamRegion& region, - const int& referenceCount) -{ +bool BamRandomAccessController::SetRegion(const BamRegion& region, const int& referenceCount) { + // store region m_region = region; // cannot jump when no index is available - if ( !HasIndex() ) + if ( !HasIndex() ) { + SetErrorString("BamRandomAccessController", "cannot jump if no index data available"); return false; + } // adjust region as necessary to reflect where data actually begins AdjustRegion(referenceCount); @@ -257,7 +280,7 @@ bool BamRandomAccessController::SetRegion(BamReaderPrivate* reader, // if no data present, return true // * Not an error, but future attempts to access alignments in this region will not return data // Returning true is useful in a BamMultiReader setting where some BAM files may - // lack alignments in regions where other BAMs do have data. + // lack alignments in regions where other files still have data available. if ( !m_hasAlignmentsInRegion ) return true; @@ -267,6 +290,13 @@ bool BamRandomAccessController::SetRegion(BamReaderPrivate* reader, // This covers 'corner case' where a region is requested that lies beyond the last // alignment on a reference. If this occurs, any subsequent calls to GetNextAlignment[Core] // will not return data. BamMultiReader will still be able to successfully pull alignments - // from a region from multiple files even if one or more have no data. - return m_index->Jump(m_region, &m_hasAlignmentsInRegion); + // from a region from other files even if this one has no data. + if ( !m_index->Jump(m_region, &m_hasAlignmentsInRegion) ) { + const string indexError = m_index->GetErrorString(); + const string message = string("could not set region\n\t") + indexError; + SetErrorString("BamRandomAccessController::OpenIndex", message); + return false; + } + else + return true; } diff --git a/src/api/internal/BamRandomAccessController_p.h b/src/api/internal/BamRandomAccessController_p.h index 86af28c..ff902b3 100644 --- a/src/api/internal/BamRandomAccessController_p.h +++ b/src/api/internal/BamRandomAccessController_p.h @@ -2,7 +2,7 @@ // BamRandomAccessController_p.h (c) 2011 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 24 February 2011(DB) +// Last modified: 6 October 2011(DB) // --------------------------------------------------------------------------- // Manages random access operations in a BAM file // *************************************************************************** @@ -44,13 +44,10 @@ class BamRandomAccessController { BamRandomAccessController(void); ~BamRandomAccessController(void); - // general interface + // BamRandomAccessController interface public: - void Close(void); - // index operations - public: - // + // index methods void ClearIndex(void); bool CreateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& type); bool HasIndex(void) const; @@ -60,31 +57,37 @@ class BamRandomAccessController { void SetIndex(BamIndex* index); void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); - // region operations - public: + // region methods void ClearRegion(void); bool HasRegion(void) const; RegionState AlignmentState(const BamAlignment& alignment) const; bool RegionHasAlignments(void) const; - bool SetRegion(BamReaderPrivate* reader, - const BamRegion& region, - const int& referenceCount); + bool SetRegion(const BamRegion& region, const int& referenceCount); - // 'internal' methods - public: + // general methods + void Close(void); + std::string GetErrorString(void) const; + + // internal methods + private: // adjusts requested region if necessary (depending on where data actually begins) void AdjustRegion(const int& referenceCount); + // error-string handling + void SetErrorString(const std::string& where, const std::string& what); // data members private: // index data - BamIndex* m_index; // owns index, not a copy - responsible for deleting + BamIndex* m_index; // owns the index, not a copy - responsible for deleting BamIndex::IndexCacheMode m_indexCacheMode; // region data BamRegion m_region; bool m_hasAlignmentsInRegion; + + // general data + std::string m_errorString; }; } // namespace Internal diff --git a/src/api/internal/BamReader_p.cpp b/src/api/internal/BamReader_p.cpp index 65b19f3..7707017 100644 --- a/src/api/internal/BamReader_p.cpp +++ b/src/api/internal/BamReader_p.cpp @@ -2,13 +2,14 @@ // BamReader_p.cpp (c) 2009 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 May 2011 (DB) +// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** #include #include +#include #include #include #include @@ -19,6 +20,7 @@ using namespace BamTools; using namespace BamTools::Internal; #include +#include #include #include #include @@ -38,24 +40,52 @@ BamReaderPrivate::~BamReaderPrivate(void) { } // closes the BAM file -void BamReaderPrivate::Close(void) { +bool BamReaderPrivate::Close(void) { - // clear header & reference data + // clear BAM metadata m_references.clear(); m_header.Clear(); - // close internal - m_randomAccessController.Close(); - m_stream.Close(); - // clear filename m_filename.clear(); + + // close random access controller + m_randomAccessController.Close(); + + // if stream is open, attempt close + if ( IsOpen() ) { + try { + m_stream.Close(); + } catch ( BamException& e ) { + const string streamError = e.what(); + const string message = string("encountered error closing BAM file: \n\t") + streamError; + SetErrorString("BamReader::Close", message); + return false; + } + } + + // return success + return true; } // creates an index file of requested type on current BAM file bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) { - if ( !IsOpen() ) return false; - return m_randomAccessController.CreateIndex(this, type); + + // skip if BAM file not open + if ( !IsOpen() ) { + SetErrorString("BamReader::CreateIndex", "cannot create index on unopened BAM file"); + return false; + } + + // attempt to create index + if ( m_randomAccessController.CreateIndex(this, type) ) + return true; + else { + const string bracError = m_randomAccessController.GetErrorString(); + const string message = string("could not create index: \n\t") + bracError; + SetErrorString("BamReader::CreateIndex", message); + return false; + } } // return path & filename of current BAM file @@ -63,6 +93,10 @@ const string BamReaderPrivate::Filename(void) const { return m_filename; } +string BamReaderPrivate::GetErrorString(void) const { + return m_errorString; +} + // return header data as std::string string BamReaderPrivate::GetHeaderText(void) const { return m_header.ToString(); @@ -83,7 +117,14 @@ bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) { alignment.Filename = m_filename; // return success/failure of parsing char data - return alignment.BuildCharData(); + if ( alignment.BuildCharData() ) + return true; + else { + const string alError = alignment.GetErrorString(); + const string message = string("could not populate alignment data: \n\t") + alError; + SetErrorString("BamReader::GetNextAlignment", message); + return false; + } } // no valid alignment found @@ -96,43 +137,52 @@ bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) { // useful for operations requiring ONLY positional or other alignment-related information bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) { - // skip if region is set but has no alignments - if ( m_randomAccessController.HasRegion() && - !m_randomAccessController.RegionHasAlignments() ) - { - return false; - } - - // if can't read next alignment - if ( !LoadNextAlignment(alignment) ) - return false; - - // check alignment's region-overlap state - BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment); - - // if alignment starts after region, no need to keep reading - if ( state == BamRandomAccessController::AfterRegion ) - return false; + try { - // read until overlap is found - while ( state != BamRandomAccessController::OverlapsRegion ) { + // skip if region is set but has no alignments + if ( m_randomAccessController.HasRegion() && + !m_randomAccessController.RegionHasAlignments() ) + { + return false; + } // if can't read next alignment if ( !LoadNextAlignment(alignment) ) return false; // check alignment's region-overlap state - state = m_randomAccessController.AlignmentState(alignment); + BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment); // if alignment starts after region, no need to keep reading if ( state == BamRandomAccessController::AfterRegion ) return false; - } - // if we get here, we found the next 'valid' alignment - // (e.g. overlaps current region if one was set, simply the next alignment if not) - alignment.SupportData.HasCoreOnly = true; - return true; + // read until overlap is found + while ( state != BamRandomAccessController::OverlapsRegion ) { + + // if can't read next alignment + if ( !LoadNextAlignment(alignment) ) + return false; + + // check alignment's region-overlap state + state = m_randomAccessController.AlignmentState(alignment); + + // if alignment starts after region, no need to keep reading + if ( state == BamRandomAccessController::AfterRegion ) + return false; + } + + // if we get here, we found the next 'valid' alignment + // (e.g. overlaps current region if one was set, simply the next alignment if not) + alignment.SupportData.HasCoreOnly = true; + return true; + + } catch ( BamException& e ) { + const string streamError = e.what(); + const string message = string("encountered error reading BAM alignment: \n\t") + streamError; + SetErrorString("BamReader::GetNextAlignmentCore", message); + return false; + } } int BamReaderPrivate::GetReferenceCount(void) const { @@ -168,8 +218,8 @@ bool BamReaderPrivate::IsOpen(void) const { } // load BAM header data -bool BamReaderPrivate::LoadHeaderData(void) { - return m_header.Load(&m_stream); +void BamReaderPrivate::LoadHeaderData(void) { + m_header.Load(&m_stream); } // populates BamAlignment with alignment data under file pointer, returns success/fail @@ -180,7 +230,8 @@ bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) { m_stream.Read(buffer, sizeof(uint32_t)); alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer); if ( m_isBigEndian ) BamTools::SwapEndian_32(alignment.SupportData.BlockLength); - if ( alignment.SupportData.BlockLength == 0 ) return false; + if ( alignment.SupportData.BlockLength == 0 ) + return false; // read in core alignment data, make sure the right size of data was read char x[Constants::BAM_CORE_SIZE]; @@ -217,12 +268,12 @@ bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) { // read in character data - make sure proper data size was read bool readCharDataOK = false; const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE; - char* allCharData = (char*)calloc(sizeof(char), dataLength); + RaiiBuffer allCharData(dataLength); - if ( m_stream.Read(allCharData, dataLength) == (signed int)dataLength ) { + if ( m_stream.Read(allCharData.Buffer, dataLength) == dataLength ) { // store 'allCharData' in supportData structure - alignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); + alignment.SupportData.AllCharData.assign((const char*)allCharData.Buffer, dataLength); // set success flag readCharDataOK = true; @@ -231,7 +282,7 @@ bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) { // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly, // even when GetNextAlignmentCore() is called const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength; - uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); + uint32_t* cigarData = (uint32_t*)(allCharData.Buffer + cigarDataOffset); CigarOp op; alignment.CigarData.clear(); alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations); @@ -249,8 +300,7 @@ bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) { } } - // clean up & return parsing success/failure - free(allCharData); + // return success/failure return readCharDataOK; } @@ -271,22 +321,19 @@ bool BamReaderPrivate::LoadReferenceData(void) { m_stream.Read(buffer, sizeof(uint32_t)); uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer); if ( m_isBigEndian ) BamTools::SwapEndian_32(refNameLength); - char* refName = (char*)calloc(refNameLength, 1); + RaiiBuffer refName(refNameLength); // get reference name and reference sequence length - m_stream.Read(refName, refNameLength); + m_stream.Read(refName.Buffer, refNameLength); m_stream.Read(buffer, sizeof(int32_t)); int32_t refLength = BamTools::UnpackSignedInt(buffer); if ( m_isBigEndian ) BamTools::SwapEndian_32(refLength); // store data for reference RefData aReference; - aReference.RefName = (string)((const char*)refName); + aReference.RefName = (string)((const char*)refName.Buffer); aReference.RefLength = refLength; m_references.push_back(aReference); - - // clean up calloc-ed temp variable - free(refName); } // return success @@ -294,70 +341,106 @@ bool BamReaderPrivate::LoadReferenceData(void) { } bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) { - return m_randomAccessController.LocateIndex(this, preferredType); + + if ( m_randomAccessController.LocateIndex(this, preferredType) ) + return true; + else { + const string bracError = m_randomAccessController.GetErrorString(); + const string message = string("could not locate index: \n\t") + bracError; + SetErrorString("BamReader::LocateIndex", message); + return false; + } } // opens BAM file (and index) bool BamReaderPrivate::Open(const string& filename) { - // close current BAM file if open - if ( m_stream.IsOpen ) - Close(); + bool result; - // attempt to open BgzfStream for reading - if ( !m_stream.Open(filename, "rb") ) { - cerr << "BamReader ERROR: Could not open BGZF stream for " << filename << endl; - return false; - } + try { - // attempt to load header data - if ( !LoadHeaderData() ) { - cerr << "BamReader ERROR: Could not load header data for " << filename << endl; + // make sure we're starting with fresh state Close(); - return false; - } - // attempt to load reference data - if ( !LoadReferenceData() ) { - cerr << "BamReader ERROR: Could not load reference data for " << filename << endl; - Close(); + // open BgzfStream + m_stream.Open(filename, "rb"); + assert(m_stream); + + // load BAM metadata + LoadHeaderData(); + LoadReferenceData(); + + // store filename & offset of first alignment + m_filename = filename; + m_alignmentsBeginOffset = m_stream.Tell(); + + // set flag + result = true; + + } catch ( BamException& e ) { + const string error = e.what(); + const string message = string("could not open file: ") + filename + + "\n\t" + error; + SetErrorString("BamReader::Open", message); return false; } - // if all OK, store filename & offset of first alignment - m_filename = filename; - m_alignmentsBeginOffset = m_stream.Tell(); - - // return success - return true; + // return success/failure + return result; } bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) { - return m_randomAccessController.OpenIndex(indexFilename, this); + + if ( m_randomAccessController.OpenIndex(indexFilename, this) ) + return true; + else { + const string bracError = m_randomAccessController.GetErrorString(); + const string message = string("could not open index: \n\t") + bracError; + SetErrorString("BamReader::OpenIndex", message); + return false; + } } // returns BAM file pointer to beginning of alignment data bool BamReaderPrivate::Rewind(void) { - // attempt rewind to first alignment - if ( !m_stream.Seek(m_alignmentsBeginOffset) ) - return false; - - // verify that we can read first alignment - BamAlignment al; - if ( !LoadNextAlignment(al) ) - return false; - // reset region m_randomAccessController.ClearRegion(); - // rewind back to beginning of first alignment - // return success/fail of seek - return m_stream.Seek(m_alignmentsBeginOffset); + // return status of seeking back to first alignment + if ( Seek(m_alignmentsBeginOffset) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("could not rewind: \n\t") + currentError; + SetErrorString("BamReader::Rewind", message); + return false; + } } bool BamReaderPrivate::Seek(const int64_t& position) { - return m_stream.Seek(position); + + // skip if BAM file not open + if ( !IsOpen() ) { + SetErrorString("BamReader::Seek", "cannot seek on unopened BAM file"); + return false; + } + + try { + m_stream.Seek(position); + return true; + } + catch ( BamException& e ) { + const string streamError = e.what(); + const string message = string("could not seek in BAM file: \n\t") + streamError; + SetErrorString("BamReader::Seek", message); + return false; + } +} + +void BamReaderPrivate::SetErrorString(const string& where, const string& what) { + static const string SEPARATOR = ": "; + m_errorString = where + SEPARATOR + what; } void BamReaderPrivate::SetIndex(BamIndex* index) { @@ -372,7 +455,15 @@ void BamReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) { // sets current region & attempts to jump to it // returns success/failure bool BamReaderPrivate::SetRegion(const BamRegion& region) { - return m_randomAccessController.SetRegion(this, region, m_references.size()); + + if ( m_randomAccessController.SetRegion(region, m_references.size()) ) + return true; + else { + const string bracError = m_randomAccessController.GetErrorString(); + const string message = string("could not set region: \n\t") + bracError; + SetErrorString("BamReader::SetRegion", message); + return false; + } } int64_t BamReaderPrivate::Tell(void) const { diff --git a/src/api/internal/BamReader_p.h b/src/api/internal/BamReader_p.h index 2901e54..ccc3835 100644 --- a/src/api/internal/BamReader_p.h +++ b/src/api/internal/BamReader_p.h @@ -2,7 +2,7 @@ // BamReader_p.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 5 April 2011 (DB) +// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** @@ -43,7 +43,7 @@ class BamReaderPrivate { public: // file operations - void Close(void); + bool Close(void); const std::string Filename(void) const; bool IsOpen(void) const; bool Open(const std::string& filename); @@ -69,13 +69,17 @@ class BamReaderPrivate { void SetIndex(BamIndex* index); void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); + // error handling + std::string GetErrorString(void) const; + void SetErrorString(const std::string& where, const std::string& what); + // internal methods, but available as a BamReaderPrivate 'interface' // // these methods should only be used by BamTools::Internal classes // (currently only used by the BamIndex subclasses) public: // retrieves header text from BAM file - bool LoadHeaderData(void); + void LoadHeaderData(void); // retrieves BAM alignment under file pointer // (does no overlap checking or character data parsing) bool LoadNextAlignment(BamAlignment& alignment); @@ -104,6 +108,9 @@ class BamReaderPrivate { BamHeader m_header; BamRandomAccessController m_randomAccessController; BgzfStream m_stream; + + // error handling + std::string m_errorString; }; } // namespace Internal diff --git a/src/api/internal/BamStandardIndex_p.cpp b/src/api/internal/BamStandardIndex_p.cpp index 43f12eb..ad21fec 100644 --- a/src/api/internal/BamStandardIndex_p.cpp +++ b/src/api/internal/BamStandardIndex_p.cpp @@ -2,12 +2,13 @@ // BamStandardIndex.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 24 June 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the standardized BAM index format (".bai") // *************************************************************************** #include +#include #include #include using namespace BamTools; @@ -17,10 +18,13 @@ using namespace BamTools::Internal; #include #include #include -#include +#include using namespace std; +// ----------------------------------- // static BamStandardIndex constants +// ----------------------------------- + const int BamStandardIndex::MAX_BIN = 37450; // =(8^6-1)/7+1 const int BamStandardIndex::BAM_LIDX_SHIFT = 14; const string BamStandardIndex::BAI_EXTENSION = ".bai"; @@ -29,12 +33,36 @@ const int BamStandardIndex::SIZEOF_ALIGNMENTCHUNK = sizeof(uint64_t)*2; const int BamStandardIndex::SIZEOF_BINCORE = sizeof(uint32_t) + sizeof(int32_t); const int BamStandardIndex::SIZEOF_LINEAROFFSET = sizeof(uint64_t); +// ---------------------------- +// RaiiWrapper implementation +// ---------------------------- + +BamStandardIndex::RaiiWrapper::RaiiWrapper(void) + : IndexStream(0) + , Buffer(0) +{ } + +BamStandardIndex::RaiiWrapper::~RaiiWrapper(void) { + + if ( IndexStream ) { + fclose(IndexStream); + IndexStream = 0; + } + + if ( Buffer ) { + delete[] Buffer; + Buffer = 0; + } +} + +// --------------------------------- +// BamStandardIndex implementation +// --------------------------------- + // ctor BamStandardIndex::BamStandardIndex(Internal::BamReaderPrivate* reader) : BamIndex(reader) - , m_indexStream(0) , m_cacheMode(BamIndex::LimitedIndexCaching) - , m_buffer(0) , m_bufferLength(0) { m_isBigEndian = BamTools::SystemIsBigEndian(); @@ -45,14 +73,14 @@ BamStandardIndex::~BamStandardIndex(void) { CloseFile(); } -bool BamStandardIndex::AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end) { +void BamStandardIndex::AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end) { // retrieve references from reader const RefVector& references = m_reader->GetReferenceData(); // make sure left-bound position is valid if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) - return false; + throw BamException("BamStandardIndex::AdjustRegion", "invalid region requested"); // set region 'begin' begin = (unsigned int)region.LeftPosition; @@ -64,9 +92,6 @@ bool BamStandardIndex::AdjustRegion(const BamRegion& region, uint32_t& begin, ui // otherwise, set region 'end' to last reference base else end = (unsigned int)references.at(region.LeftRefID).RefLength - 1; - - // return success - return true; } void BamStandardIndex::CalculateCandidateBins(const uint32_t& begin, @@ -85,14 +110,13 @@ void BamStandardIndex::CalculateCandidateBins(const uint32_t& begin, for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { candidateBins.insert(k); } } -bool BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, +void BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, const uint64_t& minOffset, set& candidateBins, vector& offsets) { - // attempt seek to first bin - if ( !Seek(refSummary.FirstBinFilePosition, SEEK_SET) ) - return false; + // seek to first bin + Seek(refSummary.FirstBinFilePosition, SEEK_SET); // iterate over reference bins uint32_t binId; @@ -101,8 +125,7 @@ bool BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refS for ( int i = 0; i < refSummary.NumBins; ++i ) { // read bin contents (if successful, alignment chunks are now in m_buffer) - if ( !ReadBinIntoBuffer(binId, numAlignmentChunks) ) - return false; + ReadBinIntoBuffer(binId, numAlignmentChunks); // see if bin is a 'candidate bin' candidateBinIter = candidateBins.find(binId); @@ -114,17 +137,17 @@ bool BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refS // otherwise, check bin's contents against for overlap else { - unsigned int offset = 0; + size_t offset = 0; uint64_t chunkStart; uint64_t chunkStop; // iterate over alignment chunks - for (int j = 0; j < numAlignmentChunks; ++j ) { + for ( int j = 0; j < numAlignmentChunks; ++j ) { // read chunk start & stop from buffer - memcpy((char*)&chunkStart, m_buffer+offset, sizeof(uint64_t)); + memcpy((char*)&chunkStart, Resources.Buffer+offset, sizeof(uint64_t)); offset += sizeof(uint64_t); - memcpy((char*)&chunkStop, m_buffer+offset, sizeof(uint64_t)); + memcpy((char*)&chunkStop, Resources.Buffer+offset, sizeof(uint64_t)); offset += sizeof(uint64_t); // swap endian-ness if necessary @@ -147,9 +170,6 @@ bool BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refS break; } } - - // return success - return true; } uint64_t BamStandardIndex::CalculateMinOffset(const BaiReferenceSummary& refSummary, @@ -178,10 +198,10 @@ void BamStandardIndex::CheckBufferSize(char*& buffer, delete[] buffer; buffer = new char[bufferLength]; } - } catch ( std::bad_alloc ) { - cerr << "BamStandardIndex ERROR: out of memory when allocating " - << requestedBytes << " byes" << endl; - exit(1); + } catch ( std::bad_alloc& ) { + stringstream s(""); + s << "out of memory when allocating " << requestedBytes << " bytes"; + throw BamException("BamStandardIndex::CheckBufferSize", s.str()); } } @@ -195,31 +215,24 @@ void BamStandardIndex::CheckBufferSize(unsigned char*& buffer, delete[] buffer; buffer = new unsigned char[bufferLength]; } - } catch ( std::bad_alloc ) { - cerr << "BamStandardIndex ERROR: out of memory when allocating " - << requestedBytes << " byes" << endl; - exit(1); + } catch ( std::bad_alloc& ) { + stringstream s(""); + s << "out of memory when allocating " << requestedBytes << " bytes"; + throw BamException("BamStandardIndex::CheckBufferSize", s.str()); } } -bool BamStandardIndex::CheckMagicNumber(void) { +void BamStandardIndex::CheckMagicNumber(void) { // check 'magic number' to see if file is BAI index char magic[4]; - size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream); - if ( elementsRead != 4 ) { - cerr << "BamStandardIndex ERROR: could not read format 'magic number'" << endl; - return false; - } + const size_t elementsRead = fread(magic, sizeof(char), 4, Resources.IndexStream); + if ( elementsRead != 4 ) + throw BamException("BamStandardIndex::CheckMagicNumber", "could not read BAI magic number"); // compare to expected value - if ( strncmp(magic, BamStandardIndex::BAI_MAGIC, 4) != 0 ) { - cerr << "BamStandardIndex ERROR: invalid format" << endl; - return false; - } - - // otherwise OK - return true; + if ( strncmp(magic, BamStandardIndex::BAI_MAGIC, 4) != 0 ) + throw BamException("BamStandardIndex::CheckMagicNumber", "invalid BAI magic number"); } void BamStandardIndex::ClearReferenceEntry(BaiReferenceEntry& refEntry) { @@ -231,169 +244,176 @@ void BamStandardIndex::ClearReferenceEntry(BaiReferenceEntry& refEntry) { void BamStandardIndex::CloseFile(void) { // close file stream - if ( IsFileOpen() ) - fclose(m_indexStream); + if ( IsFileOpen() ) { + fclose(Resources.IndexStream); + Resources.IndexStream = 0; + } // clear index file summary data m_indexFileSummary.clear(); // clean up I/O buffer - delete[] m_buffer; - m_buffer = 0; + delete[] Resources.Buffer; + Resources.Buffer = 0; m_bufferLength = 0; } // builds index from associated BAM file & writes out to index file bool BamStandardIndex::Create(void) { - // return false if BamReader is invalid or not open + // skip if BamReader is invalid or not open if ( m_reader == 0 || !m_reader->IsOpen() ) { - cerr << "BamStandardIndex ERROR: BamReader is not open" - << ", aborting index creation" << endl; + SetErrorString("BamStandardIndex::Create", "could not create index: reader is not open"); return false; } // rewind BamReader if ( !m_reader->Rewind() ) { - cerr << "BamStandardIndex ERROR: could not rewind BamReader to create index" - << ", aborting index creation" << endl; + const string readerError = m_reader->GetErrorString(); + const string message = "could not create index: \n\t" + readerError; + SetErrorString("BamStandardIndex::Create", message); return false; } - // open new index file (read & write) - string indexFilename = m_reader->Filename() + Extension(); - if ( !OpenFile(indexFilename, "w+b") ) { - cerr << "BamStandardIndex ERROR: could not open output index file: " << indexFilename - << ", aborting index creation" << endl; - return false; - } + try { - // initialize BaiFileSummary with number of references - const int& numReferences = m_reader->GetReferenceCount(); - ReserveForSummary(numReferences); + // open new index file (read & write) + string indexFilename = m_reader->Filename() + Extension(); + OpenFile(indexFilename, "w+b"); + + // initialize BaiFileSummary with number of references + const int& numReferences = m_reader->GetReferenceCount(); + ReserveForSummary(numReferences); + + // initialize output file + WriteHeader(); + + // set up bin, ID, offset, & coordinate markers + const uint32_t defaultValue = 0xffffffffu; + uint32_t currentBin = defaultValue; + uint32_t lastBin = defaultValue; + int32_t currentRefID = defaultValue; + int32_t lastRefID = defaultValue; + uint64_t currentOffset = (uint64_t)m_reader->Tell(); + uint64_t lastOffset = currentOffset; + int32_t lastPosition = defaultValue; + + // iterate through alignments in BAM file + BamAlignment al; + BaiReferenceEntry refEntry; + while ( m_reader->LoadNextAlignment(al) ) { + + // changed to new reference + if ( lastRefID != al.RefID ) { + + // if not first reference, save previous reference data + if ( lastRefID != (int32_t)defaultValue ) { + + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // write any empty references between (but *NOT* including) lastRefID & al.RefID + for ( int i = lastRefID+1; i < al.RefID; ++i ) { + BaiReferenceEntry emptyEntry(i); + WriteReferenceEntry(emptyEntry); + } + + // update bin markers + currentOffset = lastOffset; + currentBin = al.Bin; + lastBin = al.Bin; + currentRefID = al.RefID; + } - // initialize output file - bool createdOk = true; - createdOk &= WriteHeader(); - - // set up bin, ID, offset, & coordinate markers - const uint32_t defaultValue = 0xffffffffu; - uint32_t currentBin = defaultValue; - uint32_t lastBin = defaultValue; - int32_t currentRefID = defaultValue; - int32_t lastRefID = defaultValue; - uint64_t currentOffset = (uint64_t)m_reader->Tell(); - uint64_t lastOffset = currentOffset; - int32_t lastPosition = defaultValue; - - // iterate through alignments in BAM file - BamAlignment al; - BaiReferenceEntry refEntry; - while ( m_reader->LoadNextAlignment(al) ) { + // otherwise, this is first pass + // be sure to write any empty references up to (but *NOT* including) current RefID + else { + for ( int i = 0; i < al.RefID; ++i ) { + BaiReferenceEntry emptyEntry(i); + WriteReferenceEntry(emptyEntry); + } + } - // changed to new reference - if ( lastRefID != al.RefID ) { + // update reference markers + refEntry.ID = al.RefID; + lastRefID = al.RefID; + lastBin = defaultValue; + } - // if not first reference, save previous reference data - if ( lastRefID != (int32_t)defaultValue ) { + // if lastPosition greater than current alignment position - file not sorted properly + else if ( lastPosition > al.Position ) { + stringstream s(""); + s << "BAM file is not properly sorted by coordinate" << endl + << "Current alignment position: " << al.Position + << " < previous alignment position: " << lastPosition + << " on reference ID: " << al.RefID << endl; + SetErrorString("BamStandardIndex::Create", s.str()); + return false; + } - SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); - createdOk &= WriteReferenceEntry(refEntry); - ClearReferenceEntry(refEntry); + // if alignment's ref ID is valid & its bin is not a 'leaf' + if ( (al.RefID >= 0) && (al.Bin < 4681) ) + SaveLinearOffsetEntry(refEntry.LinearOffsets, al.Position, al.GetEndPosition(), lastOffset); - // write any empty references between (but *NOT* including) lastRefID & al.RefID - for ( int i = lastRefID+1; i < al.RefID; ++i ) { - BaiReferenceEntry emptyEntry(i); - createdOk &= WriteReferenceEntry(emptyEntry); - } + // changed to new BAI bin + if ( al.Bin != lastBin ) { - // update bin markers + // if not first bin on reference, save previous bin data + if ( currentBin != defaultValue ) + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + + // update markers currentOffset = lastOffset; currentBin = al.Bin; lastBin = al.Bin; currentRefID = al.RefID; - } - // first pass - // write any empty references up to (but *NOT* including) al.RefID - else { - for ( int i = 0; i < al.RefID; ++i ) { - BaiReferenceEntry emptyEntry(i); - createdOk &= WriteReferenceEntry(emptyEntry); - } + // if invalid RefID, break out + if ( currentRefID < 0 ) + break; } - // update reference markers - refEntry.ID = al.RefID; - lastRefID = al.RefID; - lastBin = defaultValue; - } + // make sure that current file pointer is beyond lastOffset + if ( m_reader->Tell() <= (int64_t)lastOffset ) { + SetErrorString("BamStandardIndex::Create", "calculating offsets failed"); + return false; + } - // if lastPosition greater than current alignment position - file not sorted properly - else if ( lastPosition > al.Position ) { - cerr << "BamStandardIndex ERROR: BAM file is not properly sorted by coordinate" - << ", aborting index creation" - << endl - << "At alignment: " << al.Name - << " : previous position " << lastPosition - << " > this alignment position " << al.Position - << " on reference id: " << al.RefID << endl; - return false; + // update lastOffset & lastPosition + lastOffset = m_reader->Tell(); + lastPosition = al.Position; } - // if alignment's ref ID is valid & its bin is not a 'leaf' - if ( (al.RefID >= 0) && (al.Bin < 4681) ) - SaveLinearOffsetEntry(refEntry.LinearOffsets, al.Position, al.GetEndPosition(), lastOffset); - - // changed to new BAI bin - if ( al.Bin != lastBin ) { - - // if not first bin on reference, save previous bin data - if ( currentBin != defaultValue ) - SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + // after finishing alignments, if any data was read, check: + if ( currentRefID >= 0 ) { - // update markers - currentOffset = lastOffset; - currentBin = al.Bin; - lastBin = al.Bin; - currentRefID = al.RefID; + // store last alignment chunk to its bin, then write last reference entry with data + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + WriteReferenceEntry(refEntry); - // if invalid RefID, break out - if ( currentRefID < 0 ) - break; - } - - // make sure that current file pointer is beyond lastOffset - if ( m_reader->Tell() <= (int64_t)lastOffset ) { - cerr << "BamStandardIndex ERROR: calculating offsets failed" - << ", aborting index creation" << endl; - return false; + // then write any empty references remaining at end of file + for ( int i = currentRefID+1; i < numReferences; ++i ) { + BaiReferenceEntry emptyEntry(i); + WriteReferenceEntry(emptyEntry); + } } - // update lastOffset & lastPosition - lastOffset = m_reader->Tell(); - lastPosition = al.Position; + } catch ( BamException& e) { + m_errorString = e.what(); + return false; } - // after finishing alignments, if any data was read, check: - if ( currentRefID >= 0 ) { - - // store last alignment chunk to its bin, then write last reference entry with data - SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); - createdOk &= WriteReferenceEntry(refEntry); - - // then write any empty references remaining at end of file - for ( int i = currentRefID+1; i < numReferences; ++i ) { - BaiReferenceEntry emptyEntry(i); - createdOk &= WriteReferenceEntry(emptyEntry); - } + // rewind BamReader + if ( !m_reader->Rewind() ) { + const string readerError = m_reader->GetErrorString(); + const string message = "could not create index: \n\t" + readerError; + SetErrorString("BamStandardIndex::Create", message); + return false; } - // rewind reader now that we're done building - createdOk &= m_reader->Rewind(); - - // return result - return createdOk; + // return success + return true; } // returns format's file extension @@ -401,11 +421,11 @@ const string BamStandardIndex::Extension(void) { return BamStandardIndex::BAI_EXTENSION; } -bool BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { +void BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { // cannot calculate offsets if unknown/invalid reference ID requested if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() ) - return false; + throw BamException("BamStandardIndex::GetOffset", "invalid reference ID requested"); // retrieve index summary for left bound reference const BaiReferenceSummary& refSummary = m_indexFileSummary.at(region.LeftRefID); @@ -413,10 +433,7 @@ bool BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* // set up region boundaries based on actual BamReader data uint32_t begin; uint32_t end; - if ( !AdjustRegion(region, begin, end) ) { - cerr << "BamStandardIndex ERROR: cannot calculate offsets on invalid region" << endl; - return false; - } + AdjustRegion(region, begin, end); // retrieve all candidate bin IDs for region set candidateBins; @@ -427,18 +444,11 @@ bool BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* const uint64_t& minOffset = CalculateMinOffset(refSummary, begin); // attempt to use reference summary, minOffset, & candidateBins to calculate offsets - // no data should not be error + // no data should not be error, just bail vector offsets; - if ( !CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets) ) { - cerr << "BamStandardIndex ERROR: could not calculate candidate offsets for requested region" << endl; - return false; - } - - // if not candidate offsets are present in the indexed (most likely sparce coverage) - // then silently bail - if( offsets.size() == 0 ) { - return false; - } + CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets); + if ( offsets.empty() ) + return; // ensure that offsets are sorted before processing sort( offsets.begin(), offsets.end() ); @@ -459,9 +469,9 @@ bool BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* // attempt seek to candidate offset const int64_t& candidateOffset = (*offsetIter); if ( !m_reader->Seek(candidateOffset) ) { - cerr << "BamStandardIndex ERROR: could not jump" - << ", there was a problem seeking in BAM file" << endl; - return false; + const string readerError = m_reader->GetErrorString(); + const string message = "could not seek in BAM file: \n\t" + readerError; + throw BamException("BamToolsIndex::GetOffset", message); } // load first available alignment, setting flag to true if data exists @@ -474,13 +484,10 @@ bool BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* } else count = step; } - // seek back to the offset before the 'current offset' (to cover overlaps) + // step back to the offset before the 'current offset' (to make sure we cover overlaps) if ( offsetIter != offsets.begin() ) --offsetIter; offset = (*offsetIter); - - // return succes - return true; } // returns whether reference has alignments or no @@ -492,7 +499,7 @@ bool BamStandardIndex::HasAlignments(const int& referenceID) const { } bool BamStandardIndex::IsFileOpen(void) const { - return ( m_indexStream != 0 ); + return ( Resources.IndexStream != 0 ); } // attempts to use index data to jump to @region, returns success/fail @@ -504,15 +511,18 @@ bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion // clear out flag *hasAlignmentsInRegion = false; - // skip if reader is not valid or is not open - if ( m_reader == 0 || !m_reader->IsOpen() ) + // skip if invalid reader or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) { + SetErrorString("BamStandardIndex::Jump", "could not jump: reader is not open"); return false; + } // calculate nearest offset to jump to int64_t offset; - if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) { - cerr << "BamStandardIndex ERROR: could not jump" - << ", unable to calculate offset for specified region" << endl; + try { + GetOffset(region, offset, hasAlignmentsInRegion); + } catch ( BamException& e ) { + m_errorString = e.what(); return false; } @@ -528,31 +538,24 @@ bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion // loads existing data from file into memory bool BamStandardIndex::Load(const std::string& filename) { - // attempt open index file (read-only) - if ( !OpenFile(filename, "rb") ) { - cerr << "BamStandardIndex ERROR: could not open input index file: " << filename - << ", aborting index load" << endl; - return false; - } + try { - // if invalid format 'magic number', close & return failure - if ( !CheckMagicNumber() ) { - cerr << "BamStandardIndex ERROR: unexpected format for index file: " << filename - << ", aborting index load" << endl; - CloseFile(); - return false; - } + // attempt to open file (read-only) + OpenFile(filename, "rb"); + + // validate format + CheckMagicNumber(); + + // load in-memory summary of index data + SummarizeIndexFile(); - // attempt to load index file summary, return success/failure - if ( !SummarizeIndexFile() ) { - cerr << "BamStandardIndex ERROR: could not generate a summary of index file " << filename - << ", aborting index load" << endl; - CloseFile(); + // return success + return true; + + } catch ( BamException& e ) { + m_errorString = e.what(); return false; } - - // if we get here, index summary is loaded OK - return true; } uint64_t BamStandardIndex::LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index) { @@ -560,13 +563,11 @@ uint64_t BamStandardIndex::LookupLinearOffset(const BaiReferenceSummary& refSumm // attempt seek to proper index file position const int64_t linearOffsetFilePosition = (int64_t)refSummary.FirstLinearOffsetFilePosition + index*BamStandardIndex::SIZEOF_LINEAROFFSET; - if ( !Seek(linearOffsetFilePosition, SEEK_SET) ) - return 0; + Seek(linearOffsetFilePosition, SEEK_SET); // read linear offset from BAI file - uint64_t linearOffset(0); - if ( !ReadLinearOffset(linearOffset) ) - return 0; + uint64_t linearOffset; + ReadLinearOffset(linearOffset); return linearOffset; } @@ -611,82 +612,85 @@ void BamStandardIndex::MergeAlignmentChunks(BaiAlignmentChunkVector& chunks) { chunks = mergedChunks; } -bool BamStandardIndex::OpenFile(const std::string& filename, const char* mode) { +void BamStandardIndex::OpenFile(const std::string& filename, const char* mode) { // make sure any previous index file is closed CloseFile(); // attempt to open file - m_indexStream = fopen(filename.c_str(), mode); - return IsFileOpen(); + Resources.IndexStream = fopen(filename.c_str(), mode); + if ( !IsFileOpen() ) { + const string message = string("could not open file: ") + filename; + throw BamException("BamStandardIndex::OpenFile", message); + } } -bool BamStandardIndex::ReadBinID(uint32_t& binId) { - size_t elementsRead = 0; - elementsRead += fread(&binId, sizeof(binId), 1, m_indexStream); +void BamStandardIndex::ReadBinID(uint32_t& binId) { + const size_t elementsRead = fread(&binId, sizeof(binId), 1, Resources.IndexStream); if ( m_isBigEndian ) SwapEndian_32(binId); - return ( elementsRead == 1 ); + if ( elementsRead != 1 ) + throw BamException("BamStandardIndex::ReadBinID", "could not read BAI bin ID"); } -bool BamStandardIndex::ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks) { - - bool readOk = true; +void BamStandardIndex::ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks) { // read bin header - readOk &= ReadBinID(binId); - readOk &= ReadNumAlignmentChunks(numAlignmentChunks); + ReadBinID(binId); + ReadNumAlignmentChunks(numAlignmentChunks); // read bin contents const unsigned int bytesRequested = numAlignmentChunks*BamStandardIndex::SIZEOF_ALIGNMENTCHUNK; - readOk &= ReadIntoBuffer(bytesRequested); - - // return success/failure - return readOk; + ReadIntoBuffer(bytesRequested); } -bool BamStandardIndex::ReadIntoBuffer(const unsigned int& bytesRequested) { +void BamStandardIndex::ReadIntoBuffer(const unsigned int& bytesRequested) { // ensure that our buffer is big enough for request - BamStandardIndex::CheckBufferSize(m_buffer, m_bufferLength, bytesRequested); + BamStandardIndex::CheckBufferSize(Resources.Buffer, m_bufferLength, bytesRequested); // read from BAI file stream - size_t bytesRead = fread( m_buffer, sizeof(char), bytesRequested, m_indexStream ); - return ( bytesRead == (size_t)bytesRequested ); + const size_t bytesRead = fread( Resources.Buffer, sizeof(char), bytesRequested, Resources.IndexStream ); + if ( bytesRead != (size_t)bytesRequested ) { + stringstream s(""); + s << "expected to read: " << bytesRequested << " bytes, " + << "but instead read: " << bytesRead; + throw BamException("BamStandardIndex::ReadIntoBuffer", s.str()); + } } -bool BamStandardIndex::ReadLinearOffset(uint64_t& linearOffset) { - size_t elementsRead = 0; - elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream); +void BamStandardIndex::ReadLinearOffset(uint64_t& linearOffset) { + const size_t elementsRead = fread(&linearOffset, sizeof(linearOffset), 1, Resources.IndexStream); if ( m_isBigEndian ) SwapEndian_64(linearOffset); - return ( elementsRead == 1 ); + if ( elementsRead != 1 ) + throw BamException("BamStandardIndex::ReadLinearOffset", "could not read BAI linear offset"); } -bool BamStandardIndex::ReadNumAlignmentChunks(int& numAlignmentChunks) { - size_t elementsRead = 0; - elementsRead += fread(&numAlignmentChunks, sizeof(numAlignmentChunks), 1, m_indexStream); +void BamStandardIndex::ReadNumAlignmentChunks(int& numAlignmentChunks) { + const size_t elementsRead = fread(&numAlignmentChunks, sizeof(numAlignmentChunks), 1, Resources.IndexStream); if ( m_isBigEndian ) SwapEndian_32(numAlignmentChunks); - return ( elementsRead == 1 ); + if ( elementsRead != 1 ) + throw BamException("BamStandardIndex::ReadNumAlignmentChunks", "could not read BAI chunk count"); } -bool BamStandardIndex::ReadNumBins(int& numBins) { - size_t elementsRead = 0; - elementsRead += fread(&numBins, sizeof(numBins), 1, m_indexStream); +void BamStandardIndex::ReadNumBins(int& numBins) { + const size_t elementsRead = fread(&numBins, sizeof(numBins), 1, Resources.IndexStream); if ( m_isBigEndian ) SwapEndian_32(numBins); - return ( elementsRead == 1 ); + if ( elementsRead != 1 ) + throw BamException("BamStandardIndex::ReadNumBins", "could not read BAI bin count"); } -bool BamStandardIndex::ReadNumLinearOffsets(int& numLinearOffsets) { - size_t elementsRead = 0; - elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream); +void BamStandardIndex::ReadNumLinearOffsets(int& numLinearOffsets) { + const size_t elementsRead = fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, Resources.IndexStream); if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); - return ( elementsRead == 1 ); + if ( elementsRead != 1 ) + throw BamException("BamStandardIndex::ReadNumAlignmentChunks", "could not read BAI linear offset count"); } -bool BamStandardIndex::ReadNumReferences(int& numReferences) { - size_t elementsRead = 0; - elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); +void BamStandardIndex::ReadNumReferences(int& numReferences) { + const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, Resources.IndexStream); if ( m_isBigEndian ) SwapEndian_32(numReferences); - return ( elementsRead == 1 ); + if ( elementsRead != 1 ) + throw BamException("BamStandardIndex::ReadNumReferences", "could not read reference count"); } void BamStandardIndex::ReserveForSummary(const int& numReferences) { @@ -695,15 +699,13 @@ void BamStandardIndex::ReserveForSummary(const int& numReferences) { } void BamStandardIndex::SaveAlignmentChunkToBin(BaiBinMap& binMap, - const uint32_t& currentBin, - const uint64_t& currentOffset, - const uint64_t& lastOffset) + const uint32_t& currentBin, + const uint64_t& currentOffset, + const uint64_t& lastOffset) { // create new alignment chunk BaiAlignmentChunk newChunk(currentOffset, lastOffset); - - // if no entry exists yet for this bin, create one and store alignment chunk BaiBinMap::iterator binIter = binMap.find(currentBin); if ( binIter == binMap.end() ) { @@ -754,8 +756,9 @@ void BamStandardIndex::SaveLinearOffsetsSummary(const int& refId, const int& num } // seek to position in index file stream -bool BamStandardIndex::Seek(const int64_t& position, const int& origin) { - return ( fseek64(m_indexStream, position, origin) == 0 ); +void BamStandardIndex::Seek(const int64_t& position, const int& origin) { + if ( fseek64(Resources.IndexStream, position, origin) != 0 ) + throw BamException("BamStandardIndex::Seek", "could not seek in BAI file"); } // change the index caching behavior @@ -764,99 +767,77 @@ void BamStandardIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) { // do nothing else here ? cache mode will be ignored from now on, most likely } -bool BamStandardIndex::SkipBins(const int& numBins) { +void BamStandardIndex::SkipBins(const int& numBins) { uint32_t binId; int32_t numAlignmentChunks; - bool skippedOk = true; for (int i = 0; i < numBins; ++i) - skippedOk &= ReadBinIntoBuffer(binId, numAlignmentChunks); // results & buffer ignored - return skippedOk; + ReadBinIntoBuffer(binId, numAlignmentChunks); // results & buffer ignored } -bool BamStandardIndex::SkipLinearOffsets(const int& numLinearOffsets) { +void BamStandardIndex::SkipLinearOffsets(const int& numLinearOffsets) { const unsigned int bytesRequested = numLinearOffsets*BamStandardIndex::SIZEOF_LINEAROFFSET; - return ReadIntoBuffer(bytesRequested); + ReadIntoBuffer(bytesRequested); } void BamStandardIndex::SortLinearOffsets(BaiLinearOffsetVector& linearOffsets) { sort( linearOffsets.begin(), linearOffsets.end() ); } -bool BamStandardIndex::SummarizeBins(BaiReferenceSummary& refSummary) { +void BamStandardIndex::SummarizeBins(BaiReferenceSummary& refSummary) { // load number of bins int numBins; - if ( !ReadNumBins(numBins) ) - return false; + ReadNumBins(numBins); // store bins summary for this reference refSummary.NumBins = numBins; refSummary.FirstBinFilePosition = Tell(); - // attempt skip reference bins, return success/failure - if ( !SkipBins(numBins) ) - return false; - - // if we get here, bin summarized OK - return true; + // skip this reference's bins + SkipBins(numBins); } -bool BamStandardIndex::SummarizeIndexFile(void) { +void BamStandardIndex::SummarizeIndexFile(void) { // load number of reference sequences int numReferences; - if ( !ReadNumReferences(numReferences) ) - return false; + ReadNumReferences(numReferences); // initialize file summary data ReserveForSummary(numReferences); // iterate over reference entries - bool loadedOk = true; BaiFileSummary::iterator summaryIter = m_indexFileSummary.begin(); BaiFileSummary::iterator summaryEnd = m_indexFileSummary.end(); for ( int i = 0; summaryIter != summaryEnd; ++summaryIter, ++i ) - loadedOk &= SummarizeReference(*summaryIter); - - // return result - return loadedOk; + SummarizeReference(*summaryIter); } -bool BamStandardIndex::SummarizeLinearOffsets(BaiReferenceSummary& refSummary) { +void BamStandardIndex::SummarizeLinearOffsets(BaiReferenceSummary& refSummary) { // load number of linear offsets int numLinearOffsets; - if ( !ReadNumLinearOffsets(numLinearOffsets) ) - return false; + ReadNumLinearOffsets(numLinearOffsets); // store bin summary data for this reference refSummary.NumLinearOffsets = numLinearOffsets; refSummary.FirstLinearOffsetFilePosition = Tell(); // skip linear offsets in index file - if ( !SkipLinearOffsets(numLinearOffsets) ) - return false; - - // if get here, linear offsets summarized OK - return true; + SkipLinearOffsets(numLinearOffsets); } -bool BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary) { - - bool loadedOk = true; - loadedOk &= SummarizeBins(refSummary); - loadedOk &= SummarizeLinearOffsets(refSummary); - return loadedOk; +void BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary) { + SummarizeBins(refSummary); + SummarizeLinearOffsets(refSummary); } // return position of file pointer in index file stream int64_t BamStandardIndex::Tell(void) const { - return ftell64(m_indexStream); + return ftell64(Resources.IndexStream); } -bool BamStandardIndex::WriteAlignmentChunk(const BaiAlignmentChunk& chunk) { - - size_t elementsWritten = 0; +void BamStandardIndex::WriteAlignmentChunk(const BaiAlignmentChunk& chunk) { // localize alignment chunk offsets uint64_t start = chunk.Start; @@ -869,92 +850,81 @@ bool BamStandardIndex::WriteAlignmentChunk(const BaiAlignmentChunk& chunk) { } // write to index file - elementsWritten += fwrite(&start, sizeof(start), 1, m_indexStream); - elementsWritten += fwrite(&stop, sizeof(stop), 1, m_indexStream); - - // return success/failure of write - return ( elementsWritten == 2 ); + size_t elementsWritten = 0; + elementsWritten += fwrite(&start, sizeof(start), 1, Resources.IndexStream); + elementsWritten += fwrite(&stop, sizeof(stop), 1, Resources.IndexStream); + if ( elementsWritten != 2 ) + throw BamException("BamStandardIndex::WriteAlignmentChunk", "could not write BAI alignment chunk"); } -bool BamStandardIndex::WriteAlignmentChunks(BaiAlignmentChunkVector& chunks) { +void BamStandardIndex::WriteAlignmentChunks(BaiAlignmentChunkVector& chunks) { // make sure chunks are merged (simplified) before writing & saving summary MergeAlignmentChunks(chunks); - size_t elementsWritten = 0; - // write chunks int32_t chunkCount = chunks.size(); if ( m_isBigEndian ) SwapEndian_32(chunkCount); - elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_indexStream); + const size_t elementsWritten = fwrite(&chunkCount, sizeof(chunkCount), 1, Resources.IndexStream); + if ( elementsWritten != 1 ) + throw BamException("BamStandardIndex::WriteAlignmentChunks", "could not write BAI chunk count"); // iterate over chunks - bool chunksOk = true; BaiAlignmentChunkVector::const_iterator chunkIter = chunks.begin(); BaiAlignmentChunkVector::const_iterator chunkEnd = chunks.end(); for ( ; chunkIter != chunkEnd; ++chunkIter ) - chunksOk &= WriteAlignmentChunk( (*chunkIter) ); - - // return success/failure of write - return ( (elementsWritten == 1) && chunksOk ); + WriteAlignmentChunk( (*chunkIter) ); } -bool BamStandardIndex::WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks) { - - size_t elementsWritten = 0; +void BamStandardIndex::WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks) { // write BAM bin ID uint32_t binKey = binId; if ( m_isBigEndian ) SwapEndian_32(binKey); - elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_indexStream); + const size_t elementsWritten = fwrite(&binKey, sizeof(binKey), 1, Resources.IndexStream); + if ( elementsWritten != 1 ) + throw BamException("BamStandardIndex::WriteBin", "could not write bin ID"); // write bin's alignment chunks - bool chunksOk = WriteAlignmentChunks(chunks); - - // return success/failure of write - return ( (elementsWritten == 1) && chunksOk ); + WriteAlignmentChunks(chunks); } -bool BamStandardIndex::WriteBins(const int& refId, BaiBinMap& bins) { - - size_t elementsWritten = 0; +void BamStandardIndex::WriteBins(const int& refId, BaiBinMap& bins) { // write number of bins int32_t binCount = bins.size(); if ( m_isBigEndian ) SwapEndian_32(binCount); - elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_indexStream); + const size_t elementsWritten = fwrite(&binCount, sizeof(binCount), 1, Resources.IndexStream); + if ( elementsWritten != 1 ) + throw BamException("BamStandardIndex::WriteBins", "could not write bin count"); // save summary for reference's bins SaveBinsSummary(refId, bins.size()); // iterate over bins - bool binsOk = true; BaiBinMap::iterator binIter = bins.begin(); BaiBinMap::iterator binEnd = bins.end(); for ( ; binIter != binEnd; ++binIter ) - binsOk &= WriteBin( (*binIter).first, (*binIter).second ); - - // return success/failure of write - return ( (elementsWritten == 1) && binsOk ); + WriteBin( (*binIter).first, (*binIter).second ); } -bool BamStandardIndex::WriteHeader(void) { +void BamStandardIndex::WriteHeader(void) { size_t elementsWritten = 0; // write magic number - elementsWritten += fwrite(BamStandardIndex::BAI_MAGIC, sizeof(char), 4, m_indexStream); + elementsWritten += fwrite(BamStandardIndex::BAI_MAGIC, sizeof(char), 4, Resources.IndexStream); // write number of reference sequences int32_t numReferences = m_indexFileSummary.size(); if ( m_isBigEndian ) SwapEndian_32(numReferences); - elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream); + elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, Resources.IndexStream); - // return success/failure of write - return (elementsWritten == 5); + if ( elementsWritten != 5 ) + throw BamException("BamStandardIndex::WriteHeader", "could not write BAI header"); } -bool BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets) { +void BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets) { // make sure linear offsets are sorted before writing & saving summary SortLinearOffsets(linearOffsets); @@ -964,7 +934,7 @@ bool BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVecto // write number of linear offsets int32_t offsetCount = linearOffsets.size(); if ( m_isBigEndian ) SwapEndian_32(offsetCount); - elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_indexStream); + elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, Resources.IndexStream); // save summary for reference's linear offsets SaveLinearOffsetsSummary(refId, linearOffsets.size()); @@ -977,16 +947,14 @@ bool BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVecto // write linear offset uint64_t linearOffset = (*offsetIter); if ( m_isBigEndian ) SwapEndian_64(linearOffset); - elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_indexStream); + elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, Resources.IndexStream); } - // return success/failure of write - return ( elementsWritten == (size_t)(linearOffsets.size() + 1) ); + if ( elementsWritten != (linearOffsets.size() + 1) ) + throw BamException("BamStandardIndex::WriteLinearOffsets", "could not write BAI linear offsets"); } -bool BamStandardIndex::WriteReferenceEntry(BaiReferenceEntry& refEntry) { - bool refOk = true; - refOk &= WriteBins(refEntry.ID, refEntry.Bins); - refOk &= WriteLinearOffsets(refEntry.ID, refEntry.LinearOffsets); - return refOk; +void BamStandardIndex::WriteReferenceEntry(BaiReferenceEntry& refEntry) { + WriteBins(refEntry.ID, refEntry.Bins); + WriteLinearOffsets(refEntry.ID, refEntry.LinearOffsets); } diff --git a/src/api/internal/BamStandardIndex_p.h b/src/api/internal/BamStandardIndex_p.h index 3fbc777..1bd36c8 100644 --- a/src/api/internal/BamStandardIndex_p.h +++ b/src/api/internal/BamStandardIndex_p.h @@ -2,7 +2,7 @@ // BamStandardIndex.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 24 June 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the standardized BAM index format (".bai") // *************************************************************************** @@ -128,17 +128,18 @@ class BamStandardIndex : public BamIndex { // returns format's file extension static const std::string Extension(void); - // internal file ops + // internal methods private: - bool CheckMagicNumber(void); + + // index file ops + void CheckMagicNumber(void); void CloseFile(void); bool IsFileOpen(void) const; - bool OpenFile(const std::string& filename, const char* mode); - bool Seek(const int64_t& position, const int& origin); + void OpenFile(const std::string& filename, const char* mode); + void Seek(const int64_t& position, const int& origin); int64_t Tell(void) const; - // internal BAI index building methods - private: + // BAI index building methods void ClearReferenceEntry(BaiReferenceEntry& refEntry); void SaveAlignmentChunkToBin(BaiBinMap& binMap, const uint32_t& currentBin, @@ -149,66 +150,68 @@ class BamStandardIndex : public BamIndex { const int& alignmentStopPosition, const uint64_t& lastOffset); - // internal random-access methods - private: - bool AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end); + // random-access methods + void AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end); void CalculateCandidateBins(const uint32_t& begin, const uint32_t& end, std::set& candidateBins); - bool CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, + void CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, const uint64_t& minOffset, std::set& candidateBins, std::vector& offsets); uint64_t CalculateMinOffset(const BaiReferenceSummary& refSummary, const uint32_t& begin); - bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); uint64_t LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index); - // internal BAI summary (create/load) methods - private: + // BAI summary (create/load) methods void ReserveForSummary(const int& numReferences); void SaveBinsSummary(const int& refId, const int& numBins); void SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets); - bool SkipBins(const int& numBins); - bool SkipLinearOffsets(const int& numLinearOffsets); - bool SummarizeBins(BaiReferenceSummary& refSummary); - bool SummarizeIndexFile(void); - bool SummarizeLinearOffsets(BaiReferenceSummary& refSummary); - bool SummarizeReference(BaiReferenceSummary& refSummary); - - // internal BAI full index input methods - private: - bool ReadBinID(uint32_t& binId); - bool ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks); - bool ReadIntoBuffer(const unsigned int& bytesRequested); - bool ReadLinearOffset(uint64_t& linearOffset); - bool ReadNumAlignmentChunks(int& numAlignmentChunks); - bool ReadNumBins(int& numBins); - bool ReadNumLinearOffsets(int& numLinearOffsets); - bool ReadNumReferences(int& numReferences); - - // internal BAI full index output methods - private: + void SkipBins(const int& numBins); + void SkipLinearOffsets(const int& numLinearOffsets); + void SummarizeBins(BaiReferenceSummary& refSummary); + void SummarizeIndexFile(void); + void SummarizeLinearOffsets(BaiReferenceSummary& refSummary); + void SummarizeReference(BaiReferenceSummary& refSummary); + + // BAI full index input methods + void ReadBinID(uint32_t& binId); + void ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks); + void ReadIntoBuffer(const unsigned int& bytesRequested); + void ReadLinearOffset(uint64_t& linearOffset); + void ReadNumAlignmentChunks(int& numAlignmentChunks); + void ReadNumBins(int& numBins); + void ReadNumLinearOffsets(int& numLinearOffsets); + void ReadNumReferences(int& numReferences); + + // BAI full index output methods void MergeAlignmentChunks(BaiAlignmentChunkVector& chunks); void SortLinearOffsets(BaiLinearOffsetVector& linearOffsets); - bool WriteAlignmentChunk(const BaiAlignmentChunk& chunk); - bool WriteAlignmentChunks(BaiAlignmentChunkVector& chunks); - bool WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks); - bool WriteBins(const int& refId, BaiBinMap& bins); - bool WriteHeader(void); - bool WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets); - bool WriteReferenceEntry(BaiReferenceEntry& refEntry); + void WriteAlignmentChunk(const BaiAlignmentChunk& chunk); + void WriteAlignmentChunks(BaiAlignmentChunkVector& chunks); + void WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks); + void WriteBins(const int& refId, BaiBinMap& bins); + void WriteHeader(void); + void WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets); + void WriteReferenceEntry(BaiReferenceEntry& refEntry); // data members private: - FILE* m_indexStream; - bool m_isBigEndian; + bool m_isBigEndian; BamIndex::IndexCacheMode m_cacheMode; BaiFileSummary m_indexFileSummary; // our input buffer - char* m_buffer; unsigned int m_bufferLength; + struct RaiiWrapper { + FILE* IndexStream; + char* Buffer; + RaiiWrapper(void); + ~RaiiWrapper(void); + }; + RaiiWrapper Resources; + // static methods private: // checks if the buffer is large enough to accomodate the requested size diff --git a/src/api/internal/BamToolsIndex_p.cpp b/src/api/internal/BamToolsIndex_p.cpp index 047ca6f..09f74d1 100644 --- a/src/api/internal/BamToolsIndex_p.cpp +++ b/src/api/internal/BamToolsIndex_p.cpp @@ -2,12 +2,13 @@ // BamToolsIndex.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 27 April 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the BamTools index format (".bti") // *************************************************************************** #include +#include #include #include #include @@ -23,16 +24,35 @@ using namespace BamTools::Internal; #include using namespace std; +// -------------------------------- // static BamToolsIndex constants -const int BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000; +// -------------------------------- + +const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000; const string BamToolsIndex::BTI_EXTENSION = ".bti"; const char* const BamToolsIndex::BTI_MAGIC = "BTI\1"; const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t); +// ---------------------------- +// RaiiWrapper implementation +// ---------------------------- + +BamToolsIndex::RaiiWrapper::RaiiWrapper(void) + : IndexStream(0) +{ } + +BamToolsIndex::RaiiWrapper::~RaiiWrapper(void) { + if ( IndexStream ) + fclose(IndexStream); +} + +// ------------------------------ +// BamToolsIndex implementation +// ------------------------------ + // ctor BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader) : BamIndex(reader) - , m_indexStream(0) , m_cacheMode(BamIndex::LimitedIndexCaching) , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH) , m_inputVersion(0) @@ -46,71 +66,56 @@ BamToolsIndex::~BamToolsIndex(void) { CloseFile(); } -bool BamToolsIndex::CheckMagicNumber(void) { +void BamToolsIndex::CheckMagicNumber(void) { - // check 'magic number' to see if file is BTI index + // read magic number char magic[4]; - size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream); - if ( elementsRead != 4 ) { - cerr << "BamToolsIndex ERROR: could not read format 'magic' number" << endl; - return false; - } - - if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 ) { - cerr << "BamToolsIndex ERROR: invalid format" << endl; - return false; - } + size_t elementsRead = fread(magic, sizeof(char), 4, Resources.IndexStream); + if ( elementsRead != 4 ) + throw BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number"); - // otherwise ok - return true; + // validate expected magic number + if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 ) + throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number"); } // check index file version, return true if OK -bool BamToolsIndex::CheckVersion(void) { +void BamToolsIndex::CheckVersion(void) { // read version from file - size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream); - if ( elementsRead != 1 ) return false; + size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, Resources.IndexStream); + if ( elementsRead != 1 ) + throw BamException("BamToolsIndex::CheckVersion", "could not read format version"); if ( m_isBigEndian ) SwapEndian_32(m_inputVersion); // if version is negative, or zero - if ( m_inputVersion <= 0 ) { - cerr << "BamToolsIndex ERROR: could not load index file: invalid version." - << endl; - return false; - } + if ( m_inputVersion <= 0 ) + throw BamException("BamToolsIndex::CheckVersion", "invalid format version"); // if version is newer than can be supported by this version of bamtools else if ( m_inputVersion > m_outputVersion ) { - cerr << "BamToolsIndex ERROR: could not load index file. This version of BamTools does not recognize new index file version" - << endl - << "Please update BamTools to a more recent version to support this index file." - << endl; - return false; + const string message = "unsupported format: this index was created by a newer version of BamTools. " + "Update your local version of BamTools to use the index file."; + throw BamException("BamToolsIndex::CheckVersion", message); } // ------------------------------------------------------------------ // check for deprecated, unsupported versions - // (typically whose format did not accomodate a particular bug fix) + // (the format had to be modified to accomodate a particular bug fix) else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_0 ) { - cerr << "BamToolsIndex ERROR: could not load index file. This version of the index contains a bug related to accessing data near reference ends." - << endl << endl - << "Please run 'bamtools index -bti -in yourData.bam' to generate an up-to-date, fixed BTI file." - << endl << endl; - return false; + const string message = "unsupported format: this version of the index may not properly handle " + "reference ends. Please run 'bamtools index -bti -in yourData.bam' to " + "generate an up-to-date, fixed BTI file."; + throw BamException("BamToolsIndex::CheckVersion", message); } else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_1 ) { - cerr << "BamToolsIndex ERROR: could not load index file. This version of the index contains a bug related to handling empty references." - << endl << endl - << "Please run 'bamtools index -bti -in yourData.bam' to generate an up-to-date, fixed BTI file." - << endl << endl; - return false; + const string message = "unsupported format: this version of the index may not properly handle " + "empty references. Please run 'bamtools index -bti -in yourData.bam to " + "generate an up-to-date, fixed BTI file."; + throw BamException("BamToolsIndex::CheckVersion", message); } - - // otherwise ok - else return true; } void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) { @@ -119,151 +124,155 @@ void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) { } void BamToolsIndex::CloseFile(void) { - if ( IsFileOpen() ) - fclose(m_indexStream); + if ( IsFileOpen() ) { + fclose(Resources.IndexStream); + Resources.IndexStream = 0; + } m_indexFileSummary.clear(); } // builds index from associated BAM file & writes out to index file bool BamToolsIndex::Create(void) { - // return false if BamReader is invalid or not open + // skip if BamReader is invalid or not open if ( m_reader == 0 || !m_reader->IsOpen() ) { - cerr << "BamToolsIndex ERROR: BamReader is not open" - << ", aborting index creation" << endl; + SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open"); return false; } // rewind BamReader if ( !m_reader->Rewind() ) { - cerr << "BamToolsIndex ERROR: could not rewind BamReader to create index" - << ", aborting index creation" << endl; + const string readerError = m_reader->GetErrorString(); + const string message = "could not create index: \n\t" + readerError; + SetErrorString("BamToolsIndex::Create", message); return false; } - // open new index file (read & write) - string indexFilename = m_reader->Filename() + Extension(); - if ( !OpenFile(indexFilename, "w+b") ) { - cerr << "BamToolsIndex ERROR: could not open output index file " << indexFilename - << ", aborting index creation" << endl; - return false; - } + try { + // open new index file (read & write) + string indexFilename = m_reader->Filename() + Extension(); + OpenFile(indexFilename, "w+b"); - // initialize BtiFileSummary with number of references - const int& numReferences = m_reader->GetReferenceCount(); - InitializeFileSummary(numReferences); + // initialize BtiFileSummary with number of references + const int& numReferences = m_reader->GetReferenceCount(); + InitializeFileSummary(numReferences); - // initialize output file - bool createdOk = true; - createdOk &= WriteHeader(); + // intialize output file header + WriteHeader(); - // index building markers - int32_t currentBlockCount = 0; - int64_t currentAlignmentOffset = m_reader->Tell(); - int32_t blockRefId = -1; - int32_t blockMaxEndPosition = -1; - int64_t blockStartOffset = currentAlignmentOffset; - int32_t blockStartPosition = -1; + // index building markers + uint32_t currentBlockCount = 0; + int64_t currentAlignmentOffset = m_reader->Tell(); + int32_t blockRefId = -1; + int32_t blockMaxEndPosition = -1; + int64_t blockStartOffset = currentAlignmentOffset; + int32_t blockStartPosition = -1; - // plow through alignments, storing index entries - BamAlignment al; - BtiReferenceEntry refEntry; - while ( m_reader->LoadNextAlignment(al) ) { + // plow through alignments, storing index entries + BamAlignment al; + BtiReferenceEntry refEntry; + while ( m_reader->LoadNextAlignment(al) ) { - // if moved to new reference - if ( al.RefID != blockRefId ) { + // if moved to new reference + if ( al.RefID != blockRefId ) { - // if first pass, check: - if ( currentBlockCount == 0 ) { + // if first pass, check: + if ( currentBlockCount == 0 ) { - // write any empty references up to (but not including) al.RefID - for ( int i = 0; i < al.RefID; ++i ) { - BtiReferenceEntry emptyEntry(i); - createdOk &= WriteReferenceEntry(emptyEntry); + // write any empty references up to (but not including) al.RefID + for ( int i = 0; i < al.RefID; ++i ) + WriteReferenceEntry( BtiReferenceEntry(i) ); } - } - // not first pass: - else { + // not first pass: + else { - // store previous BTI block data in reference entry - BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); - refEntry.Blocks.push_back(block); + // store previous BTI block data in reference entry + BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); - // write reference entry, then clear - createdOk &= WriteReferenceEntry(refEntry); - ClearReferenceEntry(refEntry); + // write reference entry, then clear + WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); - // write any empty references between (but not including) the last blockRefID and current al.RefID - for ( int i = blockRefId+1; i < al.RefID; ++i ) { - BtiReferenceEntry emptyEntry(i); - createdOk &= WriteReferenceEntry(emptyEntry); + // write any empty references between (but not including) + // the last blockRefID and current al.RefID + for ( int i = blockRefId+1; i < al.RefID; ++i ) + WriteReferenceEntry( BtiReferenceEntry(i) ); + + // reset block count + currentBlockCount = 0; } - // reset block count - currentBlockCount = 0; + // set ID for new reference entry + refEntry.ID = al.RefID; } - // set ID for new reference entry - refEntry.ID = al.RefID; - } + // if beginning of block, update counters + if ( currentBlockCount == 0 ) { + blockRefId = al.RefID; + blockStartOffset = currentAlignmentOffset; + blockStartPosition = al.Position; + blockMaxEndPosition = al.GetEndPosition(); + } - // if beginning of block, update counters - if ( currentBlockCount == 0 ) { - blockRefId = al.RefID; - blockStartOffset = currentAlignmentOffset; - blockStartPosition = al.Position; - blockMaxEndPosition = al.GetEndPosition(); - } + // increment block counter + ++currentBlockCount; - // increment block counter - ++currentBlockCount; + // check end position + int32_t alignmentEndPosition = al.GetEndPosition(); + if ( alignmentEndPosition > blockMaxEndPosition ) + blockMaxEndPosition = alignmentEndPosition; - // check end position - int32_t alignmentEndPosition = al.GetEndPosition(); - if ( alignmentEndPosition > blockMaxEndPosition ) - blockMaxEndPosition = alignmentEndPosition; + // if block is full, get offset for next block, reset currentBlockCount + if ( currentBlockCount == m_blockSize ) { - // if block is full, get offset for next block, reset currentBlockCount - if ( currentBlockCount == m_blockSize ) { + // store previous block data in reference entry + BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); - // store previous block data in reference entry - BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); - refEntry.Blocks.push_back(block); + // update markers + blockStartOffset = m_reader->Tell(); + currentBlockCount = 0; + } - // update markers - blockStartOffset = m_reader->Tell(); - currentBlockCount = 0; + // not the best name, but for the next iteration, this value will be the offset of the + // *current* alignment. this is necessary because we won't know if this next alignment + // is on a new reference until we actually read it + currentAlignmentOffset = m_reader->Tell(); } - // not the best name, but for the next iteration, this value will be the offset of the *current* alignment - // necessary because we won't know if this next alignment is on a new reference until we actually read it - currentAlignmentOffset = m_reader->Tell(); - } + // after finishing alignments, if any data was read, check: + if ( blockRefId >= 0 ) { - // after finishing alignments, if any data was read, check: - if ( blockRefId >= 0 ) { - - // store last BTI block data in reference entry - BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); - refEntry.Blocks.push_back(block); + // store last BTI block data in reference entry + BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); - // write last reference entry, then clear - createdOk &= WriteReferenceEntry(refEntry); - ClearReferenceEntry(refEntry); + // write last reference entry, then clear + WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); - // then write any empty references remaining at end of file - for ( int i = blockRefId+1; i < numReferences; ++i ) { - BtiReferenceEntry emptyEntry(i); - createdOk &= WriteReferenceEntry(emptyEntry); + // then write any empty references remaining at end of file + for ( int i = blockRefId+1; i < numReferences; ++i ) + WriteReferenceEntry( BtiReferenceEntry(i) ); } + + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; } - // rewind reader & return result - createdOk &= m_reader->Rewind(); + // rewind BamReader + if ( !m_reader->Rewind() ) { + const string readerError = m_reader->GetErrorString(); + const string message = "could not create index: \n\t" + readerError; + SetErrorString("BamToolsIndex::Create", message); + return false; + } - // return result - return createdOk; + // return success + return true; } // returns format's file extension @@ -271,18 +280,15 @@ const std::string BamToolsIndex::Extension(void) { return BamToolsIndex::BTI_EXTENSION; } -bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { +void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { // return false ref ID is not a valid index in file summary data if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() ) - return false; + throw BamException("BamToolsIndex::GetOffset", "invalid region requested"); // retrieve reference index data for left bound reference BtiReferenceEntry refEntry(region.LeftRefID); - if ( !ReadReferenceEntry(refEntry) ) { - cerr << "BamToolsIndex ERROR: could not retrieve index data from BTI file" << endl; - return false; - } + ReadReferenceEntry(refEntry); // binary search for an overlapping block (may not be first one though) bool found = false; @@ -337,9 +343,6 @@ bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* ha // sets to false if blocks container is empty, or if no matching block could be found *hasAlignmentsInRegion = found; - - // return success - return true; } // returns whether reference has alignments or no @@ -350,14 +353,16 @@ bool BamToolsIndex::HasAlignments(const int& referenceID) const { return ( refSummary.NumBlocks > 0 ); } +// pre-allocates space for each reference's summary data void BamToolsIndex::InitializeFileSummary(const int& numReferences) { m_indexFileSummary.clear(); for ( int i = 0; i < numReferences; ++i ) m_indexFileSummary.push_back( BtiReferenceSummary() ); } +// returns true if the index stream is open bool BamToolsIndex::IsFileOpen(void) const { - return ( m_indexStream != 0 ); + return ( Resources.IndexStream != 0 ); } // attempts to use index data to jump to @region, returns success/fail @@ -370,19 +375,24 @@ bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsI *hasAlignmentsInRegion = false; // skip if invalid reader or not open - if ( m_reader == 0 || !m_reader->IsOpen() ) + if ( m_reader == 0 || !m_reader->IsOpen() ) { + SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open"); return false; + } // make sure left-bound position is valid const RefVector& references = m_reader->GetReferenceData(); - if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) + if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) { + SetErrorString("BamToolsIndex::Jump", "could not create index: invalid region requested"); return false; + } // calculate nearest offset to jump to int64_t offset; - if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) { - cerr << "BamToolsIndex ERROR: could not jump" - << ", unable to calculate offset for specified region" << endl; + try { + GetOffset(region, offset, hasAlignmentsInRegion); + } catch ( BamException& e ) { + m_errorString = e.what(); return false; } @@ -393,116 +403,101 @@ bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsI // loads existing data from file into memory bool BamToolsIndex::Load(const std::string& filename) { - // attempt open index file (read-only) - if ( !OpenFile(filename, "rb") ) { - cerr << "BamToolsIndex ERROR: could not open input index file " << filename - << ", aborting index load" << endl; - return false; - } + try { - // attempt to load & validate BTI header data - if ( !LoadHeader() ) { - cerr << "BamToolsIndex ERROR: could load header from index file " << filename - << ", aborting index load" << endl; - CloseFile(); - return false; - } + // attempt to open file (read-only) + OpenFile(filename, "rb"); + + // load metadata & generate in-memory summary + LoadHeader(); + LoadFileSummary(); + + // return success + return true; - // attempt to load index file summary - if ( !LoadFileSummary() ) { - cerr << "BamToolsIndex ERROR: could not generate a summary of index file " << filename - << ", aborting index load" << endl; - CloseFile(); + } catch ( BamException& e ) { + m_errorString = e.what(); return false; } - - // if we get here, index summary is loaded OK - return true; } -bool BamToolsIndex::LoadFileSummary(void) { +void BamToolsIndex::LoadFileSummary(void) { // load number of reference sequences int numReferences; - if ( !LoadNumReferences(numReferences) ) - return false; + LoadNumReferences(numReferences); // initialize file summary data InitializeFileSummary(numReferences); - // iterate over reference entries - bool loadedOk = true; + // load summary for each reference BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin(); BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end(); for ( ; summaryIter != summaryEnd; ++summaryIter ) - loadedOk &= LoadReferenceSummary(*summaryIter); - - // return result - return loadedOk; + LoadReferenceSummary(*summaryIter); } -bool BamToolsIndex::LoadHeader(void) { +void BamToolsIndex::LoadHeader(void) { - // if invalid format 'magic number' - if ( !CheckMagicNumber() ) - return false; - - // if invalid BTI version - if ( !CheckVersion() ) - return false; + // check BTI file metadata + CheckMagicNumber(); + CheckVersion(); // use file's BTI block size to set member variable - size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream); + const size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, Resources.IndexStream); if ( m_isBigEndian ) SwapEndian_32(m_blockSize); - return ( elementsRead == 1 ); + if ( elementsRead != 1 ) + throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size"); } -bool BamToolsIndex::LoadNumBlocks(int& numBlocks) { - size_t elementsRead = 0; - elementsRead += fread(&numBlocks, sizeof(numBlocks), 1, m_indexStream); +void BamToolsIndex::LoadNumBlocks(int& numBlocks) { + const size_t elementsRead = fread(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream); if ( m_isBigEndian ) SwapEndian_32(numBlocks); - return ( elementsRead == 1 ); + if ( elementsRead != 1 ) + throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks"); } -bool BamToolsIndex::LoadNumReferences(int& numReferences) { - size_t elementsRead = 0; - elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); +void BamToolsIndex::LoadNumReferences(int& numReferences) { + const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, Resources.IndexStream); if ( m_isBigEndian ) SwapEndian_32(numReferences); - return ( elementsRead == 1 ); + if ( elementsRead != 1 ) + throw BamException("BamToolsIndex::LoadNumReferences", "could not read number of references"); } -bool BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) { +void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) { // load number of blocks int numBlocks; - if ( !LoadNumBlocks(numBlocks) ) - return false; + LoadNumBlocks(numBlocks); // store block summary data for this reference refSummary.NumBlocks = numBlocks; refSummary.FirstBlockFilePosition = Tell(); - // skip blocks in index file (and return status) - return SkipBlocks(numBlocks); + // skip reference's blocks + SkipBlocks(numBlocks); } -bool BamToolsIndex::OpenFile(const std::string& filename, const char* mode) { +void BamToolsIndex::OpenFile(const std::string& filename, const char* mode) { // make sure any previous index file is closed CloseFile(); // attempt to open file - m_indexStream = fopen(filename.c_str(), mode); - return IsFileOpen(); + Resources.IndexStream = fopen(filename.c_str(), mode); + if ( !IsFileOpen() ) { + const string message = string("could not open file: ") + filename; + throw BamException("BamToolsIndex::OpenFile", message); + } } -bool BamToolsIndex::ReadBlock(BtiBlock& block) { +void BamToolsIndex::ReadBlock(BtiBlock& block) { // read in block data members size_t elementsRead = 0; - elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, m_indexStream); - elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, m_indexStream); - elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, m_indexStream); + elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, Resources.IndexStream); + elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, Resources.IndexStream); + elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, Resources.IndexStream); // swap endian-ness if necessary if ( m_isBigEndian ) { @@ -511,46 +506,41 @@ bool BamToolsIndex::ReadBlock(BtiBlock& block) { SwapEndian_32(block.StartPosition); } - // return success/failure - return ( elementsRead == 3 ); + if ( elementsRead != 3 ) + throw BamException("BamToolsIndex::ReadBlock", "could not read block"); } -bool BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) { +void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) { // prep blocks container blocks.clear(); blocks.reserve(refSummary.NumBlocks); // skip to first block entry - if ( !Seek( refSummary.FirstBlockFilePosition, SEEK_SET ) ) { - cerr << "BamToolsIndex ERROR: could not seek to position " - << refSummary.FirstBlockFilePosition << endl; - return false; - } + Seek( refSummary.FirstBlockFilePosition, SEEK_SET ); // read & store block entries - bool readOk = true; BtiBlock block; for ( int i = 0; i < refSummary.NumBlocks; ++i ) { - readOk &= ReadBlock(block); + ReadBlock(block); blocks.push_back(block); } - return readOk; } -bool BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) { +void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) { // return false if refId not valid index in file summary structure if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() ) - return false; + throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested"); // use index summary to assist reading the reference's BTI blocks const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID); - return ReadBlocks(refSummary, refEntry.Blocks); + ReadBlocks(refSummary, refEntry.Blocks); } -bool BamToolsIndex::Seek(const int64_t& position, const int& origin) { - return ( fseek64(m_indexStream, position, origin) == 0 ); +void BamToolsIndex::Seek(const int64_t& position, const int& origin) { + if ( fseek64(Resources.IndexStream, position, origin) != 0 ) + throw BamException("BamToolsIndex::Seek", "could not seek in BAI file"); } // change the index caching behavior @@ -559,15 +549,15 @@ void BamToolsIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) { // do nothing else here ? cache mode will be ignored from now on, most likely } -bool BamToolsIndex::SkipBlocks(const int& numBlocks) { - return Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR ); +void BamToolsIndex::SkipBlocks(const int& numBlocks) { + Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR ); } int64_t BamToolsIndex::Tell(void) const { - return ftell64(m_indexStream); + return ftell64(Resources.IndexStream); } -bool BamToolsIndex::WriteBlock(const BtiBlock& block) { +void BamToolsIndex::WriteBlock(const BtiBlock& block) { // copy entry data int32_t maxEndPosition = block.MaxEndPosition; @@ -583,59 +573,55 @@ bool BamToolsIndex::WriteBlock(const BtiBlock& block) { // write the reference index entry size_t elementsWritten = 0; - elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream); - elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream); - elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream); - return ( elementsWritten == 3 ); + elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, Resources.IndexStream); + elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, Resources.IndexStream); + elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, Resources.IndexStream); + if ( elementsWritten != 3 ) + throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block"); } -bool BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) { - bool writtenOk = true; +void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) { BtiBlockVector::const_iterator blockIter = blocks.begin(); BtiBlockVector::const_iterator blockEnd = blocks.end(); for ( ; blockIter != blockEnd; ++blockIter ) - writtenOk &= WriteBlock(*blockIter); - return writtenOk; + WriteBlock(*blockIter); } -bool BamToolsIndex::WriteHeader(void) { +void BamToolsIndex::WriteHeader(void) { size_t elementsWritten = 0; // write BTI index format 'magic number' - elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, m_indexStream); + elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, Resources.IndexStream); // write BTI index format version int32_t currentVersion = (int32_t)m_outputVersion; if ( m_isBigEndian ) SwapEndian_32(currentVersion); - elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_indexStream); + elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, Resources.IndexStream); // write block size - int32_t blockSize = m_blockSize; + uint32_t blockSize = m_blockSize; if ( m_isBigEndian ) SwapEndian_32(blockSize); - elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream); + elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, Resources.IndexStream); // write number of references int32_t numReferences = m_indexFileSummary.size(); if ( m_isBigEndian ) SwapEndian_32(numReferences); - elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream); + elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, Resources.IndexStream); - // return success/failure of write - return ( elementsWritten == 7 ); + if ( elementsWritten != 7 ) + throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header"); } -bool BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) { - - size_t elementsWritten = 0; +void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) { // write number of blocks this reference uint32_t numBlocks = refEntry.Blocks.size(); if ( m_isBigEndian ) SwapEndian_32(numBlocks); - elementsWritten += fwrite(&numBlocks, sizeof(numBlocks), 1, m_indexStream); + const size_t elementsWritten = fwrite(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream); + if ( elementsWritten != 1 ) + throw BamException("BamToolsIndex::WriteReferenceEntry", "could not write number of blocks"); // write actual block entries - const bool blocksOk = WriteBlocks(refEntry.Blocks); - - // return success/fail - return ( elementsWritten == 1) && blocksOk; + WriteBlocks(refEntry.Blocks); } diff --git a/src/api/internal/BamToolsIndex_p.h b/src/api/internal/BamToolsIndex_p.h index 269792e..ec846c5 100644 --- a/src/api/internal/BamToolsIndex_p.h +++ b/src/api/internal/BamToolsIndex_p.h @@ -2,7 +2,7 @@ // BamToolsIndex.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 5 April 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the BamTools index format (".bti") // *************************************************************************** @@ -122,60 +122,59 @@ class BamToolsIndex : public BamIndex { // returns format's file extension static const std::string Extension(void); - // internal file ops + // internal methods private: - bool CheckMagicNumber(void); - bool CheckVersion(void); + + // index file ops + void CheckMagicNumber(void); + void CheckVersion(void); void CloseFile(void); bool IsFileOpen(void) const; - bool OpenFile(const std::string& filename, const char* mode); - bool Seek(const int64_t& position, const int& origin); + void OpenFile(const std::string& filename, const char* mode); + void Seek(const int64_t& position, const int& origin); int64_t Tell(void) const; - // internal BTI index building methods - private: + // index-creation methods void ClearReferenceEntry(BtiReferenceEntry& refEntry); - - // internal random-access methods - private: - bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); - - // internal BTI summary data methods - private: + void WriteBlock(const BtiBlock& block); + void WriteBlocks(const BtiBlockVector& blocks); + void WriteHeader(void); + void WriteReferenceEntry(const BtiReferenceEntry& refEntry); + + // random-access methods + void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + void ReadBlock(BtiBlock& block); + void ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks); + void ReadReferenceEntry(BtiReferenceEntry& refEntry); + + // BTI summary data methods void InitializeFileSummary(const int& numReferences); - bool LoadFileSummary(void); - bool LoadHeader(void); - bool LoadNumBlocks(int& numBlocks); - bool LoadNumReferences(int& numReferences); - bool LoadReferenceSummary(BtiReferenceSummary& refSummary); - bool SkipBlocks(const int& numBlocks); - - // internal BTI full index input methods - private: - bool ReadBlock(BtiBlock& block); - bool ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks); - bool ReadReferenceEntry(BtiReferenceEntry& refEntry); - - // internal BTI full index output methods - private: - bool WriteBlock(const BtiBlock& block); - bool WriteBlocks(const BtiBlockVector& blocks); - bool WriteHeader(void); - bool WriteReferenceEntry(const BtiReferenceEntry& refEntry); + void LoadFileSummary(void); + void LoadHeader(void); + void LoadNumBlocks(int& numBlocks); + void LoadNumReferences(int& numReferences); + void LoadReferenceSummary(BtiReferenceSummary& refSummary); + void SkipBlocks(const int& numBlocks); // data members private: - FILE* m_indexStream; bool m_isBigEndian; BamIndex::IndexCacheMode m_cacheMode; BtiFileSummary m_indexFileSummary; - int m_blockSize; + uint32_t m_blockSize; int32_t m_inputVersion; // Version is serialized as int Version m_outputVersion; + struct RaiiWrapper { + FILE* IndexStream; + RaiiWrapper(void); + ~RaiiWrapper(void); + }; + RaiiWrapper Resources; + // static constants private: - static const int DEFAULT_BLOCK_LENGTH; + static const uint32_t DEFAULT_BLOCK_LENGTH; static const std::string BTI_EXTENSION; static const char* const BTI_MAGIC; static const int SIZEOF_BLOCK; diff --git a/src/api/internal/BamWriter_p.cpp b/src/api/internal/BamWriter_p.cpp index e77c099..a8fe370 100644 --- a/src/api/internal/BamWriter_p.cpp +++ b/src/api/internal/BamWriter_p.cpp @@ -2,18 +2,18 @@ // BamWriter_p.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 16 June 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** #include #include +#include #include using namespace BamTools; using namespace BamTools::Internal; -#include #include #include using namespace std; @@ -25,11 +25,11 @@ BamWriterPrivate::BamWriterPrivate(void) // dtor BamWriterPrivate::~BamWriterPrivate(void) { - m_stream.Close(); + Close(); } // calculates minimum bin for a BAM alignment interval -unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { +uint32_t BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { --end; if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); if ( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); @@ -41,14 +41,23 @@ unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) con // closes the alignment archive void BamWriterPrivate::Close(void) { - m_stream.Close(); + + // skip if file not open + if ( !IsOpen() ) return; + + // close output stream + try { + m_stream.Close(); + } catch ( BamException& e ) { + m_errorString = e.what(); + } } // creates a cigar string from the supplied alignment void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { // initialize - const unsigned int numCigarOperations = cigarOperations.size(); + const size_t numCigarOperations = cigarOperations.size(); packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT); // pack the cigar data into the string @@ -60,7 +69,7 @@ void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, for ( ; coIter != coEnd; ++coIter ) { // store op in packedCigar - unsigned int cigarOp; + uint8_t cigarOp; switch ( coIter->Type ) { case (Constants::BAM_CIGAR_MATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MATCH; break; case (Constants::BAM_CIGAR_INS_CHAR) : cigarOp = Constants::BAM_CIGAR_INS; break; @@ -72,8 +81,8 @@ void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break; case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break; default: - fprintf(stderr, "BamWriter ERROR: unknown cigar operation found: %c\n", coIter->Type); - exit(1); + const string message = string("invalid CIGAR operation type") + coIter->Type; + throw BamException("BamWriter::CreatePackedCigar", message); } *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp; @@ -85,26 +94,36 @@ void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { // prepare the encoded query string - const unsigned int queryLen = query.size(); - const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); - encodedQuery.resize(encodedQueryLen); + const size_t queryLength = query.size(); + const size_t encodedQueryLength = static_cast((queryLength+1)/2); + encodedQuery.resize(encodedQueryLength); char* pEncodedQuery = (char*)encodedQuery.data(); const char* pQuery = (const char*)query.data(); + // walk through original query sequence, encoding its bases unsigned char nucleotideCode; bool useHighWord = true; - while ( *pQuery ) { switch ( *pQuery ) { case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break; case (Constants::BAM_DNA_A) : nucleotideCode = Constants::BAM_BASECODE_A; break; case (Constants::BAM_DNA_C) : nucleotideCode = Constants::BAM_BASECODE_C; break; + case (Constants::BAM_DNA_M) : nucleotideCode = Constants::BAM_BASECODE_M; break; case (Constants::BAM_DNA_G) : nucleotideCode = Constants::BAM_BASECODE_G; break; + case (Constants::BAM_DNA_R) : nucleotideCode = Constants::BAM_BASECODE_R; break; + case (Constants::BAM_DNA_S) : nucleotideCode = Constants::BAM_BASECODE_S; break; + case (Constants::BAM_DNA_V) : nucleotideCode = Constants::BAM_BASECODE_V; break; case (Constants::BAM_DNA_T) : nucleotideCode = Constants::BAM_BASECODE_T; break; + case (Constants::BAM_DNA_W) : nucleotideCode = Constants::BAM_BASECODE_W; break; + case (Constants::BAM_DNA_Y) : nucleotideCode = Constants::BAM_BASECODE_Y; break; + case (Constants::BAM_DNA_H) : nucleotideCode = Constants::BAM_BASECODE_H; break; + case (Constants::BAM_DNA_K) : nucleotideCode = Constants::BAM_BASECODE_K; break; + case (Constants::BAM_DNA_D) : nucleotideCode = Constants::BAM_BASECODE_D; break; + case (Constants::BAM_DNA_B) : nucleotideCode = Constants::BAM_BASECODE_B; break; case (Constants::BAM_DNA_N) : nucleotideCode = Constants::BAM_BASECODE_N; break; default: - fprintf(stderr, "BamWriter ERROR: only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); - exit(1); + const string message = string("invalid base: ") + *pQuery; + throw BamException("BamWriter::EncodeQuerySequence", message); } // pack the nucleotide code @@ -122,6 +141,11 @@ void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQ } } +// returns a description of the last error that occurred +std::string BamWriterPrivate::GetErrorString(void) const { + return m_errorString; +} + // returns whether BAM file is open for writing or not bool BamWriterPrivate::IsOpen(void) const { return m_stream.IsOpen; @@ -132,251 +156,265 @@ bool BamWriterPrivate::Open(const string& filename, const string& samHeaderText, const RefVector& referenceSequences) { - // open the BGZF file for writing, return failure if error - if ( !m_stream.Open(filename, "wb") ) - return false; + try { - // write BAM file 'metadata' components - WriteMagicNumber(); - WriteSamHeaderText(samHeaderText); - WriteReferences(referenceSequences); - return true; -} + // open the BGZF file for writing, return failure if error + m_stream.Open(filename, "wb"); -// saves the alignment to the alignment archive -void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { - - // if BamAlignment contains only the core data and a raw char data buffer - // (as a result of BamReader::GetNextAlignmentCore()) - if ( al.SupportData.HasCoreOnly ) { - - // write the block size - unsigned int blockSize = al.SupportData.BlockLength; - if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); - m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); - - // re-calculate bin (in case BamAlignment's position has been previously modified) - const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition()); - - // assign the BAM core data - uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; - buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; - buffer[4] = al.SupportData.QuerySequenceLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( m_isBigEndian ) { - for ( int i = 0; i < 8; ++i ) - BamTools::SwapEndian_32(buffer[i]); - } + // write BAM file 'metadata' components + WriteMagicNumber(); + WriteSamHeaderText(samHeaderText); + WriteReferences(referenceSequences); - // write the BAM core - m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); + // return success + return true; - // write the raw char data - m_stream.Write((char*)al.SupportData.AllCharData.data(), - al.SupportData.BlockLength-Constants::BAM_CORE_SIZE); + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; } +} - // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc - // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) - else { - - // calculate char lengths - const unsigned int nameLength = al.Name.size() + 1; - const unsigned int numCigarOperations = al.CigarData.size(); - const unsigned int queryLength = al.QueryBases.size(); - const unsigned int tagDataLength = al.TagData.size(); - - // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) - // force calculation of Bin before storing - const int endPosition = al.GetEndPosition(); - const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); - - // create our packed cigar string - string packedCigar; - CreatePackedCigar(al.CigarData, packedCigar); - const unsigned int packedCigarLength = packedCigar.size(); - - // encode the query - string encodedQuery; - EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLength = encodedQuery.size(); - - // write the block size - const unsigned int dataBlockSize = nameLength + - packedCigarLength + - encodedQueryLength + - queryLength + - tagDataLength; - unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize; - if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); - m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; - buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; - buffer[4] = queryLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( m_isBigEndian ) { - for ( int i = 0; i < 8; ++i ) - BamTools::SwapEndian_32(buffer[i]); - } - - // write the BAM core - m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); - - // write the query name - m_stream.Write(al.Name.c_str(), nameLength); - - // write the packed cigar - if ( m_isBigEndian ) { - char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); - memcpy(cigarData, packedCigar.data(), packedCigarLength); - if ( m_isBigEndian ) { - for ( unsigned int i = 0; i < packedCigarLength; ++i ) - BamTools::SwapEndian_32p(&cigarData[i]); - } - m_stream.Write(cigarData, packedCigarLength); - free(cigarData); - } - else - m_stream.Write(packedCigar.data(), packedCigarLength); +// saves the alignment to the alignment archive +bool BamWriterPrivate::SaveAlignment(const BamAlignment& al) { - // write the encoded query sequence - m_stream.Write(encodedQuery.data(), encodedQueryLength); + try { - // write the base qualities - char* pBaseQualities = (char*)al.Qualities.data(); - for ( unsigned int i = 0; i < queryLength; ++i ) - pBaseQualities[i] -= 33; // FASTQ conversion - m_stream.Write(pBaseQualities, queryLength); + // if BamAlignment contains only the core data and a raw char data buffer + // (as a result of BamReader::GetNextAlignmentCore()) + if ( al.SupportData.HasCoreOnly ) + WriteCoreAlignment(al); - // write the read group tag - if ( m_isBigEndian ) { + // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc + // (resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code) + else WriteAlignment(al); - char* tagData = (char*)calloc(sizeof(char), tagDataLength); - memcpy(tagData, al.TagData.data(), tagDataLength); + // if we get here, everything OK + return true; - int i = 0; - while ( (unsigned int)i < tagDataLength ) { + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; + } +} - i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) - const char type = tagData[i]; // get tag type at position i - ++i; +void BamWriterPrivate::SetWriteCompressed(bool ok) { + // modifying compression is not allowed if BAM file is open + if ( !IsOpen() ) + m_stream.SetWriteCompressed(ok); +} - switch ( type ) { +void BamWriterPrivate::WriteAlignment(const BamAlignment& al) { + + // calculate char lengths + const unsigned int nameLength = al.Name.size() + 1; + const unsigned int numCigarOperations = al.CigarData.size(); + const unsigned int queryLength = al.QueryBases.size(); + const unsigned int tagDataLength = al.TagData.size(); + + // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) + // force calculation of Bin before storing + const int endPosition = al.GetEndPosition(); + const uint32_t alignmentBin = CalculateMinimumBin(al.Position, endPosition); + + // create our packed cigar string + string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLength = packedCigar.size(); + + // encode the query + string encodedQuery; + EncodeQuerySequence(al.QueryBases, encodedQuery); + const unsigned int encodedQueryLength = encodedQuery.size(); + + // write the block size + const unsigned int dataBlockSize = nameLength + + packedCigarLength + + encodedQueryLength + + queryLength + + tagDataLength; + unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize; + if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( m_isBigEndian ) { + for ( int i = 0; i < 8; ++i ) + BamTools::SwapEndian_32(buffer[i]); + } - case(Constants::BAM_TAG_TYPE_ASCII) : - case(Constants::BAM_TAG_TYPE_INT8) : - case(Constants::BAM_TAG_TYPE_UINT8) : - ++i; - break; - - case(Constants::BAM_TAG_TYPE_INT16) : - case(Constants::BAM_TAG_TYPE_UINT16) : - BamTools::SwapEndian_16p(&tagData[i]); - i += sizeof(uint16_t); - break; - - case(Constants::BAM_TAG_TYPE_FLOAT) : - case(Constants::BAM_TAG_TYPE_INT32) : - case(Constants::BAM_TAG_TYPE_UINT32) : - BamTools::SwapEndian_32p(&tagData[i]); - i += sizeof(uint32_t); - break; - - case(Constants::BAM_TAG_TYPE_HEX) : - case(Constants::BAM_TAG_TYPE_STRING) : - // no endian swapping necessary for hex-string/string data - while ( tagData[i] ) - ++i; - // increment one more for null terminator - ++i; - break; + // write the BAM core + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); - case(Constants::BAM_TAG_TYPE_ARRAY) : + // write the query name + m_stream.Write(al.Name.c_str(), nameLength); - { - // read array type - const char arrayType = tagData[i]; + // write the packed cigar + if ( m_isBigEndian ) { + char* cigarData = new char[packedCigarLength](); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + if ( m_isBigEndian ) { + for ( size_t i = 0; i < packedCigarLength; ++i ) + BamTools::SwapEndian_32p(&cigarData[i]); + } + m_stream.Write(cigarData, packedCigarLength); + delete[] cigarData; // TODO: cleanup on Write exception thrown? + } + else + m_stream.Write(packedCigar.data(), packedCigarLength); + + // write the encoded query sequence + m_stream.Write(encodedQuery.data(), encodedQueryLength); + + // write the base qualities + char* pBaseQualities = (char*)al.Qualities.data(); + for ( size_t i = 0; i < queryLength; ++i ) + pBaseQualities[i] -= 33; // FASTQ conversion + m_stream.Write(pBaseQualities, queryLength); + + // write the read group tag + if ( m_isBigEndian ) { + + char* tagData = new char[tagDataLength](); + memcpy(tagData, al.TagData.data(), tagDataLength); + + size_t i = 0; + while ( i < tagDataLength ) { + + i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) + const char type = tagData[i]; // get tag type at position i + ++i; + + switch ( type ) { + + case(Constants::BAM_TAG_TYPE_ASCII) : + case(Constants::BAM_TAG_TYPE_INT8) : + case(Constants::BAM_TAG_TYPE_UINT8) : + ++i; + break; + + case(Constants::BAM_TAG_TYPE_INT16) : + case(Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + + case(Constants::BAM_TAG_TYPE_FLOAT) : + case(Constants::BAM_TAG_TYPE_INT32) : + case(Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + + case(Constants::BAM_TAG_TYPE_HEX) : + case(Constants::BAM_TAG_TYPE_STRING) : + // no endian swapping necessary for hex-string/string data + while ( tagData[i] ) ++i; - - // swap endian-ness of number of elements in place, then retrieve for loop - BamTools::SwapEndian_32p(&tagData[i]); - int32_t numElements; - memcpy(&numElements, &tagData[i], sizeof(uint32_t)); - i += sizeof(uint32_t); - - // swap endian-ness of array elements - for ( int j = 0; j < numElements; ++j ) { - switch (arrayType) { - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - // no endian-swapping necessary - ++i; - break; - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - BamTools::SwapEndian_16p(&tagData[i]); - i += sizeof(uint16_t); - break; - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - BamTools::SwapEndian_32p(&tagData[i]); - i += sizeof(uint32_t); - break; - default: - // error case - fprintf(stderr, - "BamWriter ERROR: unknown binary array type encountered: [%c]\n", - arrayType); - exit(1); - } + // increment one more for null terminator + ++i; + break; + + case(Constants::BAM_TAG_TYPE_ARRAY) : + + { + // read array type + const char arrayType = tagData[i]; + ++i; + + // swap endian-ness of number of elements in place, then retrieve for loop + BamTools::SwapEndian_32p(&tagData[i]); + int32_t numElements; + memcpy(&numElements, &tagData[i], sizeof(uint32_t)); + i += sizeof(uint32_t); + + // swap endian-ness of array elements + for ( int j = 0; j < numElements; ++j ) { + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + // no endian-swapping necessary + ++i; + break; + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + default: + delete[] tagData; + const string message = string("invalid binary array type: ") + arrayType; + throw BamException("BamWriter::SaveAlignment", message); } - - break; } - default : - fprintf(stderr, "BamWriter ERROR: invalid tag value type\n"); // shouldn't get here - free(tagData); - exit(1); + break; } + + default : + delete[] tagData; + const string message = string("invalid tag type: ") + type; + throw BamException("BamWriter::SaveAlignment", message); } - m_stream.Write(tagData, tagDataLength); - free(tagData); } - else - m_stream.Write(al.TagData.data(), tagDataLength); + + m_stream.Write(tagData, tagDataLength); + delete[] tagData; // TODO: cleanup on Write exception thrown? } + else + m_stream.Write(al.TagData.data(), tagDataLength); } -void BamWriterPrivate::SetWriteCompressed(bool ok) { - - // warn if BAM file is already open - // modifying compression is not allowed in this case - if ( IsOpen() ) { - cerr << "BamWriter WARNING: attempting to change compression mode on an open BAM file is not allowed. " - << "Ignoring request." << endl; - return; +void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) { + + // write the block size + unsigned int blockSize = al.SupportData.BlockLength; + if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); + + // re-calculate bin (in case BamAlignment's position has been previously modified) + const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition()); + + // assign the BAM core data + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; + buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; + buffer[4] = al.SupportData.QuerySequenceLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( m_isBigEndian ) { + for ( int i = 0; i < 8; ++i ) + BamTools::SwapEndian_32(buffer[i]); } - // set BgzfStream compression mode - m_stream.SetWriteCompressed(ok); + // write the BAM core + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); + + // write the raw char data + m_stream.Write((char*)al.SupportData.AllCharData.data(), + al.SupportData.BlockLength-Constants::BAM_CORE_SIZE); } void BamWriterPrivate::WriteMagicNumber(void) { diff --git a/src/api/internal/BamWriter_p.h b/src/api/internal/BamWriter_p.h index caa440e..e3547fe 100644 --- a/src/api/internal/BamWriter_p.h +++ b/src/api/internal/BamWriter_p.h @@ -2,7 +2,7 @@ // BamWriter_p.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 24 February 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** @@ -26,6 +26,9 @@ #include namespace BamTools { + +class BamAlignment; + namespace Internal { class BamWriterPrivate { @@ -38,18 +41,21 @@ class BamWriterPrivate { // interface methods public: void Close(void); + std::string GetErrorString(void) const; bool IsOpen(void) const; bool Open(const std::string& filename, const std::string& samHeaderText, const BamTools::RefVector& referenceSequences); - void SaveAlignment(const BamAlignment& al); + bool SaveAlignment(const BamAlignment& al); void SetWriteCompressed(bool ok); // 'internal' methods public: - unsigned int CalculateMinimumBin(const int begin, int end) const; + uint32_t CalculateMinimumBin(const int begin, int end) const; void CreatePackedCigar(const std::vector& cigarOperations, std::string& packedCigar); void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); + void WriteAlignment(const BamAlignment& al); + void WriteCoreAlignment(const BamAlignment& al); void WriteMagicNumber(void); void WriteReferences(const BamTools::RefVector& referenceSequences); void WriteSamHeaderText(const std::string& samHeaderText); @@ -58,6 +64,7 @@ class BamWriterPrivate { private: BgzfStream m_stream; bool m_isBigEndian; + std::string m_errorString; }; } // namespace Internal diff --git a/src/api/internal/BgzfStream_p.cpp b/src/api/internal/BgzfStream_p.cpp index 36599b5..69592d6 100644 --- a/src/api/internal/BgzfStream_p.cpp +++ b/src/api/internal/BgzfStream_p.cpp @@ -2,84 +2,120 @@ // BgzfStream_p.cpp (c) 2011 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 2 September 2011(DB) +// Last modified: 6 October 2011(DB) // --------------------------------------------------------------------------- // Based on BGZF routines developed at the Broad Institute. // Provides the basic functionality for reading & writing BGZF files // Replaces the old BGZF.* files to avoid clashing with other toolkits // *************************************************************************** +#include #include using namespace BamTools; using namespace BamTools::Internal; #include #include +#include using namespace std; +// ---------------------------- +// RaiiWrapper implementation +// ---------------------------- + +BgzfStream::RaiiWrapper::RaiiWrapper(void) + : Stream(0) +{ + CompressedBlock = new char[Constants::BGZF_MAX_BLOCK_SIZE]; + UncompressedBlock = new char[Constants::BGZF_DEFAULT_BLOCK_SIZE]; +} + +BgzfStream::RaiiWrapper::~RaiiWrapper(void) { + + // clean up buffers + delete[] CompressedBlock; + delete[] UncompressedBlock; + CompressedBlock = 0; + UncompressedBlock = 0; + + if ( Stream ) { + fflush(Stream); + fclose(Stream); + Stream = 0; + } +} + +// --------------------------- +// BgzfStream implementation +// --------------------------- + // constructor BgzfStream::BgzfStream(void) - : UncompressedBlockSize(Constants::BGZF_DEFAULT_BLOCK_SIZE) - , CompressedBlockSize(Constants::BGZF_MAX_BLOCK_SIZE) - , BlockLength(0) + : BlockLength(0) , BlockOffset(0) , BlockAddress(0) , IsOpen(false) , IsWriteOnly(false) , IsWriteCompressed(true) - , Stream(NULL) - , UncompressedBlock(NULL) - , CompressedBlock(NULL) -{ - try { - CompressedBlock = new char[CompressedBlockSize]; - UncompressedBlock = new char[UncompressedBlockSize]; - } catch( std::bad_alloc& ba ) { - fprintf(stderr, "BgzfStream ERROR: unable to allocate memory\n"); - exit(1); - } -} +{ } // destructor BgzfStream::~BgzfStream(void) { - if( CompressedBlock ) delete[] CompressedBlock; - if( UncompressedBlock ) delete[] UncompressedBlock; + Close(); +} + +// checks BGZF block header +bool BgzfStream::CheckBlockHeader(char* header) { + return (header[0] == Constants::GZIP_ID1 && + header[1] == Constants::GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & Constants::FLG_FEXTRA) != 0 && + BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN && + header[12] == Constants::BGZF_ID1 && + header[13] == Constants::BGZF_ID2 && + BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN ); } // closes BGZF file void BgzfStream::Close(void) { // skip if file not open - if ( !IsOpen ) return; + if ( !IsOpen ) + return; // if writing to file, flush the current BGZF block, // then write an empty block (as EOF marker) if ( IsWriteOnly ) { FlushBlock(); - int blockLength = DeflateBlock(); - fwrite(CompressedBlock, 1, blockLength, Stream); + const size_t blockLength = DeflateBlock(); + fwrite(Resources.CompressedBlock, 1, blockLength, Resources.Stream); } // flush and close stream - fflush(Stream); - fclose(Stream); - - // reset flags - IsWriteCompressed = true; + fflush(Resources.Stream); + fclose(Resources.Stream); + Resources.Stream = 0; + + // reset initial state + BlockLength = 0; + BlockOffset = 0; + BlockAddress = 0; IsOpen = false; + IsWriteOnly = false; + IsWriteCompressed = true; } // compresses the current block -int BgzfStream::DeflateBlock(void) { +size_t BgzfStream::DeflateBlock(void) { // initialize the gzip header - char* buffer = CompressedBlock; + char* buffer = Resources.CompressedBlock; memset(buffer, 0, 18); buffer[0] = Constants::GZIP_ID1; - buffer[1] = (char)Constants::GZIP_ID2; + buffer[1] = Constants::GZIP_ID2; buffer[2] = Constants::CM_DEFLATE; buffer[3] = Constants::FLG_FEXTRA; - buffer[9] = (char)Constants::OS_UNKNOWN; + buffer[9] = Constants::OS_UNKNOWN; buffer[10] = Constants::BGZF_XLEN; buffer[12] = Constants::BGZF_ID1; buffer[13] = Constants::BGZF_ID2; @@ -90,8 +126,8 @@ int BgzfStream::DeflateBlock(void) { // loop to retry for blocks that do not compress enough int inputLength = BlockOffset; - int compressedLength = 0; - unsigned int bufferSize = CompressedBlockSize; + size_t compressedLength = 0; + const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE; while ( true ) { @@ -99,82 +135,78 @@ int BgzfStream::DeflateBlock(void) { z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; - zs.next_in = (Bytef*)UncompressedBlock; + zs.next_in = (Bytef*)Resources.UncompressedBlock; zs.avail_in = inputLength; zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH]; - zs.avail_out = bufferSize - Constants::BGZF_BLOCK_HEADER_LENGTH - Constants::BGZF_BLOCK_FOOTER_LENGTH; + zs.avail_out = bufferSize - + Constants::BGZF_BLOCK_HEADER_LENGTH - + Constants::BGZF_BLOCK_FOOTER_LENGTH; // initialize the zlib compression algorithm - if ( deflateInit2(&zs, - compressionLevel, - Z_DEFLATED, - Constants::GZIP_WINDOW_BITS, - Constants::Z_DEFAULT_MEM_LEVEL, - Z_DEFAULT_STRATEGY) != Z_OK ) - { - fprintf(stderr, "BgzfStream ERROR: zlib deflate initialization failed\n"); - exit(1); - } + int status = deflateInit2(&zs, + compressionLevel, + Z_DEFLATED, + Constants::GZIP_WINDOW_BITS, + Constants::Z_DEFAULT_MEM_LEVEL, + Z_DEFAULT_STRATEGY); + if ( status != Z_OK ) + throw BamException("BgzfStream::DeflateBlock", "zlib deflateInit2 failed"); // compress the data - int status = deflate(&zs, Z_FINISH); + status = deflate(&zs, Z_FINISH); + + // if not at stream end if ( status != Z_STREAM_END ) { deflateEnd(&zs); - // reduce the input length and try again - if ( status == Z_OK ) { - inputLength -= 1024; - if ( inputLength < 0 ) { - fprintf(stderr, "BgzfStream ERROR: input reduction failed\n"); - exit(1); - } - continue; - } - - fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n"); - exit(1); - } - - // finalize the compression routine - if ( deflateEnd(&zs) != Z_OK ) { - fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n"); - exit(1); - } + // if error status + if ( status != Z_OK ) + throw BamException("BgzfStream::DeflateBlock", "zlib deflate failed"); - compressedLength = zs.total_out; - compressedLength += Constants::BGZF_BLOCK_HEADER_LENGTH + Constants::BGZF_BLOCK_FOOTER_LENGTH; - if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE ) { - fprintf(stderr, "BgzfStream ERROR: deflate overflow\n"); - exit(1); + // not enough space available in buffer + // try to reduce the input length & re-start loop + inputLength -= 1024; + if ( inputLength <= 0 ) + throw BamException("BgzfStream::DeflateBlock", "input reduction failed"); + continue; } + // finalize the compression routine + status = deflateEnd(&zs); + if ( status != Z_OK ) + throw BamException("BgzfStream::DeflateBlock", "zlib deflateEnd failed"); + + // update compressedLength + compressedLength = zs.total_out + + Constants::BGZF_BLOCK_HEADER_LENGTH + + Constants::BGZF_BLOCK_FOOTER_LENGTH; + if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE ) + throw BamException("BgzfStream::DeflateBlock", "deflate overflow"); + + // quit while loop break; } // store the compressed length - BamTools::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); + BamTools::PackUnsignedShort(&buffer[16], static_cast(compressedLength - 1)); // store the CRC32 checksum - unsigned int crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength); + uint32_t crc = crc32(0, NULL, 0); + crc = crc32(crc, (Bytef*)Resources.UncompressedBlock, inputLength); BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc); BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); // ensure that we have less than a block of data left int remaining = BlockOffset - inputLength; if ( remaining > 0 ) { - if ( remaining > inputLength ) { - fprintf(stderr, "BgzfStream ERROR: after deflate, remainder too large\n"); - exit(1); - } - memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining); + if ( remaining > inputLength ) + throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large"); + memcpy(Resources.UncompressedBlock, Resources.UncompressedBlock + inputLength, remaining); } - // update block data + // update block data & return compressedlength BlockOffset = remaining; - - // return result return compressedLength; } @@ -185,14 +217,15 @@ void BgzfStream::FlushBlock(void) { while ( BlockOffset > 0 ) { // compress the data block - int blockLength = DeflateBlock(); + const size_t blockLength = DeflateBlock(); // flush the data to our output stream - int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream); + const size_t numBytesWritten = fwrite(Resources.CompressedBlock, 1, blockLength, Resources.Stream); if ( numBytesWritten != blockLength ) { - fprintf(stderr, "BgzfStream ERROR: expected to write %u bytes during flushing, but wrote %u bytes\n", - blockLength, numBytesWritten); - exit(1); + stringstream s(""); + s << "expected to write " << blockLength + << " bytes during flushing, but wrote " << numBytesWritten; + throw BamException("BgzfStream::FlushBlock", s.str()); } // update block data @@ -201,34 +234,34 @@ void BgzfStream::FlushBlock(void) { } // decompresses the current block -int BgzfStream::InflateBlock(const int& blockLength) { +size_t BgzfStream::InflateBlock(const size_t& blockLength) { - // inflate the data from compressed buffer into uncompressed buffer + // setup zlib stream object z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; - zs.next_in = (Bytef*)CompressedBlock + 18; + zs.next_in = (Bytef*)Resources.CompressedBlock + 18; zs.avail_in = blockLength - 16; - zs.next_out = (Bytef*)UncompressedBlock; - zs.avail_out = UncompressedBlockSize; + zs.next_out = (Bytef*)Resources.UncompressedBlock; + zs.avail_out = Constants::BGZF_DEFAULT_BLOCK_SIZE; + // initialize int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS); - if ( status != Z_OK ) { - fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateInit() failed\n"); - return -1; - } + if ( status != Z_OK ) + throw BamException("BgzfStream::InflateBlock", "zlib inflateInit failed"); + // decompress status = inflate(&zs, Z_FINISH); if ( status != Z_STREAM_END ) { inflateEnd(&zs); - fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflate() failed\n"); - return -1; + throw BamException("BgzfStream::InflateBlock", "zlib inflate failed"); } + // finalize status = inflateEnd(&zs); if ( status != Z_OK ) { - fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateEnd() failed\n"); - return -1; + inflateEnd(&zs); + throw BamException("BgzfStream::InflateBlock", "zlib inflateEnd failed"); } // return result @@ -236,10 +269,11 @@ int BgzfStream::InflateBlock(const int& blockLength) { } // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing) -bool BgzfStream::Open(const string& filename, const char* mode) { +void BgzfStream::Open(const string& filename, const char* mode) { - // close current stream, if necessary, before opening next - if ( IsOpen ) Close(); + // make sure we're starting with fresh state + if ( IsOpen ) + Close(); // determine open mode if ( strcmp(mode, "rb") == 0 ) @@ -247,42 +281,41 @@ bool BgzfStream::Open(const string& filename, const char* mode) { else if ( strcmp(mode, "wb") == 0) IsWriteOnly = true; else { - fprintf(stderr, "BgzfStream ERROR: unknown file mode: %s\n", mode); - return false; + const string message = string("unknown file mode: ") + mode; + throw BamException("BgzfStream::Open", message); } // open BGZF stream on a file if ( (filename != "stdin") && (filename != "stdout") && (filename != "-")) - Stream = fopen(filename.c_str(), mode); + Resources.Stream = fopen(filename.c_str(), mode); // open BGZF stream on stdin else if ( (filename == "stdin" || filename == "-") && (strcmp(mode, "rb") == 0 ) ) - Stream = freopen(NULL, mode, stdin); + Resources.Stream = freopen(NULL, mode, stdin); // open BGZF stream on stdout else if ( (filename == "stdout" || filename == "-") && (strcmp(mode, "wb") == 0) ) - Stream = freopen(NULL, mode, stdout); + Resources.Stream = freopen(NULL, mode, stdout); - if ( !Stream ) { - fprintf(stderr, "BgzfStream ERROR: unable to open file %s\n", filename.c_str() ); - return false; + // ensure valid Stream + if ( !Resources.Stream ) { + const string message = string("unable to open file: ") + filename; + throw BamException("BgzfStream::Open", message); } - // set flag & return success + // set flag IsOpen = true; - return true; } // reads BGZF data into a byte buffer -int BgzfStream::Read(char* data, const unsigned int dataLength) { +size_t BgzfStream::Read(char* data, const size_t dataLength) { // if stream not open for reading (or empty request) if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0; // read blocks as needed until desired data length is retrieved - char* output = data; - unsigned int numBytesRead = 0; + size_t numBytesRead = 0; while ( numBytesRead < dataLength ) { // determine bytes available in current block @@ -290,110 +323,95 @@ int BgzfStream::Read(char* data, const unsigned int dataLength) { // read (and decompress) next block if needed if ( bytesAvailable <= 0 ) { - if ( !ReadBlock() ) return -1; + ReadBlock(); bytesAvailable = BlockLength - BlockOffset; - if ( bytesAvailable <= 0 ) break; + if ( bytesAvailable <= 0 ) + break; } // copy data from uncompressed source buffer into data destination buffer - char* buffer = UncompressedBlock; - int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); - memcpy(output, buffer + BlockOffset, copyLength); + const size_t copyLength = min( (dataLength-numBytesRead), (size_t)bytesAvailable ); + memcpy(data, Resources.UncompressedBlock + BlockOffset, copyLength); // update counters BlockOffset += copyLength; - output += copyLength; + data += copyLength; numBytesRead += copyLength; } // update block data if ( BlockOffset == BlockLength ) { - BlockAddress = ftell64(Stream); + BlockAddress = ftell64(Resources.Stream); BlockOffset = 0; BlockLength = 0; } + // return actual number of bytes read return numBytesRead; } // reads a BGZF block -bool BgzfStream::ReadBlock(void) { +void BgzfStream::ReadBlock(void) { - char header[Constants::BGZF_BLOCK_HEADER_LENGTH]; - int64_t blockAddress = ftell64(Stream); + // store block start + int64_t blockAddress = ftell64(Resources.Stream); // read block header from file - int count = fread(header, 1, sizeof(header), Stream); + char header[Constants::BGZF_BLOCK_HEADER_LENGTH]; + size_t count = fread(header, 1, Constants::BGZF_BLOCK_HEADER_LENGTH, Resources.Stream); - // if block header empty + // if block header empty, set marker & skip rest of method if ( count == 0 ) { BlockLength = 0; - return true; + return; } // if block header invalid size - if ( count != sizeof(header) ) { - fprintf(stderr, "BgzfStream ERROR: read block failed - could not read block header\n"); - return false; - } + if ( count != sizeof(header) ) + throw BamException("BgzfStream::ReadBlock", "invalid block header size"); // validate block header contents - if ( !BgzfStream::CheckBlockHeader(header) ) { - fprintf(stderr, "BgzfStream ERROR: read block failed - invalid block header\n"); - return false; - } + if ( !BgzfStream::CheckBlockHeader(header) ) + throw BamException("BgzfStream::ReadBlock", "invalid block header contents"); // copy header contents to compressed buffer - int blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1; - char* compressedBlock = CompressedBlock; - memcpy(compressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH); - int remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH; + const size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1; + memcpy(Resources.CompressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH); // read remainder of block - count = fread(&compressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], 1, remaining, Stream); - if ( count != remaining ) { - fprintf(stderr, "BgzfStream ERROR: read block failed - could not read data from block\n"); - return false; - } + const size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH; + count = fread(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], 1, remaining, Resources.Stream); + if ( count != remaining ) + throw BamException("BgzfStream::ReadBlock", "could not read data from block"); // decompress block data count = InflateBlock(blockLength); - if ( count < 0 ) { - fprintf(stderr, "BgzfStream ERROR: read block failed - could not decompress block data\n"); - return false; - } - // update block data + // update block metadata if ( BlockLength != 0 ) BlockOffset = 0; BlockAddress = blockAddress; BlockLength = count; - - // return success - return true; } // seek to position in BGZF file -bool BgzfStream::Seek(const int64_t& position) { - - // skip if not open - if ( !IsOpen ) return false; +void BgzfStream::Seek(const int64_t& position) { // determine adjusted offset & address int blockOffset = (position & 0xFFFF); int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; // attempt seek in file - if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) { - fprintf(stderr, "BgzfStream ERROR: unable to seek in file\n"); - return false; + if ( fseek64(Resources.Stream, blockAddress, SEEK_SET) != 0 ) { + stringstream s(""); + s << "unable to seek to position: " << position; + throw BamException("BgzfStream::Seek", s.str()); } - // update block data & return success + // if successful, update block metadata BlockLength = 0; BlockAddress = blockAddress; BlockOffset = blockOffset; - return true; } void BgzfStream::SetWriteCompressed(bool ok) { @@ -408,31 +426,33 @@ int64_t BgzfStream::Tell(void) const { } // writes the supplied data into the BGZF buffer -unsigned int BgzfStream::Write(const char* data, const unsigned int dataLen) { +size_t BgzfStream::Write(const char* data, const size_t dataLength) { // skip if file not open for writing - if ( !IsOpen || !IsWriteOnly ) return false; + if ( !IsOpen || !IsWriteOnly ) + return false; // write blocks as needed til all data is written - unsigned int numBytesWritten = 0; + size_t numBytesWritten = 0; const char* input = data; - unsigned int blockLength = UncompressedBlockSize; - while ( numBytesWritten < dataLen ) { + const size_t blockLength = Constants::BGZF_DEFAULT_BLOCK_SIZE; + while ( numBytesWritten < dataLength ) { // copy data contents to uncompressed output buffer - unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten); - char* buffer = UncompressedBlock; + const size_t copyLength = min(blockLength - BlockOffset, dataLength - numBytesWritten); + char* buffer = Resources.UncompressedBlock; memcpy(buffer + BlockOffset, input, copyLength); - // update counter + // update counters BlockOffset += copyLength; input += copyLength; numBytesWritten += copyLength; // flush (& compress) output buffer when full - if ( BlockOffset == blockLength ) FlushBlock(); + if ( BlockOffset == blockLength ) + FlushBlock(); } - // return result + // return actual number of bytes written return numBytesWritten; } diff --git a/src/api/internal/BgzfStream_p.h b/src/api/internal/BgzfStream_p.h index 838f30c..7ebd647 100644 --- a/src/api/internal/BgzfStream_p.h +++ b/src/api/internal/BgzfStream_p.h @@ -26,6 +26,7 @@ #include #include "zlib.h" #include +#include #include namespace BamTools { @@ -43,64 +44,53 @@ class BgzfStream { // closes BGZF file void Close(void); // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing) - bool Open(const std::string& filename, const char* mode); + void Open(const std::string& filename, const char* mode); // reads BGZF data into a byte buffer - int Read(char* data, const unsigned int dataLength); + size_t Read(char* data, const size_t dataLength); // seek to position in BGZF file - bool Seek(const int64_t& position); + void Seek(const int64_t& position); // enable/disable compressed output void SetWriteCompressed(bool ok); // get file position in BGZF file int64_t Tell(void) const; // writes the supplied data into the BGZF buffer - unsigned int Write(const char* data, const unsigned int dataLen); + size_t Write(const char* data, const size_t dataLength); // internal methods private: // compresses the current block - int DeflateBlock(void); + size_t DeflateBlock(void); // flushes the data in the BGZF block void FlushBlock(void); // de-compresses the current block - int InflateBlock(const int& blockLength); + size_t InflateBlock(const size_t& blockLength); // reads a BGZF block - bool ReadBlock(void); + void ReadBlock(void); // static 'utility' methods public: // checks BGZF block header - static inline bool CheckBlockHeader(char* header); + static bool CheckBlockHeader(char* header); // data members public: - unsigned int UncompressedBlockSize; - unsigned int CompressedBlockSize; unsigned int BlockLength; unsigned int BlockOffset; - uint64_t BlockAddress; + int64_t BlockAddress; bool IsOpen; bool IsWriteOnly; bool IsWriteCompressed; - FILE* Stream; - char* UncompressedBlock; - char* CompressedBlock; -}; -// ------------------------------------------------------------- -// static 'utility' method implementations + struct RaiiWrapper { + RaiiWrapper(void); + ~RaiiWrapper(void); + char* UncompressedBlock; + char* CompressedBlock; + FILE* Stream; + }; + RaiiWrapper Resources; -// checks BGZF block header -inline -bool BgzfStream::CheckBlockHeader(char* header) { - return (header[0] == Constants::GZIP_ID1 && - header[1] == (char)Constants::GZIP_ID2 && - header[2] == Z_DEFLATED && - (header[3] & Constants::FLG_FEXTRA) != 0 && - BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN && - header[12] == Constants::BGZF_ID1 && - header[13] == Constants::BGZF_ID2 && - BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN ); -} +}; } // namespace Internal } // namespace BamTools diff --git a/src/api/internal/SamFormatParser_p.cpp b/src/api/internal/SamFormatParser_p.cpp index 1431b7a..b92e6fd 100644 --- a/src/api/internal/SamFormatParser_p.cpp +++ b/src/api/internal/SamFormatParser_p.cpp @@ -2,13 +2,14 @@ // SamFormatParser.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 19 April 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for parsing SAM header text into SamHeader object // *************************************************************************** #include #include +#include #include using namespace BamTools; using namespace BamTools::Internal; @@ -43,7 +44,7 @@ void SamFormatParser::Parse(const string& headerText) { void SamFormatParser::ParseSamLine(const string& line) { // skip if line is not long enough to contain true values - if (line.length() < 5 ) return; + if ( line.length() < 5 ) return; // determine token at beginning of line const string firstToken = line.substr(0,3); @@ -53,8 +54,10 @@ void SamFormatParser::ParseSamLine(const string& line) { else if ( firstToken == Constants::SAM_RG_BEGIN_TOKEN) ParseRGLine(restOfLine); else if ( firstToken == Constants::SAM_PG_BEGIN_TOKEN) ParsePGLine(restOfLine); else if ( firstToken == Constants::SAM_CO_BEGIN_TOKEN) ParseCOLine(restOfLine); - else - cerr << "SamFormatParser ERROR: unknown token: " << firstToken << endl; + else { + const string message = string("unknown token: ") + firstToken; + throw BamException("SamFormatParser::ParseSamLine", message); + } } void SamFormatParser::ParseHDLine(const string& line) { @@ -75,13 +78,15 @@ void SamFormatParser::ParseHDLine(const string& line) { if ( tokenTag == Constants::SAM_HD_VERSION_TAG ) m_header.Version = tokenValue; else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG ) m_header.SortOrder = tokenValue; else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue; - else - cerr << "SamFormatParser ERROR: unknown HD tag: " << tokenTag << endl; + else { + const string message = string("unknown HD tag: ") + tokenTag; + throw BamException("SamFormatParser::ParseHDLine", message); + } } - // if @HD line exists, VN must be provided + // check for required tags if ( !m_header.HasVersion() ) - cerr << "SamFormatParser ERROR: @HD line is missing VN tag" << endl; + throw BamException("SamFormatParser::ParseHDLine", "@HD line is missing VN tag"); } void SamFormatParser::ParseSQLine(const string& line) { @@ -107,27 +112,20 @@ void SamFormatParser::ParseSQLine(const string& line) { else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG ) seq.Checksum = tokenValue; else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG ) seq.Species = tokenValue; else if ( tokenTag == Constants::SAM_SQ_URI_TAG ) seq.URI = tokenValue; - else - cerr << "SamFormatParser ERROR: unknown SQ tag: " << tokenTag << endl; + else { + const string message = string("unknown SQ tag: ") + tokenTag; + throw BamException("SamFormatParser::ParseSQLine", message); + } } - bool isMissingRequiredFields = false; - - // if @SQ line exists, SN must be provided - if ( !seq.HasName() ) { - isMissingRequiredFields = true; - cerr << "SamFormatParser ERROR: @SQ line is missing SN tag" << endl; - } - - // if @SQ line exists, LN must be provided - if ( !seq.HasLength() ) { - isMissingRequiredFields = true; - cerr << "SamFormatParser ERROR: @SQ line is missing LN tag" << endl; - } + // check for required tags + if ( !seq.HasName() ) + throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing SN tag"); + if ( !seq.HasLength() ) + throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing LN tag"); // store SAM sequence entry - if ( !isMissingRequiredFields ) - m_header.Sequences.Add(seq); + m_header.Sequences.Add(seq); } void SamFormatParser::ParseRGLine(const string& line) { @@ -159,21 +157,18 @@ void SamFormatParser::ParseRGLine(const string& line) { else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG ) rg.Sample = tokenValue; else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG ) rg.SequencingCenter = tokenValue; else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG ) rg.SequencingTechnology = tokenValue; - else - cerr << "SamFormatParser ERROR: unknown RG tag: " << tokenTag << endl; + else { + const string message = string("unknown RG tag: ") + tokenTag; + throw BamException("SamFormatParser::ParseRGLine", message); + } } - bool isMissingRequiredFields = false; - - // if @RG line exists, ID must be provided - if ( !rg.HasID() ) { - isMissingRequiredFields = true; - cerr << "SamFormatParser ERROR: @RG line is missing ID tag" << endl; - } + // check for required tags + if ( !rg.HasID() ) + throw BamException("SamFormatParser::ParseRGLine", "@RG line is missing ID tag"); // store SAM read group entry - if ( !isMissingRequiredFields ) - m_header.ReadGroups.Add(rg); + m_header.ReadGroups.Add(rg); } void SamFormatParser::ParsePGLine(const string& line) { @@ -198,21 +193,18 @@ void SamFormatParser::ParsePGLine(const string& line) { else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) pg.CommandLine = tokenValue; else if ( tokenTag == Constants::SAM_PG_PREVIOUSPROGRAM_TAG ) pg.PreviousProgramID = tokenValue; else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) pg.Version = tokenValue; - else - cerr << "SamFormatParser ERROR: unknown PG tag: " << tokenTag << endl; + else { + const string message = string("unknown PG tag: ") + tokenTag; + throw BamException("SamFormatParser::ParsePGLine", message); + } } - bool isMissingRequiredFields = false; - - // if @PG line exists, ID must be provided - if ( !pg.HasID() ) { - isMissingRequiredFields = true; - cerr << "SamFormatParser ERROR: @PG line is missing ID tag" << endl; - } + // check for required tags + if ( !pg.HasID() ) + throw BamException("SamFormatParser::ParsePGLine", "@PG line is missing ID tag"); - // store SAM program record - if ( !isMissingRequiredFields ) - m_header.Programs.Add(pg); + // store SAM program entry + m_header.Programs.Add(pg); } void SamFormatParser::ParseCOLine(const string& line) { diff --git a/src/api/internal/SamFormatPrinter_p.cpp b/src/api/internal/SamFormatPrinter_p.cpp index df13fe5..d81db5d 100644 --- a/src/api/internal/SamFormatPrinter_p.cpp +++ b/src/api/internal/SamFormatPrinter_p.cpp @@ -2,7 +2,7 @@ // SamFormatPrinter.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 19 April 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for printing formatted SAM header to string // *************************************************************************** @@ -18,16 +18,25 @@ using namespace BamTools::Internal; #include using namespace std; +// ------------------------ +// static utility methods +// ------------------------ + +static inline +const string FormatTag(const string& tag, const string& value) { + return string(Constants::SAM_TAB + tag + Constants::SAM_COLON + value); +} + +// --------------------------------- +// SamFormatPrinter implementation +// --------------------------------- + SamFormatPrinter::SamFormatPrinter(const SamHeader& header) : m_header(header) { } SamFormatPrinter::~SamFormatPrinter(void) { } -const string SamFormatPrinter::FormatTag(const string &tag, const string &value) const { - return string(Constants::SAM_TAB + tag + Constants::SAM_COLON + value); -} - const string SamFormatPrinter::ToString(void) const { // clear out stream diff --git a/src/api/internal/SamFormatPrinter_p.h b/src/api/internal/SamFormatPrinter_p.h index 8250db6..ea29181 100644 --- a/src/api/internal/SamFormatPrinter_p.h +++ b/src/api/internal/SamFormatPrinter_p.h @@ -2,7 +2,7 @@ // SamFormatPrinter.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for printing formatted SAM header to string // *************************************************************************** @@ -42,7 +42,6 @@ class SamFormatPrinter { // internal methods private: - const std::string FormatTag(const std::string& tag, const std::string& value) const; void PrintHD(std::stringstream& out) const; void PrintSQ(std::stringstream& out) const; void PrintRG(std::stringstream& out) const; diff --git a/src/api/internal/SamHeaderValidator_p.cpp b/src/api/internal/SamHeaderValidator_p.cpp index 936da49..75e2c13 100644 --- a/src/api/internal/SamHeaderValidator_p.cpp +++ b/src/api/internal/SamHeaderValidator_p.cpp @@ -2,7 +2,7 @@ // SamHeaderValidator.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 18 April 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for validating SamHeader data // *************************************************************************** @@ -15,14 +15,15 @@ using namespace BamTools; using namespace BamTools::Internal; #include -#include #include #include using namespace std; -namespace BamTools { -namespace Internal { +// ------------------------ +// static utility methods +// ------------------------- +static bool caseInsensitiveCompare(const string& lhs, const string& rhs) { // can omit checking chars if lengths not equal @@ -41,9 +42,6 @@ bool caseInsensitiveCompare(const string& lhs, const string& rhs) { return true; } -} // namespace Internal -} // namespace BamTools - // ------------------------------------------------------------------------ // Allow validation rules to vary, as needed, between SAM header versions // @@ -76,25 +74,62 @@ SamHeaderValidator::SamHeaderValidator(const SamHeader& header) SamHeaderValidator::~SamHeaderValidator(void) { } -bool SamHeaderValidator::Validate(bool verbose) { +void SamHeaderValidator::AddError(const string& message) { + m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE); +} + +void SamHeaderValidator::AddWarning(const string& message) { + m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE); +} + +void SamHeaderValidator::PrintErrorMessages(ostream& stream) { + + // skip if no error messages + if ( m_errorMessages.empty() ) + return; + + // print error header line + stream << "* SAM header has " << m_errorMessages.size() << " errors:" << endl; + + // print each error message + vector::const_iterator errorIter = m_errorMessages.begin(); + vector::const_iterator errorEnd = m_errorMessages.end(); + for ( ; errorIter != errorEnd; ++errorIter ) + stream << (*errorIter); +} + +void SamHeaderValidator::PrintMessages(ostream& stream) { + PrintErrorMessages(stream); + PrintWarningMessages(stream); +} + +void SamHeaderValidator::PrintWarningMessages(ostream& stream) { + + // skip if no warning messages + if ( m_warningMessages.empty() ) + return; + + // print warning header line + stream << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl; + + // print each warning message + vector::const_iterator warnIter = m_warningMessages.begin(); + vector::const_iterator warnEnd = m_warningMessages.end(); + for ( ; warnIter != warnEnd; ++warnIter ) + stream << (*warnIter); +} - // validate header components +// entry point for validation +bool SamHeaderValidator::Validate(void) { bool isValid = true; isValid &= ValidateMetadata(); isValid &= ValidateSequenceDictionary(); isValid &= ValidateReadGroupDictionary(); isValid &= ValidateProgramChain(); - - // report errors if desired - if ( verbose ) { - PrintErrorMessages(); - PrintWarningMessages(); - } - - // return validation status return isValid; } +// check all SAM header 'metadata' bool SamHeaderValidator::ValidateMetadata(void) { bool isValid = true; isValid &= ValidateVersion(); @@ -103,6 +138,7 @@ bool SamHeaderValidator::ValidateMetadata(void) { return isValid; } +// check SAM header version tag bool SamHeaderValidator::ValidateVersion(void) { const string& version = m_header.Version; @@ -147,6 +183,7 @@ bool SamHeaderValidator::ContainsOnlyDigits(const string& s) { return ( nonDigitPosition == string::npos ) ; } +// validate SAM header sort order tag bool SamHeaderValidator::ValidateSortOrder(void) { const string& sortOrder = m_header.SortOrder; @@ -171,6 +208,7 @@ bool SamHeaderValidator::ValidateSortOrder(void) { return false; } +// validate SAM header group order tag bool SamHeaderValidator::ValidateGroupOrder(void) { const string& groupOrder = m_header.GroupOrder; @@ -193,6 +231,7 @@ bool SamHeaderValidator::ValidateGroupOrder(void) { return false; } +// validate SAM header sequence dictionary bool SamHeaderValidator::ValidateSequenceDictionary(void) { bool isValid = true; @@ -213,6 +252,7 @@ bool SamHeaderValidator::ValidateSequenceDictionary(void) { return isValid; } +// make sure all SQ names are unique bool SamHeaderValidator::ContainsUniqueSequenceNames(void) { bool isValid = true; @@ -244,6 +284,7 @@ bool SamHeaderValidator::ContainsUniqueSequenceNames(void) { return isValid; } +// validate SAM header sequence entry bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) { bool isValid = true; isValid &= CheckNameFormat(seq.Name); @@ -251,6 +292,7 @@ bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) { return isValid; } +// check sequence name is valid format bool SamHeaderValidator::CheckNameFormat(const string& name) { // invalid if name is empty @@ -269,6 +311,7 @@ bool SamHeaderValidator::CheckNameFormat(const string& name) { return true; } +// check that sequence length is within accepted range bool SamHeaderValidator::CheckLengthInRange(const string& length) { // invalid if empty @@ -292,6 +335,7 @@ bool SamHeaderValidator::CheckLengthInRange(const string& length) { return true; } +// validate SAM header read group dictionary bool SamHeaderValidator::ValidateReadGroupDictionary(void) { bool isValid = true; @@ -312,6 +356,7 @@ bool SamHeaderValidator::ValidateReadGroupDictionary(void) { return isValid; } +// make sure RG IDs and platform units are unique bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) { bool isValid = true; @@ -364,6 +409,7 @@ bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) { return isValid; } +// validate SAM header read group entry bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) { bool isValid = true; isValid &= CheckReadGroupID(rg.ID); @@ -371,6 +417,7 @@ bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) { return isValid; } +// make sure RG ID exists bool SamHeaderValidator::CheckReadGroupID(const string& id) { // invalid if empty @@ -383,6 +430,7 @@ bool SamHeaderValidator::CheckReadGroupID(const string& id) { return true; } +// make sure RG sequencing tech is one of the accepted keywords bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) { // if no technology provided, no problem, just return OK @@ -407,6 +455,7 @@ bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) { return false; } +// validate the SAM header "program chain" bool SamHeaderValidator::ValidateProgramChain(void) { bool isValid = true; isValid &= ContainsUniqueProgramIds(); @@ -414,6 +463,7 @@ bool SamHeaderValidator::ValidateProgramChain(void) { return isValid; } +// make sure all PG IDs are unique bool SamHeaderValidator::ContainsUniqueProgramIds(void) { bool isValid = true; @@ -445,6 +495,7 @@ bool SamHeaderValidator::ContainsUniqueProgramIds(void) { return isValid; } +// make sure that any PP tags present point to existing @PG IDs bool SamHeaderValidator::ValidatePreviousProgramIds(void) { bool isValid = true; @@ -471,40 +522,3 @@ bool SamHeaderValidator::ValidatePreviousProgramIds(void) { // return validation state return isValid; } -void SamHeaderValidator::AddError(const string& message) { - m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE); -} - -void SamHeaderValidator::AddWarning(const string& message) { - m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE); -} - -void SamHeaderValidator::PrintErrorMessages(void) { - - // skip if no error messages - if ( m_errorMessages.empty() ) return; - - // print error header line - cerr << "* SAM header has " << m_errorMessages.size() << " errors:" << endl; - - // print each error message - vector::const_iterator errorIter = m_errorMessages.begin(); - vector::const_iterator errorEnd = m_errorMessages.end(); - for ( ; errorIter != errorEnd; ++errorIter ) - cerr << (*errorIter); -} - -void SamHeaderValidator::PrintWarningMessages(void) { - - // skip if no warning messages - if ( m_warningMessages.empty() ) return; - - // print warning header line - cerr << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl; - - // print each warning message - vector::const_iterator warnIter = m_warningMessages.begin(); - vector::const_iterator warnEnd = m_warningMessages.end(); - for ( ; warnIter != warnEnd; ++warnIter ) - cerr << (*warnIter); -} diff --git a/src/api/internal/SamHeaderValidator_p.h b/src/api/internal/SamHeaderValidator_p.h index 1fc1644..7d0c60a 100644 --- a/src/api/internal/SamHeaderValidator_p.h +++ b/src/api/internal/SamHeaderValidator_p.h @@ -2,7 +2,7 @@ // SamHeaderValidator.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 13 January 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for validating SamHeader data // *************************************************************************** @@ -20,6 +20,7 @@ // // We mean it. +#include #include #include @@ -40,9 +41,12 @@ class SamHeaderValidator { // SamHeaderValidator interface public: + + // prints error & warning messages + void PrintMessages(std::ostream& stream); + // validates SamHeader data, returns true/false accordingly - // prints error & warning messages to stderr when @verbose is true - bool Validate(bool verbose = false); + bool Validate(void); // internal methods private: @@ -76,8 +80,8 @@ class SamHeaderValidator { // error reporting void AddError(const std::string& message); void AddWarning(const std::string& message); - void PrintErrorMessages(void); - void PrintWarningMessages(void); + void PrintErrorMessages(std::ostream& stream); + void PrintWarningMessages(std::ostream& stream); // data members private: -- 2.39.2