From: derek Date: Wed, 12 Oct 2011 20:30:59 +0000 (-0400) Subject: Major speedup in SamSequenceDictionary & SamReadGroupDictionary classes X-Git-Url: https://git.donarmstrong.com/?p=bamtools.git;a=commitdiff_plain;h=270452a6f0a98cb1e4260b5501fe780c9b1806c0 Major speedup in SamSequenceDictionary & SamReadGroupDictionary classes * Please note that this does introduce a minor source-incompatibility, only affecting those working directly with the provided Sam*Iterator typedefs. The short answer is that the iterator now references a std::pair instead of the 'plain old' data. Use the pair's "second" field to access the desired SamSequence or SamReadGroup. * Doxygen docs have been updated to reflect this and provide a bit more explanation/examples (in docs folder run 'doxygen Doxyfile' to get the updated API pages). --- diff --git a/CMakeLists.txt b/CMakeLists.txt index a04cf38..7ee7f83 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ ensure_out_of_source_build (" # set BamTools version information set (BamTools_VERSION_MAJOR 2) set (BamTools_VERSION_MINOR 0) -set (BamTools_VERSION_BUILD 0) +set (BamTools_VERSION_BUILD 1) # set our library and executable destination dirs set (EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/bin") diff --git a/docs/Doxyfile b/docs/Doxyfile index 27731c9..fb500ad 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -31,7 +31,7 @@ PROJECT_NAME = BamTools # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 2.0.0 +PROJECT_NUMBER = 2.0.1 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. 
diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index c85b71b..fcd0961 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -49,7 +49,7 @@ set( BamToolsAPISources # create main BamTools API shared library add_library( BamTools SHARED ${BamToolsAPISources} ) -set_target_properties( BamTools PROPERTIES SOVERSION "2.0.0" ) +set_target_properties( BamTools PROPERTIES SOVERSION "2.0.1" ) set_target_properties( BamTools PROPERTIES OUTPUT_NAME "bamtools" ) # create main BamTools API static library diff --git a/src/api/SamReadGroupDictionary.cpp b/src/api/SamReadGroupDictionary.cpp index c501773..7b8fad0 100644 --- a/src/api/SamReadGroupDictionary.cpp +++ b/src/api/SamReadGroupDictionary.cpp @@ -2,7 +2,7 @@ // SamReadGroupDictionary.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 12 October 2011 (DB) // --------------------------------------------------------------------------- // Provides methods for operating on a collection of SamReadGroup entries. // *************************************************************************** @@ -10,7 +10,6 @@ #include "api/SamReadGroupDictionary.h" using namespace BamTools; -#include #include using namespace std; @@ -20,6 +19,32 @@ using namespace std; Provides methods for operating on a collection of SamReadGroup entries. */ +/*! \typedef BamTools::SamReadGroupIterator + \brief mutable iterator for SamReadGroupDictionary data + + \note This iterator, dereferenced, actually points to a + std::pair, NOT a "plain old" SamReadGroup. + To retrieve the read group object: + + \code + SamReadGroupIterator iter; + SamReadGroup& rg = (*iter).second // OR iter->second; + \endcode +*/ + +/*! 
\typedef BamTools::SamReadGroupConstIterator + \brief const iterator for SamReadGroupDictionary data + + \note This iterator, dereferenced, actually points to a + std::pair, NOT a "plain old" SamReadGroup. + To retrieve the read group object: + + \code + SamReadGroupConstIterator iter; + const SamReadGroup& sq = (*iter).second // OR iter->second; + \endcode +*/ + /*! \fn SamReadGroupDictionary::SamReadGroupDictionary(void) \brief constructor */ @@ -46,7 +71,7 @@ SamReadGroupDictionary::~SamReadGroupDictionary(void) { } */ void SamReadGroupDictionary::Add(const SamReadGroup& readGroup) { if ( IsEmpty() || !Contains(readGroup) ) - m_data.push_back(readGroup); + m_data[readGroup.ID] = readGroup; } /*! \fn void SamReadGroupDictionary::Add(const std::string& readGroupId) @@ -73,7 +98,7 @@ void SamReadGroupDictionary::Add(const SamReadGroupDictionary& readGroups) { SamReadGroupConstIterator rgIter = readGroups.ConstBegin(); SamReadGroupConstIterator rgEnd = readGroups.ConstEnd(); for ( ; rgIter != rgEnd; ++rgIter ) - Add(*rgIter); + Add(rgIter->second); } /*! \fn void SamReadGroupDictionary::Add(const std::vector& readGroups) @@ -155,7 +180,7 @@ SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd(void) const { \return \c true if dictionary contains a read group with this ID */ bool SamReadGroupDictionary::Contains(const std::string& readGroupId) const { - return ( IndexOf(readGroupId) != (int)m_data.size() ); + return ( m_data.find(readGroupId) != m_data.end() ); } /*! \fn bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const @@ -167,7 +192,7 @@ bool SamReadGroupDictionary::Contains(const std::string& readGroupId) const { \return \c true if dictionary contains read group (matching on ID). */ bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const { - return Contains( readGroup.ID ); + return Contains(readGroup.ID); } /*! 
\fn SamReadGroupIterator SamReadGroupDictionary::End(void) @@ -189,22 +214,6 @@ SamReadGroupConstIterator SamReadGroupDictionary::End(void) const { return m_data.end(); } -/*! \fn int SamReadGroupDictionary::IndexOf(const std::string& readGroupId) const - \internal - \return index of read group if found. Otherwise, returns vector::size() (invalid index). -*/ -int SamReadGroupDictionary::IndexOf(const std::string& readGroupId) const { - SamReadGroupConstIterator begin = ConstBegin(); - SamReadGroupConstIterator iter = begin; - SamReadGroupConstIterator end = ConstEnd(); - for ( ; iter != end; ++iter ) { - const SamReadGroup& current = (*iter); - if ( current.ID == readGroupId ) - break; - } - return distance( begin, iter ); -} - /*! \fn bool SamReadGroupDictionary::IsEmpty(void) const \brief Returns \c true if dictionary contains no read groups \sa Size() @@ -221,7 +230,7 @@ bool SamReadGroupDictionary::IsEmpty(void) const { \param[in] readGroup read group to remove (matches on ID) */ void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) { - Remove( readGroup.ID ); + Remove(readGroup.ID); } /*! \fn void SamReadGroupDictionary::Remove(const std::string& readGroupId) @@ -231,8 +240,7 @@ void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) { \sa Remove() */ void SamReadGroupDictionary::Remove(const std::string& readGroupId) { - if ( Contains(readGroupId) ) - m_data.erase( m_data.begin() + IndexOf(readGroupId) ); + m_data.erase(readGroupId); } /*! 
\fn void SamReadGroupDictionary::Remove(const std::vector& readGroups) @@ -284,18 +292,7 @@ int SamReadGroupDictionary::Size(void) const { \return a modifiable reference to the SamReadGroup associated with the ID */ SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId) { - - // look up read group ID - int index = IndexOf(readGroupId); - - // if found, return read group at index - if ( index != (int)m_data.size() ) - return m_data[index]; - - // otherwise, append new read group and return reference - else { - SamReadGroup rg(readGroupId); - m_data.push_back(rg); - return m_data.back(); - } + if ( !Contains(readGroupId) ) + m_data[readGroupId] = SamReadGroup(readGroupId); + return m_data[readGroupId]; } diff --git a/src/api/SamReadGroupDictionary.h b/src/api/SamReadGroupDictionary.h index 7a651e1..5aa44ab 100644 --- a/src/api/SamReadGroupDictionary.h +++ b/src/api/SamReadGroupDictionary.h @@ -2,7 +2,7 @@ // SamReadGroupDictionary.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 12 October 2011 (DB) // --------------------------------------------------------------------------- // Provides methods for operating on a collection of SamReadGroup entries. 
// *************************************************************************** @@ -12,12 +12,13 @@ #include "api/api_global.h" #include "api/SamReadGroup.h" +#include #include #include namespace BamTools { -typedef std::vector<SamReadGroup> SamReadGroupContainer; +typedef std::map<std::string, SamReadGroup> SamReadGroupContainer; typedef SamReadGroupContainer::iterator SamReadGroupIterator; typedef SamReadGroupContainer::const_iterator SamReadGroupConstIterator; @@ -73,10 +74,6 @@ class API_EXPORT SamReadGroupDictionary { SamReadGroupConstIterator End(void) const; // returns const_iterator to end() SamReadGroupConstIterator ConstEnd(void) const; // returns const_iterator to end() - // internal methods - private: - int IndexOf(const std::string& readGroupId) const; - // data members private: SamReadGroupContainer m_data; diff --git a/src/api/SamSequenceDictionary.cpp b/src/api/SamSequenceDictionary.cpp index 61eae25..80042d6 100644 --- a/src/api/SamSequenceDictionary.cpp +++ b/src/api/SamSequenceDictionary.cpp @@ -2,12 +2,12 @@ // SamSequenceDictionary.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 12 October 2011 (DB) // --------------------------------------------------------------------------- // Provides methods for operating on a collection of SamSequence entries. // ************************************************************************* -#include +#include "api/SamSequenceDictionary.h" using namespace BamTools; #include @@ -19,6 +19,30 @@ using namespace std; Provides methods for operating on a collection of SamSequence entries. */ +/*! \typedef BamTools::SamSequenceIterator + \brief mutable iterator for SamSequenceDictionary data + + \note This iterator, dereferenced, points to a std::pair<std::string, SamSequence>, NOT + a "plain old" SamSequence. 
To retrieve the sequence: + + \code + SamSequenceIterator iter; + SamSequence& sq = (*iter).second // OR iter->second; + \endcode +*/ + +/*! \typedef BamTools::SamSequenceConstIterator + \brief const iterator for SamSequenceDictionary data + + \note This iterator, dereferenced, points to a std::pair, NOT + a "plain old" SamSequence. To retrieve the sequence: + + \code + SamSequenceConstIterator iter; + const SamSequence& sq = (*iter).second // OR iter->second; + \endcode +*/ + /*! \fn SamSequenceDictionary::SamSequenceDictionary(void) \brief constructor */ @@ -45,7 +69,7 @@ SamSequenceDictionary::~SamSequenceDictionary(void) { } */ void SamSequenceDictionary::Add(const SamSequence& sequence) { if ( IsEmpty() || !Contains(sequence) ) - m_data.push_back(sequence); + m_data[sequence.Name] = sequence; } /*! \fn void SamSequenceDictionary::Add(const std::string& name, const int& length) @@ -73,7 +97,7 @@ void SamSequenceDictionary::Add(const SamSequenceDictionary& sequences) { SamSequenceConstIterator seqIter = sequences.ConstBegin(); SamSequenceConstIterator seqEnd = sequences.ConstEnd(); for ( ; seqIter != seqEnd; ++seqIter ) - Add(*seqIter); + Add(seqIter->second); } /*! \fn void SamSequenceDictionary::Add(const std::vector& sequences) @@ -158,7 +182,7 @@ SamSequenceConstIterator SamSequenceDictionary::ConstEnd(void) const { \return \c true if dictionary contains a sequence with this name */ bool SamSequenceDictionary::Contains(const std::string& sequenceName) const { - return ( IndexOf(sequenceName) != (int)m_data.size() ); + return ( m_data.find(sequenceName) != m_data.end() ); } /*! 
\fn bool SamSequenceDictionary::Contains(const SamSequence& sequence) const @@ -170,7 +194,7 @@ bool SamSequenceDictionary::Contains(const std::string& sequenceName) const { \return \c true if dictionary contains sequence (matching on name) */ bool SamSequenceDictionary::Contains(const SamSequence& sequence) const { - return ( IndexOf(sequence.Name) != (int)m_data.size() ); + return Contains(sequence.Name); } /*! \fn SamSequenceIterator SamSequenceDictionary::End(void) @@ -192,22 +216,6 @@ SamSequenceConstIterator SamSequenceDictionary::End(void) const { return m_data.end(); } -/*! \fn int SamSequenceDictionary::IndexOf(const std::string& name) const - \internal - \return index of sequence if found (matching on name). Otherwise, returns vector::size() (invalid index). -*/ -int SamSequenceDictionary::IndexOf(const std::string& name) const { - SamSequenceConstIterator begin = ConstBegin(); - SamSequenceConstIterator iter = begin; - SamSequenceConstIterator end = ConstEnd(); - for ( ; iter != end; ++iter ) { - const SamSequence& currentSeq = (*iter); - if ( currentSeq.Name == name ) - break; - } - return distance( begin, iter ); -} - /*! \fn bool SamSequenceDictionary::IsEmpty(void) const \brief Returns \c true if dictionary contains no sequences \sa Size() @@ -224,7 +232,7 @@ bool SamSequenceDictionary::IsEmpty(void) const { \param[in] sequence SamSequence to remove (matching on name) */ void SamSequenceDictionary::Remove(const SamSequence& sequence) { - Remove( sequence.Name ); + Remove(sequence.Name); } /*! \fn void SamSequenceDictionary::Remove(const std::string& sequenceName) @@ -234,8 +242,7 @@ void SamSequenceDictionary::Remove(const SamSequence& sequence) { \sa Remove() */ void SamSequenceDictionary::Remove(const std::string& sequenceName) { - if ( Contains(sequenceName) ) - m_data.erase( m_data.begin() + IndexOf(sequenceName) ); + m_data.erase(sequenceName); } /*! 
\fn void SamSequenceDictionary::Remove(const std::vector& sequences) @@ -287,17 +294,7 @@ int SamSequenceDictionary::Size(void) const { \return a modifiable reference to the SamSequence associated with the name */ SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName) { - - // look up sequence ID - int index = IndexOf(sequenceName); - - // if found, return sequence at index - if ( index != (int)m_data.size() ) - return m_data[index]; - - // otherwise, append new sequence and return reference - else { - m_data.push_back( SamSequence(sequenceName, 0) ); - return m_data.back(); - } + if ( !Contains(sequenceName) ) + m_data[sequenceName] = SamSequence(sequenceName, 0); + return m_data[sequenceName]; } diff --git a/src/api/SamSequenceDictionary.h b/src/api/SamSequenceDictionary.h index e8a3600..a583f04 100644 --- a/src/api/SamSequenceDictionary.h +++ b/src/api/SamSequenceDictionary.h @@ -2,7 +2,7 @@ // SamSequenceDictionary.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 +// Last modified: 12 October 2011 // --------------------------------------------------------------------------- // Provides methods for operating on a collection of SamSequence entries. 
// *************************************************************************** @@ -12,13 +12,13 @@ #include "api/api_global.h" #include "api/SamSequence.h" -#include #include +#include #include namespace BamTools { -typedef std::vector<SamSequence> SamSequenceContainer; +typedef std::map<std::string, SamSequence> SamSequenceContainer; typedef SamSequenceContainer::iterator SamSequenceIterator; typedef SamSequenceContainer::const_iterator SamSequenceConstIterator; @@ -74,10 +74,6 @@ class API_EXPORT SamSequenceDictionary { SamSequenceConstIterator End(void) const; // returns const_iterator to end() SamSequenceConstIterator ConstEnd(void) const; // returns const_iterator to end() - // internal methods - private: - int IndexOf(const std::string& name) const; - // data members private: SamSequenceContainer m_data; diff --git a/src/api/internal/SamFormatPrinter_p.cpp b/src/api/internal/SamFormatPrinter_p.cpp index 942a7af..a5b61a0 100644 --- a/src/api/internal/SamFormatPrinter_p.cpp +++ b/src/api/internal/SamFormatPrinter_p.cpp @@ -2,7 +2,7 @@ // SamFormatPrinter.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 12 October 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for printing formatted SAM header to string // *************************************************************************** @@ -81,7 +81,7 @@ void SamFormatPrinter::PrintSQ(std::stringstream& out) const { SamSequenceConstIterator seqIter = m_header.Sequences.ConstBegin(); SamSequenceConstIterator seqEnd = m_header.Sequences.ConstEnd(); for ( ; seqIter != seqEnd; ++seqIter ) { - const SamSequence& seq = (*seqIter); + const SamSequence& seq = seqIter->second; // @SQ SN: LN: out << Constants::SAM_SQ_BEGIN_TOKEN @@ -115,7 +115,7 @@ void SamFormatPrinter::PrintRG(std::stringstream& out) const { 
SamReadGroupConstIterator rgIter = m_header.ReadGroups.ConstBegin(); SamReadGroupConstIterator rgEnd = m_header.ReadGroups.ConstEnd(); for ( ; rgIter != rgEnd; ++rgIter ) { - const SamReadGroup& rg = (*rgIter); + const SamReadGroup& rg = rgIter->second; // @RG ID: out << Constants::SAM_RG_BEGIN_TOKEN diff --git a/src/api/internal/SamHeaderValidator_p.cpp b/src/api/internal/SamHeaderValidator_p.cpp index 094e79a..9d7f6d4 100644 --- a/src/api/internal/SamHeaderValidator_p.cpp +++ b/src/api/internal/SamHeaderValidator_p.cpp @@ -2,7 +2,7 @@ // SamHeaderValidator.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 12 October 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for validating SamHeader data // *************************************************************************** @@ -244,7 +244,7 @@ bool SamHeaderValidator::ValidateSequenceDictionary(void) { SamSequenceConstIterator seqIter = sequences.ConstBegin(); SamSequenceConstIterator seqEnd = sequences.ConstEnd(); for ( ; seqIter != seqEnd; ++seqIter ) { - const SamSequence& seq = (*seqIter); + const SamSequence& seq = seqIter->second; isValid &= ValidateSequence(seq); } @@ -264,7 +264,7 @@ bool SamHeaderValidator::ContainsUniqueSequenceNames(void) { SamSequenceConstIterator seqIter = sequences.ConstBegin(); SamSequenceConstIterator seqEnd = sequences.ConstEnd(); for ( ; seqIter != seqEnd; ++seqIter ) { - const SamSequence& seq = (*seqIter); + const SamSequence& seq = seqIter->second; // lookup sequence name const string& name = seq.Name; @@ -348,7 +348,7 @@ bool SamHeaderValidator::ValidateReadGroupDictionary(void) { SamReadGroupConstIterator rgIter = readGroups.ConstBegin(); SamReadGroupConstIterator rgEnd = readGroups.ConstEnd(); for ( ; rgIter != rgEnd; ++rgIter ) { - const 
SamReadGroup& rg = (*rgIter); + const SamReadGroup& rg = rgIter->second; isValid &= ValidateReadGroup(rg); } @@ -370,7 +370,7 @@ bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) { SamReadGroupConstIterator rgIter = readGroups.ConstBegin(); SamReadGroupConstIterator rgEnd = readGroups.ConstEnd(); for ( ; rgIter != rgEnd; ++rgIter ) { - const SamReadGroup& rg = (*rgIter); + const SamReadGroup& rg = rgIter->second; // -------------------------------- // check for unique ID diff --git a/src/toolkit/bamtools_resolve.cpp b/src/toolkit/bamtools_resolve.cpp index 9a1a3b0..cdf53ee 100644 --- a/src/toolkit/bamtools_resolve.cpp +++ b/src/toolkit/bamtools_resolve.cpp @@ -2,7 +2,7 @@ // bamtools_resolve.cpp (c) 2011 // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 6 July 2011 +// Last modified: 12 October 2011 // --------------------------------------------------------------------------- // Resolves paired-end reads (marking the IsProperPair flag as needed). // *************************************************************************** @@ -1029,7 +1029,7 @@ void ResolveTool::ResolveToolPrivate::ParseHeader(const SamHeader& header) { SamReadGroupConstIterator rgIter = header.ReadGroups.ConstBegin(); SamReadGroupConstIterator rgEnd = header.ReadGroups.ConstEnd(); for ( ; rgIter != rgEnd; ++rgIter ) { - const SamReadGroup& rg = (*rgIter); + const SamReadGroup& rg = rgIter->second; m_readGroups.insert( make_pair(rg.ID, ReadGroupResolver()) ); } }