From 574a2bfb36f7107529e7ccda0f75e70a493460e5 Mon Sep 17 00:00:00 2001 From: Derek Barnett Date: Mon, 14 Jan 2013 20:31:23 -0500 Subject: [PATCH] Added explicit merge order to BamMultiReader --- CMakeLists.txt | 2 +- docs/Doxyfile | 2 +- src/api/BamMultiReader.cpp | 56 +++++++++++++- src/api/BamMultiReader.h | 14 +++- src/api/internal/bam/BamMultiReader_p.cpp | 90 ++++++++++++++++++----- src/api/internal/bam/BamMultiReader_p.h | 10 ++- src/toolkit/CMakeLists.txt | 2 +- 7 files changed, 150 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e3dc8e..3e81b1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ ensure_out_of_source_build( " # set BamTools version information set( BamTools_VERSION_MAJOR 2 ) set( BamTools_VERSION_MINOR 2 ) -set( BamTools_VERSION_BUILD 2 ) +set( BamTools_VERSION_BUILD 3 ) # set our library and executable destination dirs set( EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/bin" ) diff --git a/docs/Doxyfile b/docs/Doxyfile index c2ff078..ff88c61 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -31,7 +31,7 @@ PROJECT_NAME = BamTools # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 2.2.2 +PROJECT_NUMBER = 2.2.3 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/src/api/BamMultiReader.cpp b/src/api/BamMultiReader.cpp index f61aa26..57c826d 100644 --- a/src/api/BamMultiReader.cpp +++ b/src/api/BamMultiReader.cpp @@ -2,7 +2,7 @@ // BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 25 October 2011 (DB) +// Last modified: 14 January 2013 (DB) // --------------------------------------------------------------------------- // Convenience class for reading multiple BAM files. 
// @@ -24,6 +24,18 @@ using namespace std; \brief Convenience class for reading multiple BAM files. */ +/*! \enum BamMultiReader::MergeOrder + \brief Strategies for choosing which BAM file supplies the next alignment. +*/ +/*! \var BamMultiReader::MergeOrder BamMultiReader::MergeByCoordinate + \brief BAM files are merged by alignment position (coordinate). +*/ +/*! \var BamMultiReader::MergeOrder BamMultiReader::MergeByName + \brief BAM files are merged by read name. +*/ + + + /*! \fn BamMultiReader::BamMultiReader(void) \brief constructor */ @@ -130,6 +142,16 @@ std::string BamMultiReader::GetHeaderText(void) const { return d->GetHeaderText(); } +/*! \fn BamMultiReader::MergeOrder BamMultiReader::GetMergeOrder(void) const + \brief Returns current merge order strategy. + + \returns current merge order enum value + \sa BamMultiReader::MergeOrder, SetExplicitMergeOrder() +*/ +BamMultiReader::MergeOrder BamMultiReader::GetMergeOrder(void) const { + return d->GetMergeOrder(); +} + /*! \fn bool BamMultiReader::GetNextAlignment(BamAlignment& alignment) \brief Retrieves next available alignment. 
@@ -141,7 +163,7 @@ std::string BamMultiReader::GetHeaderText(void) const { \param[out] alignment destination for alignment record data \returns \c true if a valid alignment was found - \sa GetNextAlignmentCore(), SetRegion(), BamReader::GetNextAlignment() + \sa GetNextAlignmentCore(), SetExplicitMergeOrder(), SetRegion(), BamReader::GetNextAlignment() */ bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { return d->GetNextAlignment(nextAlignment); @@ -158,7 +180,7 @@ bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { \param[out] alignment destination for alignment record data \returns \c true if a valid alignment was found - \sa GetNextAlignment(), SetRegion(), BamReader::GetNextAlignmentCore() + \sa GetNextAlignment(), SetExplicitMergeOrder(), SetRegion(), BamReader::GetNextAlignmentCore() */ bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { return d->GetNextAlignmentCore(nextAlignment); @@ -321,6 +343,34 @@ bool BamMultiReader::Rewind(void) { return d->Rewind(); } +/*! \fn void BamMultiReader::SetExplicitMergeOrder(BamMultiReader::MergeOrder order) + \brief Sets an explicit merge order, regardless of the BAM files' SO header tag. + + The default behavior of the BamMultiReader is to check the SO tag in the BAM files' + SAM header text to determine the merge strategy. The merge strategy is used to + determine from which BAM file the next alignment should come when either + GetNextAlignment() or GetNextAlignmentCore() is called. If files share a + 'coordinate' or 'queryname' value for this tag, then the merge strategy is + selected accordingly. If any of them do not match, or if any file is marked as + 'unsorted', then the merge strategy is simply a round-robin. + + This method allows client code to explicitly override the lookup behavior. This + method can be useful when you know, for example, that your BAM files are sorted + by coordinate but upstream processes did not set the header tag properly. 
+ + \note This method should \b not be called while reading alignments via + GetNextAlignment() or GetNextAlignmentCore(). For proper results, you should + call this method before (or immediately after) opening files, rewinding, + jumping, etc. but \b not once alignment fetching has started. There is + nothing in the API to prevent you from doing so, but the results may be + unexpected. + + \sa BamMultiReader::MergeOrder, GetMergeOrder(), GetNextAlignment(), GetNextAlignmentCore() +*/ +void BamMultiReader::SetExplicitMergeOrder(BamMultiReader::MergeOrder order) { + d->SetExplicitMergeOrder(order); +} + /*! \fn bool BamMultiReader::SetRegion(const BamRegion& region) \brief Sets a target region of interest diff --git a/src/api/BamMultiReader.h b/src/api/BamMultiReader.h index e5fc9c9..2774562 100644 --- a/src/api/BamMultiReader.h +++ b/src/api/BamMultiReader.h @@ -2,7 +2,7 @@ // BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 25 October 2011 (DB) +// Last modified: 14 January 2013 (DB) // --------------------------------------------------------------------------- // Convenience class for reading multiple BAM files. 
// *************************************************************************** @@ -25,6 +25,14 @@ namespace Internal { class API_EXPORT BamMultiReader { + // enums + public: + // possible merge order strategies + enum MergeOrder { RoundRobinMerge = 0 + , MergeByCoordinate + , MergeByName + }; + // constructor / destructor public: BamMultiReader(void); @@ -43,6 +51,8 @@ class API_EXPORT BamMultiReader { bool CloseFile(const std::string& filename); // returns list of filenames for all open BAM files const std::vector Filenames(void) const; + // returns current merge order strategy + BamMultiReader::MergeOrder GetMergeOrder(void) const; // returns true if multireader has any open BAM files bool HasOpenReaders(void) const; // performs random-access jump within current BAM files @@ -53,6 +63,8 @@ class API_EXPORT BamMultiReader { bool OpenFile(const std::string& filename); // returns file pointers to beginning of alignments bool Rewind(void); + // sets an explicit merge order, regardless of the BAM files' SO header tag + void SetExplicitMergeOrder(BamMultiReader::MergeOrder order); // sets the target region of interest bool SetRegion(const BamRegion& region); // sets the target region of interest diff --git a/src/api/internal/bam/BamMultiReader_p.cpp b/src/api/internal/bam/BamMultiReader_p.cpp index d3f2b15..e20e3e3 100644 --- a/src/api/internal/bam/BamMultiReader_p.cpp +++ b/src/api/internal/bam/BamMultiReader_p.cpp @@ -2,7 +2,7 @@ // BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 25 October 2011 (DB) +// Last modified: 14 January 2013 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // ************************************************************************* @@ -25,6 +25,8 @@ using namespace std; // ctor 
BamMultiReaderPrivate::BamMultiReaderPrivate(void) : m_alignmentCache(0) + , m_hasUserMergeOrder(false) + , m_mergeOrder(BamMultiReader::RoundRobinMerge) { } // dtor @@ -115,11 +117,19 @@ bool BamMultiReaderPrivate::CloseFiles(const vector& filenames) { } } - // make sure alignment cache is cleaned up if all readers closed - if ( m_readers.empty() && m_alignmentCache ) { - m_alignmentCache->Clear(); - delete m_alignmentCache; - m_alignmentCache = 0; + // make sure we clean up properly if all readers were closed + if ( m_readers.empty() ) { + + // clean up merger + if ( m_alignmentCache ) { + m_alignmentCache->Clear(); + delete m_alignmentCache; + m_alignmentCache = 0; + } + + // reset merge flags + m_hasUserMergeOrder = false; + m_mergeOrder = BamMultiReader::RoundRobinMerge; } // return whether all readers closed OK @@ -161,21 +171,46 @@ bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) { return true; } -IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) const { +IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) { + + // if no merge order set explicitly, use SAM header to lookup proper order + if ( !m_hasUserMergeOrder ) { + + // fetch SamHeader from BAM files + SamHeader header = GetHeader(); + + // if BAM files are sorted by position + if ( header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ) + m_mergeOrder = BamMultiReader::MergeByCoordinate; + + // if BAM files are sorted by read name + else if ( header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ) + m_mergeOrder = BamMultiReader::MergeByName; + + // otherwise, sorting is either "unknown" or marked as "unsorted" + else + m_mergeOrder = BamMultiReader::RoundRobinMerge; + } + + // use current merge order to create proper 'multi-merger' + switch ( m_mergeOrder ) { - // fetch SamHeader - SamHeader header = GetHeader(); + // merge BAM files by position + case BamMultiReader::MergeByCoordinate : + return new MultiMerger(); - // if BAM files are sorted by 
position - if ( header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ) - return new MultiMerger(); + // merge BAM files by read name + case BamMultiReader::MergeByName : + return new MultiMerger(); - // if BAM files are sorted by read name - if ( header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ) - return new MultiMerger(); + // sorting is "unknown", "unsorted" or "ignored"... so use unsorted merger + case BamMultiReader::RoundRobinMerge : + return new MultiMerger(); - // otherwise "unknown" or "unsorted", use unsorted merger and just read in - return new MultiMerger(); + // unknown merge order, can't create merger + default: + return 0; + } } const vector BamMultiReaderPrivate::Filenames(void) const { @@ -248,6 +283,10 @@ string BamMultiReaderPrivate::GetHeaderText(void) const { return mergedHeader.ToString(); } +BamMultiReader::MergeOrder BamMultiReaderPrivate::GetMergeOrder(void) const { + return m_mergeOrder; +} + // get next alignment among all files bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) { return PopNextCachedAlignment(al, true); @@ -622,6 +661,23 @@ void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* a m_alignmentCache->Add( MergeItem(reader, alignment) ); } +void BamMultiReaderPrivate::SetExplicitMergeOrder(BamMultiReader::MergeOrder order) { + + // set new merge flags + m_hasUserMergeOrder = true; + m_mergeOrder = order; + + // remove any existing merger + if ( m_alignmentCache ) { + m_alignmentCache->Clear(); + delete m_alignmentCache; + m_alignmentCache = 0; + } + + // update cache with new strategy + UpdateAlignmentCache(); +} + void BamMultiReaderPrivate::SetErrorString(const string& where, const string& what) const { static const string SEPARATOR = ": "; m_errorString = where + SEPARATOR + what; diff --git a/src/api/internal/bam/BamMultiReader_p.h b/src/api/internal/bam/BamMultiReader_p.h index 9d7c39a..c84a8cf 100644 --- a/src/api/internal/bam/BamMultiReader_p.h +++ 
b/src/api/internal/bam/BamMultiReader_p.h @@ -2,7 +2,7 @@ // BamMultiReader_p.h (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 25 October 2011 (DB) +// Last modified: 14 January 2013 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // ************************************************************************* @@ -54,9 +54,11 @@ class BamMultiReaderPrivate { bool SetRegion(const BamRegion& region); // access alignment data + BamMultiReader::MergeOrder GetMergeOrder(void) const; bool GetNextAlignment(BamAlignment& al); bool GetNextAlignmentCore(BamAlignment& al); bool HasOpenReaders(void); + void SetExplicitMergeOrder(BamMultiReader::MergeOrder order); // access auxiliary data SamHeader GetHeader(void) const; @@ -78,7 +80,7 @@ class BamMultiReaderPrivate { public: bool CloseFiles(const std::vector& filenames); - IMultiMerger* CreateAlignmentCache(void) const; + IMultiMerger* CreateAlignmentCache(void); bool PopNextCachedAlignment(BamAlignment& al, const bool needCharData); bool RewindReaders(void); void SaveNextAlignment(BamReader* reader, BamAlignment* alignment); @@ -90,6 +92,10 @@ class BamMultiReaderPrivate { public: std::vector m_readers; IMultiMerger* m_alignmentCache; + + bool m_hasUserMergeOrder; + BamMultiReader::MergeOrder m_mergeOrder; + mutable std::string m_errorString; }; diff --git a/src/toolkit/CMakeLists.txt b/src/toolkit/CMakeLists.txt index ba5b640..1be9efb 100644 --- a/src/toolkit/CMakeLists.txt +++ b/src/toolkit/CMakeLists.txt @@ -31,7 +31,7 @@ add_executable( bamtools_cmd # set BamTools application properties set_target_properties( bamtools_cmd PROPERTIES - VERSION 2.2.2 + VERSION 2.2.3 OUTPUT_NAME "bamtools" ) # make version info available in application -- 2.39.2