#ifndef BAMALGORITHMS_H
#define BAMALGORITHMS_H
-#include <api/api_global.h>
-#include <api/BamAlignment.h>
-#include <api/BamReader.h>
-#include <api/BamMultiReader.h>
-#include <algorithm>
-#include <functional>
-#include <string>
-#include <vector>
+#include <api/algorithms/Sort.h>
-namespace BamTools {
-namespace Algorithms {
-
-// -------------------------------------------------------
-// Built-in function objects for comparing BamAlignments
-
-typedef std::binary_function<BamAlignment, BamAlignment, bool> BamAlignmentComparer;
-
-// Algorithms::SortByName<Compare>
-// compare alignments by name (default comparison is std::less<std::string> )
-template<template <typename> class Compare = std::less>
-struct API_EXPORT SortByName : public BamAlignmentComparer {
- bool operator()(const BamTools::BamAlignment& lhs,
- const BamTools::BamAlignment& rhs)
- {
- Compare<std::string> comp;
- return comp(lhs.Name, rhs.Name);
- }
-};
-
-// Algorithms::SortByPosition<Compare>
-// compare alignments by position (default comparison is std::less<int>)
-template<template <typename> class Compare = std::less>
-struct API_EXPORT SortByPosition : public BamAlignmentComparer {
- bool operator()(const BamTools::BamAlignment& lhs,
- const BamTools::BamAlignment& rhs)
- {
- // force unmapped aligmnents to end
- if ( lhs.RefID == -1 ) return false;
- if ( rhs.RefID == -1 ) return true;
-
- // otherwise compare first on RefID, then position
- Compare<int32_t> comp;
- if ( lhs.RefID == rhs.RefID )
- return comp(lhs.Position, rhs.Position);
- return comp(lhs.RefID, rhs.RefID);
- }
-};
-
-// Algorithms::SortByTag<Compare>("XY")
-// compare alignments by tag value (default comparison is std::less<T>)
-// where T is the expected tag type (e.g. RG -> string, NM -> int, etc.)
-template<typename T, template <typename> class Compare = std::less>
-struct API_EXPORT SortByTag : public BamAlignmentComparer {
-
- // ctor - needed to provide the tag name ("RG", "NM", "Aq", etc)
- explicit SortByTag(const std::string& tag) : m_tag(tag) { }
-
- bool operator()(const BamTools::BamAlignment& lhs,
- const BamTools::BamAlignment& rhs)
- {
- // force alignments without tag to end
- T lhsTagValue;
- T rhsTagValue;
- if ( !lhs.GetTag(m_tag, lhsTagValue) ) return false;
- if ( !rhs.GetTag(m_tag, rhsTagValue) ) return true;
-
- // otherwise compare tag values
- Compare<T> comp;
- return comp(lhsTagValue, rhsTagValue);
- }
-
- private:
- std::string m_tag;
-};
-
-// Algorithms::Unsorted
-// placeholder comparison object, ignores the alignments' data
-// N.B. - returning false typically retains initial insertion order
-struct API_EXPORT Unsorted : public BamAlignmentComparer {
- bool operator()(const BamTools::BamAlignment& /*lhs*/,
- const BamTools::BamAlignment& /*rhs*/)
- {
- return false;
- }
-};
-
-API_EXPORT template<typename Compare>
-std::vector<BamAlignment> SortReaderRegion(BamReader& reader,
- const BamRegion& region,
- const Compare& comp = Compare())
-{
- // return empty container if unable to find region
- if ( !reader.IsOpen() ) return std::vector<BamAlignment>();
- if ( !reader.SetRegion(region) ) return std::vector<BamAlignment>();
-
- // iterate through region, grabbing alignments
- BamAlignment al;
- std::vector<BamAlignment> results;
- while ( reader.GetNextAlignment(al) )
- results.push_back(al);
-
- // sort & return alignments
- std::sort(results.begin(), results.end(), comp);
- return results;
-}
-
-API_EXPORT template<typename Compare>
-std::vector<BamAlignment> SortReaderRegion(BamMultiReader& reader,
- const BamRegion& region,
- const Compare& comp = Compare())
-{
- // return empty container if unable to find region
- if ( !reader.HasOpenReaders() ) return std::vector<BamAlignment>();
- if ( !reader.SetRegion(region) ) return std::vector<BamAlignment>();
-
- // iterate through region, grabbing alignments
- BamAlignment al;
- std::vector<BamAlignment> results;
- while ( reader.GetNextAlignment(al) )
- results.push_back(al);
-
- // sort & return alignments
- std::sort(results.begin(), results.end(), comp);
- return results;
-}
-
-} // namespace Algorithms
-} // namespace BamTools
#endif // BAM_ALGORITHMS_H
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 15 March 2011 (DB)
+// Last modified: 1 October 2011 (DB)
// ---------------------------------------------------------------------------
// Convenience class for reading multiple BAM files.
//
// This functionality allows applications to work on very large sets of files
// without requiring intermediate merge, sort, and index steps for each file
-// subset. It also improves the performance of our merge system as it
+// subset. It also improves the performance of our merge system as it
// precludes the need to sort merged files.
// ***************************************************************************
return d->HasOpenReaders();
}
-/*! \fn bool BamMultiReader::IsIndexLoaded(void) const
- \brief Returns \c true if all BAM files have index data available.
-
- \deprecated Instead use HasIndexes()
- \cond
- See explanation in BamReader.cpp for more details on the deprecation decision.
- \endcond
-*/
-
-bool BamMultiReader::IsIndexLoaded(void) const {
- return d->HasIndexes();
-}
-
/*! \fn bool BamMultiReader::Jump(int refID, int position)
\brief Performs a random-access jump within current BAM files.
return d->OpenIndexes(indexFilenames);
}
-/*! \fn void BamMultiReader::PrintFilenames(void) const
- \brief Convenience method for printing filenames to stdout.
- \deprecated Doesn't really belong as an API function. Clients should
- determine how the data is reported.
- \sa Filenames(), BamReader::GetFilename()
-*/
-void BamMultiReader::PrintFilenames(void) const {
- d->PrintFilenames();
-}
-
/*! \fn bool BamMultiReader::Rewind(void)
\brief Returns the internal file pointers to the beginning of alignment records.
BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition);
return d->SetRegion(region);
}
-
-/*! \fn void BamMultiReader::SetSortOrder(const SortOrder& order)
- \brief Sets the expected sorting order for reading across multiple BAM files.
-
- Default is BamMultiReader::SortedByPosition.
-
- The SortOrder determines how the reader determines which alignment is "next"
- from among its open readers.
-
- \param order expected sort order
-*/
-void BamMultiReader::SetSortOrder(const SortOrder& order) {
- d->SetSortOrder(order);
-}
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 15 March 2011 (DB)
+// Last modified: 1 October 2011 (DB)
// ---------------------------------------------------------------------------
// Convenience class for reading multiple BAM files.
// ***************************************************************************
class API_EXPORT BamMultiReader {
- public:
- enum SortOrder { SortedByPosition = 0
- , SortedByReadName
- , Unsorted
- };
-
// constructor / destructor
public:
BamMultiReader(void);
// retrieves next available alignment
bool GetNextAlignment(BamAlignment& alignment);
- // retrieves next available alignmnet (without populating the alignment's string data fields)
+ // retrieves next available alignment (without populating the alignment's string data fields)
bool GetNextAlignmentCore(BamAlignment& alignment);
- // sets the expected sorting order for reading across multiple BAM files
- void SetSortOrder(const SortOrder& order);
-
// ----------------------
// access auxiliary data
// ----------------------
// changes the caching behavior of the index data
void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode);
- // deprecated methods
- public:
- // returns \c true if all BAM files have index data available.
- bool IsIndexLoaded(void) const;
- // convenience method for printing filenames to stdout
- void PrintFilenames(void) const;
-
// private implementation
private:
Internal::BamMultiReaderPrivate* d;
include(../ExportHeader.cmake)
set(ApiIncludeDir "api")
ExportHeader(APIHeaders api_global.h ${ApiIncludeDir})
-ExportHeader(APIHeaders BamAlgorithm.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamAlgorithms.h ${ApiIncludeDir})
ExportHeader(APIHeaders BamAlignment.h ${ApiIncludeDir})
ExportHeader(APIHeaders BamAux.h ${ApiIncludeDir})
ExportHeader(APIHeaders BamConstants.h ${ApiIncludeDir})
ExportHeader(APIHeaders SamReadGroupDictionary.h ${ApiIncludeDir})
ExportHeader(APIHeaders SamSequence.h ${ApiIncludeDir})
ExportHeader(APIHeaders SamSequenceDictionary.h ${ApiIncludeDir})
+
+set(AlgorithmsIncludeDir "api/algorithms")
+ExportHeader(AlgorithmsHeaders algorithms/Sort.h ${AlgorithmsIncludeDir})
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 18 April 2011 (DB)
+// Last modified: 1 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides methods for operating on a collection of SamReadGroup entries.
// ***************************************************************************
Add( SamReadGroup(readGroupId) );
}
+void SamReadGroupDictionary::Add(const SamReadGroupDictionary& readGroups) {
+ SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
+ SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
+ for ( ; rgIter != rgEnd; ++rgIter )
+ Add(*rgIter);
+}
+
/*! \fn void SamReadGroupDictionary::Add(const std::vector<SamReadGroup>& readGroups)
\brief Adds multiple read groups to the dictionary.
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 18 April 2011 (DB)
+// Last modified: 1 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides methods for operating on a collection of SamReadGroup entries.
// ***************************************************************************
void Add(const std::string& readGroupId);
// adds multiple read groups
+ void Add(const SamReadGroupDictionary& readGroups);
void Add(const std::vector<SamReadGroup>& readGroups);
void Add(const std::vector<std::string>& readGroupIds);
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 18 April 2011 (DB)
+// Last modified: 1 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides methods for operating on a collection of SamSequence entries.
// *************************************************************************
Add( SamSequence(name, length) );
}
+void SamSequenceDictionary::Add(const SamSequenceDictionary& sequences) {
+ SamSequenceConstIterator seqIter = sequences.ConstBegin();
+ SamSequenceConstIterator seqEnd = sequences.ConstEnd();
+ for ( ; seqIter != seqEnd; ++seqIter )
+ Add(*seqIter);
+}
+
/*! \fn void SamSequenceDictionary::Add(const std::vector<SamSequence>& sequences)
\brief Adds multiple sequences to the dictionary.
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 18 April 2011
+// Last modified: 1 October 2011
// ---------------------------------------------------------------------------
// Provides methods for operating on a collection of SamSequence entries.
// ***************************************************************************
void Add(const std::string& name, const int& length);
// adds multiple sequences
+ void Add(const SamSequenceDictionary& sequences);
void Add(const std::vector<SamSequence>& sequences);
void Add(const std::map<std::string, int>& sequenceMap);
--- /dev/null
+// ***************************************************************************
+// Sort.h (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 29 September 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides sorting functionality.
+// ***************************************************************************
+
+#ifndef ALGORITHMS_SORT_H
+#define ALGORITHMS_SORT_H
+
+#include <api/api_global.h>
+#include <api/BamAlignment.h>
+#include <api/BamReader.h>
+#include <api/BamMultiReader.h>
+#include <algorithm>
+#include <cassert>
+#include <functional>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+namespace Algorithms {
+
+struct API_EXPORT Sort {
+
+ enum Order { AscendingOrder = 0
+ , DescendingOrder
+ };
+
+ template<typename ElemType>
+ static inline bool sort_helper(const Sort::Order& order,
+ const ElemType& lhs,
+ const ElemType& rhs)
+ {
+ switch ( order ) {
+ case ( Sort::AscendingOrder ) : { std::less<ElemType> comp; return comp(lhs, rhs); }
+ case ( Sort::DescendingOrder ) : { std::greater<ElemType> comp; return comp(lhs, rhs); }
+ default : assert(false);
+ }
+ return false; // <-- unreachable
+ }
+
+ typedef std::binary_function<BamAlignment, BamAlignment, bool> AlignmentSortBase;
+
+ // compare alignments by name
+ struct ByName : public AlignmentSortBase {
+
+ ByName(const Sort::Order& order = Sort::AscendingOrder)
+ : m_order(order)
+ { }
+
+ bool operator()(const BamTools::BamAlignment& lhs,
+ const BamTools::BamAlignment& rhs)
+ {
+ return sort_helper(m_order, lhs.Name, rhs.Name);
+ }
+
+ static inline bool UsesCharData(void) { return true; }
+
+ private:
+ const Sort::Order& m_order;
+ };
+
+ // compare alignments by position
+ struct ByPosition : public AlignmentSortBase {
+
+ ByPosition(const Sort::Order& order = Sort::AscendingOrder)
+ : m_order(order)
+ { }
+
+ bool operator()(const BamTools::BamAlignment& lhs,
+ const BamTools::BamAlignment& rhs)
+ {
+ // force unmapped aligmnents to end
+ if ( lhs.RefID == -1 ) return false;
+ if ( rhs.RefID == -1 ) return true;
+
+ // if on same reference, sort on position
+ if ( lhs.RefID == rhs.RefID )
+ return sort_helper(m_order, lhs.Position, rhs.Position);
+
+ // otherwise sort on reference ID
+ return sort_helper(m_order, lhs.RefID, rhs.RefID);
+ }
+
+ static inline bool UsesCharData(void) { return false; }
+
+ private:
+ Sort::Order m_order;
+ };
+
+ // compare alignments by tag value
+ template<typename T>
+ struct ByTag : public AlignmentSortBase {
+
+ ByTag(const std::string& tag,
+ const Sort::Order& order = Sort::AscendingOrder)
+ : m_tag(tag)
+ , m_order(order)
+ { }
+
+ bool operator()(const BamTools::BamAlignment& lhs,
+ const BamTools::BamAlignment& rhs)
+ {
+ // force alignments without tag to end
+ T lhsTagValue;
+ T rhsTagValue;
+ if ( !lhs.GetTag(m_tag, lhsTagValue) ) return false;
+ if ( !rhs.GetTag(m_tag, rhsTagValue) ) return true;
+
+ // otherwise compare on tag values
+ return sort_helper(m_order, lhsTagValue, rhsTagValue);
+ }
+
+ static inline bool UsesCharData(void) { return true; }
+
+ private:
+ std::string m_tag;
+ Sort::Order m_order;
+ };
+
+ // essentially a placeholder comparison object, ignores the alignments' data
+ // N.B. - returning false tends to retain insertion order
+ struct Unsorted : public AlignmentSortBase {
+ inline bool operator()(const BamTools::BamAlignment& /*lhs*/,
+ const BamTools::BamAlignment& /*rhs*/)
+ {
+ return false;
+ }
+
+ static inline bool UsesCharData(void) { return false; }
+ };
+};
+
+// in-place sorting of alignment vector
+template<typename Compare>
+inline
+void SortAlignments(std::vector<BamAlignment>& data,
+ const Compare& comp = Compare())
+{
+ std::sort(data.begin(), data.end(), comp);
+}
+
+// returns sorted copy of input alignment vector
+template<typename Compare>
+inline
+std::vector<BamAlignment> SortAlignmentsCopy(const std::vector<BamAlignment>& input,
+ const Compare& comp = Compare())
+{
+ std::vector<BamAlignment> output(input);
+ SortAlignments(output, comp);
+ return output;
+}
+
+// pulls a region from a position-sorted BAM file
+// returns the alignments sorted by user-defined Compare type
+template<typename Compare>
+std::vector<BamAlignment> GetSortedRegion(BamReader& reader,
+ const BamRegion& region,
+ const Compare& comp = Compare())
+{
+ // return empty container if unable to find region
+ if ( !reader.IsOpen() ) return std::vector<BamAlignment>();
+ if ( !reader.SetRegion(region) ) return std::vector<BamAlignment>();
+
+ // iterate through region, grabbing alignments
+ BamAlignment al;
+ std::vector<BamAlignment> results;
+ while ( reader.GetNextAlignmentCore(al) )
+ results.push_back(al);
+
+ // sort & return alignments
+ SortAlignments(results, comp);
+ return results;
+}
+
+template<typename Compare>
+std::vector<BamAlignment> GetSortedRegion(BamMultiReader& reader,
+ const BamRegion& region,
+ const Compare& comp = Compare())
+{
+ // return empty container if unable to find region
+ if ( !reader.HasOpenReaders() ) return std::vector<BamAlignment>();
+ if ( !reader.SetRegion(region) ) return std::vector<BamAlignment>();
+
+ // iterate through region, grabbing alignments
+ BamAlignment al;
+ std::vector<BamAlignment> results;
+ while ( reader.GetNextAlignmentCore(al) )
+ results.push_back(al);
+
+ // sort & return alignments
+ SortAlignments(results, comp);
+ return results;
+}
+
+} // namespace Algorithms
+} // namespace BamTools
+
+#endif // ALGORITHMS_SORT_H
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 18 March 2011 (DB)
// ---------------------------------------------------------------------------
// Provides merging functionality for BamMultiReader. At this point, supports
// sorting results by (refId, position) or by read name.
#include <api/BamAlignment.h>
#include <api/BamReader.h>
-#include <map>
-#include <queue>
+#include <api/algorithms/Sort.h>
+#include <deque>
+#include <functional>
+#include <set>
#include <string>
-#include <utility>
namespace BamTools {
namespace Internal {
-typedef std::pair<BamReader*, BamAlignment*> ReaderAlignment;
+struct MergeItem {
-// generic MultiMerger interface
-class IBamMultiMerger {
+ // data members
+ BamReader* Reader;
+ BamAlignment* Alignment;
- public:
- IBamMultiMerger(void) { }
- virtual ~IBamMultiMerger(void) { }
+ // ctors & dtor
+ MergeItem(BamReader* reader = 0,
+ BamAlignment* alignment = 0)
+ : Reader(reader)
+ , Alignment(alignment)
+ { }
- public:
- virtual void Add(const ReaderAlignment& value) =0;
- virtual void Clear(void) =0;
- virtual const ReaderAlignment& First(void) const =0;
- virtual bool IsEmpty(void) const =0;
- virtual void Remove(BamReader* reader) =0;
- virtual int Size(void) const =0;
- virtual ReaderAlignment TakeFirst(void) =0;
-};
+ MergeItem(const MergeItem& other)
+ : Reader(other.Reader)
+ , Alignment(other.Alignment)
+ { }
-// IBamMultiMerger implementation - sorted on BamAlignment: (RefId, Position)
-class PositionMultiMerger : public IBamMultiMerger {
+ ~MergeItem(void) { }
+};
- public:
- PositionMultiMerger(void) : IBamMultiMerger() { }
- ~PositionMultiMerger(void) { }
+template<typename Compare>
+struct MergeItemSorter : public std::binary_function<MergeItem, MergeItem, bool> {
public:
- void Add(const ReaderAlignment& value);
- void Clear(void);
- const ReaderAlignment& First(void) const;
- bool IsEmpty(void) const;
- void Remove(BamReader* reader);
- int Size(void) const;
- ReaderAlignment TakeFirst(void);
+ MergeItemSorter(const Compare& comp = Compare())
+ : m_comp(comp)
+ { }
+
+ bool operator()(const MergeItem& lhs, const MergeItem& rhs) {
+ const BamAlignment& l = *lhs.Alignment;
+ const BamAlignment& r = *rhs.Alignment;
+ return m_comp(l,r);
+ }
private:
- typedef std::pair<int, int> KeyType;
- typedef ReaderAlignment ValueType;
- typedef std::pair<KeyType, ValueType> ElementType;
-
- typedef std::multimap<KeyType, ValueType> ContainerType;
- typedef ContainerType::iterator DataIterator;
- typedef ContainerType::const_iterator DataConstIterator;
-
- ContainerType m_data;
+ Compare m_comp;
};
-// IBamMultiMerger implementation - sorted on BamAlignment: Name
-class ReadNameMultiMerger : public IBamMultiMerger {
+// pure ABC so we can just work polymorphically with any specific merger implementation
+class IMultiMerger {
public:
- ReadNameMultiMerger(void) : IBamMultiMerger() { }
- ~ReadNameMultiMerger(void) { }
-
+ IMultiMerger(void) { }
+ virtual ~IMultiMerger(void) { }
public:
- void Add(const ReaderAlignment& value);
- void Clear(void);
- const ReaderAlignment& First(void) const;
- bool IsEmpty(void) const;
- void Remove(BamReader* reader);
- int Size(void) const;
- ReaderAlignment TakeFirst(void);
-
- private:
- typedef std::string KeyType;
- typedef ReaderAlignment ValueType;
- typedef std::pair<KeyType, ValueType> ElementType;
-
- typedef std::multimap<KeyType, ValueType> ContainerType;
- typedef ContainerType::iterator DataIterator;
- typedef ContainerType::const_iterator DataConstIterator;
-
- ContainerType m_data;
+ virtual void Add(MergeItem item) =0;
+ virtual void Clear(void) =0;
+ virtual const MergeItem& First(void) const =0;
+ virtual bool IsEmpty(void) const =0;
+ virtual void Remove(BamReader* reader) =0;
+ virtual int Size(void) const =0;
+ virtual MergeItem TakeFirst(void) =0;
};
-// IBamMultiMerger implementation - unsorted BAM file(s)
-class UnsortedMultiMerger : public IBamMultiMerger {
+// general merger
+template<typename Compare>
+class MultiMerger : public IMultiMerger {
public:
- UnsortedMultiMerger(void) : IBamMultiMerger() { }
- ~UnsortedMultiMerger(void) { }
+ typedef Compare CompareType;
+ typedef MergeItemSorter<CompareType> MergeType;
public:
- void Add(const ReaderAlignment& value);
+ explicit MultiMerger(const Compare& comp = Compare())
+ : IMultiMerger()
+ , m_data( MergeType(comp) )
+ { }
+ ~MultiMerger(void) { }
+
+ public:
+ void Add(MergeItem item);
void Clear(void);
- const ReaderAlignment& First(void) const;
+ const MergeItem& First(void) const;
bool IsEmpty(void) const;
void Remove(BamReader* reader);
int Size(void) const;
- ReaderAlignment TakeFirst(void);
+ MergeItem TakeFirst(void);
private:
- typedef ReaderAlignment ElementType;
- typedef std::vector<ReaderAlignment> ContainerType;
- typedef ContainerType::iterator DataIterator;
- typedef ContainerType::const_iterator DataConstIterator;
-
+ typedef MergeItem ValueType;
+ typedef std::multiset<ValueType, MergeType> ContainerType;
+ typedef typename ContainerType::iterator DataIterator;
+ typedef typename ContainerType::const_iterator DataConstIterator;
ContainerType m_data;
};
-// ------------------------------------------
-// PositionMultiMerger implementation
-
-inline void PositionMultiMerger::Add(const ReaderAlignment& value) {
- const KeyType key( value.second->RefID, value.second->Position );
- m_data.insert( ElementType(key, value) );
+template <typename Compare>
+inline void MultiMerger<Compare>::Add(MergeItem item) {
+ if ( CompareType::UsesCharData() )
+ item.Alignment->BuildCharData();
+ m_data.insert(item);
}
-inline void PositionMultiMerger::Clear(void) {
+template <typename Compare>
+inline void MultiMerger<Compare>::Clear(void) {
m_data.clear();
}
-inline const ReaderAlignment& PositionMultiMerger::First(void) const {
- const ElementType& entry = (*m_data.begin());
- return entry.second;
+template <typename Compare>
+inline const MergeItem& MultiMerger<Compare>::First(void) const {
+ const ValueType& entry = (*m_data.begin());
+ return entry;
}
-inline bool PositionMultiMerger::IsEmpty(void) const {
+template <typename Compare>
+inline bool MultiMerger<Compare>::IsEmpty(void) const {
return m_data.empty();
}
-
-inline void PositionMultiMerger::Remove(BamReader* reader) {
+template <typename Compare>
+inline void MultiMerger<Compare>::Remove(BamReader* reader) {
if ( reader == 0 ) return;
- const std::string filenameToRemove = reader->GetFilename();
+ const std::string& filenameToRemove = reader->GetFilename();
// iterate over readers in cache
DataIterator dataIter = m_data.begin();
DataIterator dataEnd = m_data.end();
for ( ; dataIter != dataEnd; ++dataIter ) {
- const ValueType& entry = (*dataIter).second;
- const BamReader* entryReader = entry.first;
- if ( entryReader == 0 ) continue;
+ const MergeItem& item = (*dataIter);
+ const BamReader* itemReader = item.Reader;
+ if ( itemReader == 0 ) continue;
// remove iterator on match
- if ( entryReader->GetFilename() == filenameToRemove ) {
+ if ( itemReader->GetFilename() == filenameToRemove ) {
m_data.erase(dataIter);
return;
}
}
}
-
-inline int PositionMultiMerger::Size(void) const {
+template <typename Compare>
+inline int MultiMerger<Compare>::Size(void) const {
return m_data.size();
}
-inline ReaderAlignment PositionMultiMerger::TakeFirst(void) {
- DataIterator first = m_data.begin();
- ReaderAlignment next = (*first).second;
- m_data.erase(first);
- return next;
-}
-
-// ------------------------------------------
-// ReadNameMultiMerger implementation
-
-inline void ReadNameMultiMerger::Add(const ReaderAlignment& value) {
- const KeyType key(value.second->Name);
- m_data.insert( ElementType(key, value) );
-}
-
-inline void ReadNameMultiMerger::Clear(void) {
- m_data.clear();
+template <typename Compare>
+inline MergeItem MultiMerger<Compare>::TakeFirst(void) {
+ DataIterator firstIter = m_data.begin();
+ MergeItem firstItem = (*firstIter);
+ m_data.erase(firstIter);
+ return firstItem;
}
-inline const ReaderAlignment& ReadNameMultiMerger::First(void) const {
- const ElementType& entry = (*m_data.begin());
- return entry.second;
-}
+// unsorted "merger"
+template<>
+class MultiMerger<Algorithms::Sort::Unsorted> : public IMultiMerger {
-inline bool ReadNameMultiMerger::IsEmpty(void) const {
- return m_data.empty();
-}
-
-inline void ReadNameMultiMerger::Remove(BamReader* reader) {
-
- if ( reader == 0 ) return;
- const std::string filenameToRemove = reader->GetFilename();
-
- // iterate over readers in cache
- DataIterator dataIter = m_data.begin();
- DataIterator dataEnd = m_data.end();
- for ( ; dataIter != dataEnd; ++dataIter ) {
- const ValueType& entry = (*dataIter).second;
- const BamReader* entryReader = entry.first;
- if ( entryReader == 0 ) continue;
-
- // remove iterator on match
- if ( entryReader->GetFilename() == filenameToRemove ) {
- m_data.erase(dataIter);
- return;
- }
- }
-
-}
-
-inline int ReadNameMultiMerger::Size(void) const {
- return m_data.size();
-}
+ public:
+ explicit MultiMerger(const Algorithms::Sort::Unsorted& comp = Algorithms::Sort::Unsorted())
+ : IMultiMerger()
+ { }
+ ~MultiMerger(void) { }
-inline ReaderAlignment ReadNameMultiMerger::TakeFirst(void) {
- DataIterator first = m_data.begin();
- ReaderAlignment next = (*first).second;
- m_data.erase(first);
- return next;
-}
+ public:
+ void Add(MergeItem item);
+ void Clear(void);
+ const MergeItem& First(void) const;
+ bool IsEmpty(void) const;
+ void Remove(BamReader* reader);
+ int Size(void) const;
+ MergeItem TakeFirst(void);
-// ------------------------------------------
-// UnsortedMultiMerger implementation
+ private:
+ typedef MergeItem ValueType;
+ typedef std::deque<ValueType> ContainerType;
+ typedef ContainerType::iterator DataIterator;
+ typedef ContainerType::const_iterator DataConstIterator;
+ ContainerType m_data;
+};
-inline void UnsortedMultiMerger::Add(const ReaderAlignment& value) {
- m_data.push_back(value);
+inline
+void MultiMerger<Algorithms::Sort::Unsorted>::Add(MergeItem item) {
+ m_data.push_back(item);
}
-inline void UnsortedMultiMerger::Clear(void) {
- for (size_t i = 0; i < m_data.size(); ++i )
- m_data.pop_back();
+inline
+void MultiMerger<Algorithms::Sort::Unsorted>::Clear(void) {
+ m_data.clear();
}
-inline const ReaderAlignment& UnsortedMultiMerger::First(void) const {
+inline
+const MergeItem& MultiMerger<Algorithms::Sort::Unsorted>::First(void) const {
return m_data.front();
}
-inline bool UnsortedMultiMerger::IsEmpty(void) const {
+inline
+bool MultiMerger<Algorithms::Sort::Unsorted>::IsEmpty(void) const {
return m_data.empty();
}
-inline void UnsortedMultiMerger::Remove(BamReader* reader) {
+inline
+void MultiMerger<Algorithms::Sort::Unsorted>::Remove(BamReader* reader) {
if ( reader == 0 ) return;
const std::string filenameToRemove = reader->GetFilename();
DataIterator dataIter = m_data.begin();
DataIterator dataEnd = m_data.end();
for ( ; dataIter != dataEnd; ++dataIter ) {
- const BamReader* entryReader = (*dataIter).first;
- if ( entryReader == 0 ) continue;
+ const MergeItem& item = (*dataIter);
+ const BamReader* itemReader = item.Reader;
+ if ( itemReader == 0 ) continue;
// remove iterator on match
- if ( entryReader->GetFilename() == filenameToRemove ) {
+ if ( itemReader->GetFilename() == filenameToRemove ) {
m_data.erase(dataIter);
return;
}
}
}
-inline int UnsortedMultiMerger::Size(void) const {
+inline
+int MultiMerger<Algorithms::Sort::Unsorted>::Size(void) const {
return m_data.size();
}
-inline ReaderAlignment UnsortedMultiMerger::TakeFirst(void) {
- ReaderAlignment first = m_data.front();
- m_data.erase( m_data.begin() );
- return first;
+inline
+MergeItem MultiMerger<Algorithms::Sort::Unsorted>::TakeFirst(void) {
+ MergeItem firstItem = m_data.front();
+ m_data.pop_front();
+ return firstItem;
}
} // namespace Internal
// ***************************************************************************
// BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison
// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 5 April 2011 (DB)
+// Last modified: 3 October 2011 (DB)
// ---------------------------------------------------------------------------
// Functionality for simultaneously reading multiple BAM files
// *************************************************************************
#include <api/BamAlignment.h>
#include <api/BamMultiReader.h>
-#include <api/internal/BamMultiMerger_p.h>
+#include <api/SamConstants.h>
+#include <api/algorithms/Sort.h>
#include <api/internal/BamMultiReader_p.h>
using namespace BamTools;
using namespace BamTools::Internal;
// ctor
BamMultiReaderPrivate::BamMultiReaderPrivate(void)
- : m_alignments(0)
- , m_isCoreMode(false)
- , m_sortOrder(BamMultiReader::SortedByPosition)
+ : m_alignmentCache(0)
{ }
// dtor
BamMultiReaderPrivate::~BamMultiReaderPrivate(void) {
- // close all open BAM readers
+ // close all open BAM readers (& clean up cache)
Close();
-
- // clean up alignment cache
- delete m_alignments;
- m_alignments = 0;
}
// close all BAM files
}
// close requested BAM file
-void BamMultiReaderPrivate::CloseFile(const string& filename) {
+void BamMultiReaderPrivate::CloseFile(const string& filename) {
vector<string> filenames(1, filename);
CloseFiles(filenames);
}
if ( filename.empty() ) continue;
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// if reader matches requested filename
if ( reader->GetFilename() == filename ) {
- // remove reader/alignment from alignment cache
- m_alignments->Remove(reader);
+ // remove reader's entry from alignment cache
+ m_alignmentCache->Remove(reader);
- // close & delete reader
+ // clean up reader & its alignment
reader->Close();
delete reader;
reader = 0;
// delete reader's alignment entry
- BamAlignment* alignment = (*readerIter).second;
+ BamAlignment* alignment = item.Alignment;
delete alignment;
alignment = 0;
- // remove reader from container
+ // remove reader from reader list
m_readers.erase(readerIter);
// on match, just go on to next filename
- // (no need to keep looking and iterator is invalid now anyway)
+ // (no need to keep looking and item iterator is invalid now anyway)
break;
}
}
}
- // make sure alignment cache is cleared if all readers are now closed
- if ( m_readers.empty() && m_alignments != 0 )
- m_alignments->Clear();
+ // make sure alignment cache is cleaned up if all readers closed
+ if ( m_readers.empty() && m_alignmentCache ) {
+ m_alignmentCache->Clear();
+ delete m_alignmentCache;
+ m_alignmentCache = 0;
+ }
}
// creates index files for BAM files that don't have them
bool result = true;
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ vector<MergeItem>::iterator itemIter = m_readers.begin();
+ vector<MergeItem>::iterator itemEnd = m_readers.end();
+ for ( ; itemIter != itemEnd; ++itemIter ) {
+ MergeItem& item = (*itemIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// if reader doesn't have an index, create one
return result;
}
-IBamMultiMerger* BamMultiReaderPrivate::CreateMergerForCurrentSortOrder(void) const {
- switch ( m_sortOrder ) {
- case ( BamMultiReader::SortedByPosition ) : return new PositionMultiMerger;
- case ( BamMultiReader::SortedByReadName ) : return new ReadNameMultiMerger;
- case ( BamMultiReader::Unsorted ) : return new UnsortedMultiMerger;
- default :
- cerr << "BamMultiReader ERROR: requested sort order is unknown" << endl;
- return 0;
- }
-}
+IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) const {
-const string BamMultiReaderPrivate::ExtractReadGroup(const string& headerLine) const {
+ // fetch SamHeader
+ SamHeader header = GetHeader();
- string readGroup("");
- stringstream headerLineSs(headerLine);
- string part;
+ // if BAM files are sorted by position
+ if ( header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE )
+ return new MultiMerger<Algorithms::Sort::ByPosition>();
- // parse @RG header line, looking for the ID: tag
- while( getline(headerLineSs, part, '\t') ) {
- stringstream partSs(part);
- string subtag;
- getline(partSs, subtag, ':');
- if ( subtag == "ID" ) {
- getline(partSs, readGroup, ':');
- break;
- }
- }
- return readGroup;
+ // if BAM files are sorted by read name
+ if ( header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME )
+ return new MultiMerger<Algorithms::Sort::ByName>();
+
+ // otherwise "unknown" or "unsorted", use unsorted merger and just read in
+ return new MultiMerger<Algorithms::Sort::Unsorted>();
}
const vector<string> BamMultiReaderPrivate::Filenames(void) const {
filenames.reserve( m_readers.size() );
// iterate over readers
- vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- const BamReader* reader = (*readerIter).first;
+ vector<MergeItem>::const_iterator itemIter = m_readers.begin();
+ vector<MergeItem>::const_iterator itemEnd = m_readers.end();
+ for ( ; itemIter != itemEnd; ++itemIter ) {
+ const MergeItem& item = (*itemIter);
+ const BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// store filename if not empty
- const string filename = reader->GetFilename();
+ const string& filename = reader->GetFilename();
if ( !filename.empty() )
- filenames.push_back( reader->GetFilename() );
+ filenames.push_back(filename);
}
// return result
}
SamHeader BamMultiReaderPrivate::GetHeader(void) const {
- string text = GetHeaderText();
+ const string& text = GetHeaderText();
return SamHeader(text);
}
// makes a virtual, unified header for all the bam files in the multireader
string BamMultiReaderPrivate::GetHeaderText(void) const {
- // TODO: merge SamHeader objects instead of parsing string data (again)
-
- // if only one reader is open
- if ( m_readers.size() == 1 ) {
-
- // just return reader's header text
- const ReaderAlignment& ra = m_readers.front();
- const BamReader* reader = ra.first;
- if ( reader ) return reader->GetHeaderText();
+ // N.B. - right now, simply copies all header data from first BAM,
+ // and then appends RG's from other BAM files
+ // TODO: make this more intelligent wrt other header lines/fields
- // invalid reader
- return string();
- }
+ // if no readers open
+ const size_t numReaders = m_readers.size();
+ if ( numReaders == 0 ) return string();
- string mergedHeader("");
- map<string, bool> readGroups;
+ // retrieve first reader's header
+ const MergeItem& firstItem = m_readers.front();
+ const BamReader* reader = firstItem.Reader;
+ if ( reader == 0 ) return string();
+ SamHeader mergedHeader = reader->GetHeader();
- // foreach extraction entry (each BAM file)
- vector<ReaderAlignment>::const_iterator readerBegin = m_readers.begin();
- vector<ReaderAlignment>::const_iterator readerIter = readerBegin;
- vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- const BamReader* reader = (*readerIter).first;
+ // iterate over any remaining readers (skipping the first)
+ for ( size_t i = 1; i < numReaders; ++i ) {
+ const MergeItem& item = m_readers.at(i);
+ const BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
- // get header from reader
- string headerText = reader->GetHeaderText();
- if ( headerText.empty() ) continue;
-
- // store header text in lines
- map<string, bool> currentFileReadGroups;
- const vector<string> lines = SplitHeaderText(headerText);
-
- // iterate over header lines
- vector<string>::const_iterator linesIter = lines.begin();
- vector<string>::const_iterator linesEnd = lines.end();
- for ( ; linesIter != linesEnd; ++linesIter ) {
-
- // get next line from header, skip if empty
- const string headerLine = (*linesIter);
- if ( headerLine.empty() ) continue;
-
- // if first file, save HD & SQ entries
- // TODO: what if first file has empty header, should just check for empty 'mergedHeader' instead ?
- if ( readerIter == readerBegin ) {
- if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
- mergedHeader.append(headerLine.c_str());
- mergedHeader.append(1, '\n');
- }
- }
+ // retrieve current reader's header
+ const SamHeader currentHeader = reader->GetHeader();
- // (for all files) append RG entries if they are unique
- if ( headerLine.find("@RG") == 0 ) {
-
- // extract read group name from line
- const string readGroup = ExtractReadGroup(headerLine);
-
- // make sure not to duplicate @RG entries
- if ( readGroups.find(readGroup) == readGroups.end() ) {
- mergedHeader.append(headerLine.c_str() );
- mergedHeader.append(1, '\n');
- readGroups[readGroup] = true;
- currentFileReadGroups[readGroup] = true;
- } else {
- // warn iff we are reading one file and discover duplicated @RG tags in the header
- // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
- if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) {
- cerr << "BamMultiReader WARNING: duplicate @RG tag " << readGroup
- << " entry in header of " << reader->GetFilename() << endl;
- }
- }
- }
- }
+ // append current reader's RG entries to merged header
+ // N.B. - SamReadGroupDictionary handles duplicate-checking
+ mergedHeader.ReadGroups.Add(currentHeader.ReadGroups);
+
+ // TODO: merge anything else??
}
- // return merged header text
- return mergedHeader;
+ // return stringified header
+ return mergedHeader.ToString();
}
// get next alignment among all files
bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) {
- m_isCoreMode = false;
- return LoadNextAlignment(al);
+ return PopNextCachedAlignment(al, true);
}
// get next alignment among all files without parsing character data from alignments
bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) {
- m_isCoreMode = true;
- return LoadNextAlignment(al);
+ return PopNextCachedAlignment(al, false);
}
// ---------------------------------------------------------------------------------------
return 0;
// return reference count from first reader
- const ReaderAlignment& ra = m_readers.front();
- const BamReader* reader = ra.first;
+ const MergeItem& item = m_readers.front();
+ const BamReader* reader = item.Reader;
if ( reader ) return reader->GetReferenceCount();
// invalid reader
return RefVector();
// return reference data from first BamReader
- const ReaderAlignment& ra = m_readers.front();
- const BamReader* reader = ra.first;
+ const MergeItem& item = m_readers.front();
+ const BamReader* reader = item.Reader;
if ( reader ) return reader->GetReferenceData();
// invalid reader
return -1;
// return reference ID from first BamReader
- const ReaderAlignment& ra = m_readers.front();
- const BamReader* reader = ra.first;
+ const MergeItem& item = m_readers.front();
+ const BamReader* reader = item.Reader;
if ( reader ) return reader->GetReferenceID(refName);
// invalid reader
}
// ---------------------------------------------------------------------------------------
-// checks if any readers still have alignments
-bool BamMultiReaderPrivate::HasAlignmentData(void) const {
- if ( m_alignments == 0 )
- return false;
- return !m_alignments->IsEmpty();
-}
-
// returns true if all readers have index data available
// this is useful to indicate whether Jump() or SetRegion() are possible
bool BamMultiReaderPrivate::HasIndexes(void) const {
bool result = true;
// iterate over readers
- vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
+ vector<MergeItem>::const_iterator readerIter = m_readers.begin();
+ vector<MergeItem>::const_iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- const BamReader* reader = (*readerIter).first;
+ const MergeItem& item = (*readerIter);
+ const BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// see if current reader has index data
bool BamMultiReaderPrivate::HasOpenReaders(void) {
// iterate over readers
- vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
+ vector<MergeItem>::const_iterator readerIter = m_readers.begin();
+ vector<MergeItem>::const_iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- const BamReader* reader = (*readerIter).first;
+ const MergeItem& item = (*readerIter);
+ const BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// return true whenever an open reader is found
// UpdateAlignments(), and continue.
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// attempt jump() on each
}
}
- // update alignment cache & return success
- UpdateAlignmentCache();
- return true;
-}
-
-bool BamMultiReaderPrivate::LoadNextAlignment(BamAlignment& al) {
-
- // bail out if no more data to process
- if ( !HasAlignmentData() )
- return false;
-
- // "pop" next alignment and reader
- ReaderAlignment nextReaderAlignment = m_alignments->TakeFirst();
- BamReader* reader = nextReaderAlignment.first;
- BamAlignment* alignment = nextReaderAlignment.second;
-
- // store cached alignment into destination parameter (by copy)
- al = *alignment;
-
- // peek to next alignment & store in cache
- SaveNextAlignment(reader, alignment);
-
- // return success
- return true;
+ // returns status of cache update
+ return UpdateAlignmentCache();
}
// locate (& load) index files for BAM readers that don't already have one loaded
bool result = true;
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// if reader has no index, try to locate one
// opens BAM files
bool BamMultiReaderPrivate::Open(const vector<string>& filenames) {
- // create alignment cache if neccessary
- if ( m_alignments == 0 ) {
- m_alignments = CreateMergerForCurrentSortOrder();
- if ( m_alignments == 0 ) return false;
- }
+ bool openedOk = true;
+
+ // put all current readers back at beginning
+ openedOk &= Rewind();
// iterate over filenames
vector<string>::const_iterator filenameIter = filenames.begin();
const string& filename = (*filenameIter);
if ( filename.empty() ) continue;
- // attempt to open BamReader on filename
- BamReader* reader = OpenReader(filename);
- if ( reader == 0 ) continue;
+ // attempt to open BamReader
+ BamReader* reader = new BamReader;
+ const bool readerOpened = reader->Open(filename);
- // store reader with new alignment
- m_readers.push_back( make_pair(reader, new BamAlignment) );
- }
+ // if opened OK, store it
+ if ( readerOpened )
+ m_readers.push_back( MergeItem(reader, new BamAlignment) );
+
+ // otherwise clean up invalid reader
+ else delete reader;
- // validate & rewind any opened readers, also refreshes alignment cache
- if ( !m_readers.empty() ) {
- ValidateReaders();
- Rewind();
+ // update method return status
+ openedOk &= readerOpened;
}
+ // if more than one reader open, check for consistency
+ if ( m_readers.size() > 1 )
+ openedOk &= ValidateReaders();
+
+ // update alignment cache
+ openedOk &= UpdateAlignmentCache();
+
// return success
- return true;
+ return openedOk;
}
bool BamMultiReaderPrivate::OpenFile(const std::string& filename) {
// first reader without an index?
// make sure same number of index filenames as readers
- if ( m_readers.size() != indexFilenames.size() || !indexFilenames.empty() )
+ if ( m_readers.size() != indexFilenames.size() )
return false;
// init result flag
// iterate over BamReaders
vector<string>::const_iterator indexFilenameIter = indexFilenames.begin();
vector<string>::const_iterator indexFilenameEnd = indexFilenames.end();
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
// open index filename on reader
if ( reader ) {
break;
}
- // TODO: validation ??
+ // TODO: any validation needed here??
// return success/fail
return result;
}
-BamReader* BamMultiReaderPrivate::OpenReader(const std::string& filename) {
+bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) {
- // create new BamReader
- BamReader* reader = new BamReader;
-
- // if reader opens OK
- if ( reader->Open(filename) ) {
+ // skip if no alignments available
+ if ( m_alignmentCache == 0 || m_alignmentCache->IsEmpty() )
+ return false;
- // attempt to read first alignment (sanity check)
- // if ok, then return BamReader pointer
- BamAlignment al;
- if ( reader->GetNextAlignmentCore(al) )
- return reader;
+ // pop next merge item entry from cache
+ MergeItem item = m_alignmentCache->TakeFirst();
+ BamReader* reader = item.Reader;
+ BamAlignment* alignment = item.Alignment;
+ if ( reader == 0 || alignment == 0 )
+ return false;
- // could not read alignment
- else {
- cerr << "BamMultiReader WARNING: Could not read first alignment from "
- << filename << ", ignoring file" << endl;
- }
+ // set char data if requested
+ if ( needCharData ) {
+ alignment->BuildCharData();
+ alignment->Filename = reader->GetFilename();
}
- // reader could not open
- else {
- cerr << "BamMultiReader WARNING: Could not open "
- << filename << ", ignoring file" << endl;
- }
+ // store cached alignment into destination parameter (by copy)
+ al = *alignment;
- // if we get here, there was a problem with this BAM file (opening or reading)
- // clean up memory allocation & return null pointer
- delete reader;
- return 0;
-}
+ // load next alignment from reader & store in cache
+ SaveNextAlignment(reader, alignment);
-// print associated filenames to stdout
-void BamMultiReaderPrivate::PrintFilenames(void) const {
- const vector<string>& filenames = Filenames();
- vector<string>::const_iterator filenameIter = filenames.begin();
- vector<string>::const_iterator filenameEnd = filenames.end();
- for ( ; filenameIter != filenameEnd; ++filenameIter )
- cout << (*filenameIter) << endl;
+ // return success
+ return true;
}
// returns BAM file pointers to beginning of alignment data & resets alignment cache
bool BamMultiReaderPrivate::Rewind(void) {
- // clear out alignment cache
- m_alignments->Clear();
-
// attempt to rewind files
if ( !RewindReaders() ) {
cerr << "BamMultiReader ERROR: could not rewind file(s) successfully";
return false;
}
- // reset cache & return success
- UpdateAlignmentCache();
- return true;
+ // return status of cache update
+ return UpdateAlignmentCache();
}
// returns BAM file pointers to beginning of alignment data
bool result = true;
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// attempt rewind on BamReader
void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) {
- // must be in core mode && NOT sorting by read name to call GNACore()
- if ( m_isCoreMode && m_sortOrder != BamMultiReader::SortedByReadName ) {
- if ( reader->GetNextAlignmentCore(*alignment) )
- m_alignments->Add( make_pair(reader, alignment) );
- }
-
- // not in core mode and/or sorting by readname, must call GNA()
- else {
- if ( reader->GetNextAlignment(*alignment) )
- m_alignments->Add( make_pair(reader, alignment) );
- }
+ // if can read alignment from reader, store in cache
+ // N.B. - lazy building of alignment's char data,
+ // only populated on demand by sorting merger or client call to GetNextAlignment()
+ if ( reader->GetNextAlignmentCore(*alignment) )
+ m_alignmentCache->Add(MergeItem(reader, alignment));
}
// sets the index caching mode on the readers
void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode mode) {
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// set reader's index cache mode
// UpdateAlignments(), and continue.
// iterate over alignments
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// attempt to set BamReader's region of interest
}
}
- // update alignment cache & return success
- UpdateAlignmentCache();
- return true;
-}
-
-void BamMultiReaderPrivate::SetSortOrder(const BamMultiReader::SortOrder& order) {
-
- // skip if no change needed
- if ( m_sortOrder == order ) return;
-
- // set new sort order
- m_sortOrder = order;
-
- // create new alignment cache based on sort order
- IBamMultiMerger* newAlignmentCache = CreateMergerForCurrentSortOrder();
- if ( newAlignmentCache == 0 ) return; // print error?
-
- // copy old cache contents to new cache
- while ( m_alignments->Size() > 0 ) {
- ReaderAlignment value = m_alignments->TakeFirst(); // retrieves & 'pops'
- newAlignmentCache->Add(value);
- }
-
- // remove old cache structure & point to new cache
- delete m_alignments;
- m_alignments = newAlignmentCache;
-}
-
-// splits the entire header into a list of strings
-const vector<string> BamMultiReaderPrivate::SplitHeaderText(const string& headerText) const {
-
- stringstream header(headerText);
- string item;
-
- vector<string> lines;
- while ( getline(header, item) )
- lines.push_back(item);
- return lines;
+ // return status of cache update
+ return UpdateAlignmentCache();
}
// updates our alignment cache
-void BamMultiReaderPrivate::UpdateAlignmentCache(void) {
-
- // skip if invalid alignment cache
- if ( m_alignments == 0 ) return;
-
- // clear the cache
- m_alignments->Clear();
+bool BamMultiReaderPrivate::UpdateAlignmentCache(void) {
+
+ // create alignment cache if not created yet
+ if ( m_alignmentCache == 0 ) {
+ m_alignmentCache = CreateAlignmentCache();
+ if ( m_alignmentCache == 0 ) {
+ // set error string
+ return false;
+ }
+ }
- // seed cache with fully-populated alignments
- // further updates will fill with full/core-only as requested
- m_isCoreMode = false;
+ // clear any prior cache data
+ m_alignmentCache->Clear();
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
- BamAlignment* alignment = (*readerIter).second;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
+ BamAlignment* alignment = item.Alignment;
if ( reader == 0 || alignment == 0 ) continue;
// save next alignment from each reader in cache
SaveNextAlignment(reader, alignment);
}
+
+ // if we get here, ok
+ return true;
}
// ValidateReaders checks that all the readers point to BAM files representing
// alignments against the same set of reference sequences, and that the
// sequences are identically ordered. If these checks fail the operation of
// the multireader is undefined, so we force program exit.
-void BamMultiReaderPrivate::ValidateReaders(void) const {
+bool BamMultiReaderPrivate::ValidateReaders(void) const {
+
+ // skip if no readers opened
+ if ( m_readers.empty() )
+ return true;
- // retrieve first reader data
- const BamReader* firstReader = m_readers.front().first;
- if ( firstReader == 0 ) return;
- const RefVector firstReaderRefData = firstReader->GetReferenceData();
+ // retrieve first reader
+ const MergeItem& firstItem = m_readers.front();
+ const BamReader* firstReader = firstItem.Reader;
+ if ( firstReader == 0 ) return false;
+
+ // retrieve first reader's header data
+ const SamHeader& firstReaderHeader = firstReader->GetHeader();
+ const string& firstReaderSortOrder = firstReaderHeader.SortOrder;
+
+ // retrieve first reader's reference data
+ const RefVector& firstReaderRefData = firstReader->GetReferenceData();
const int firstReaderRefCount = firstReader->GetReferenceCount();
const int firstReaderRefSize = firstReaderRefData.size();
// iterate over all readers
- vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
+ vector<MergeItem>::const_iterator readerIter = m_readers.begin();
+ vector<MergeItem>::const_iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
-
- // get current reader data
- BamReader* reader = (*readerIter).first;
+ const MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
+
+ // get current reader's header data
+ const SamHeader& currentReaderHeader = reader->GetHeader();
+ const string& currentReaderSortOrder = currentReaderHeader.SortOrder;
+
+ // check compatible sort order
+ if ( currentReaderSortOrder != firstReaderSortOrder ) {
+ // error string
+ cerr << "BamMultiReader ERROR: mismatched sort order in " << reader->GetFilename()
+ << ", expected " << firstReaderSortOrder
+ << ", but found " << currentReaderSortOrder << endl;
+ return false;
+ }
+
+ // get current reader's reference data
const RefVector currentReaderRefData = reader->GetReferenceData();
const int currentReaderRefCount = reader->GetReferenceCount();
const int currentReaderRefSize = currentReaderRefData.size();
- // init container iterators
+ // init reference data iterators
RefVector::const_iterator firstRefIter = firstReaderRefData.begin();
RefVector::const_iterator firstRefEnd = firstReaderRefData.end();
RefVector::const_iterator currentRefIter = currentReaderRefData.begin();
cerr << "BamMultiReader ERROR: mismatched number of references in " << reader->GetFilename()
<< " expected " << firstReaderRefCount
<< " reference sequences but only found " << currentReaderRefCount << endl;
- exit(1);
+ return false;
}
// this will be ok; we just checked above that we have identically-sized sets of references
cerr << entry.RefName << " " << entry.RefLength << endl;
}
- exit(1);
+ return false;
}
// update iterators
++currentRefIter;
}
}
+
+ // if we get here, everything checks out
+ return true;
}
// ***************************************************************************
// BamMultiReader_p.h (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 13 March 2011 (DB)
+// Last modified: 3 October 2011 (DB)
// ---------------------------------------------------------------------------
// Functionality for simultaneously reading multiple BAM files
// *************************************************************************
#include <api/SamHeader.h>
#include <api/BamMultiReader.h>
+#include <api/internal/BamMultiMerger_p.h>
#include <string>
#include <vector>
namespace BamTools {
namespace Internal {
-class IBamMultiMerger;
-
class BamMultiReaderPrivate {
// constructor / destructor
bool Jump(int refID, int position = 0);
bool Open(const std::vector<std::string>& filenames);
bool OpenFile(const std::string& filename);
- void PrintFilenames(void) const;
bool Rewind(void);
bool SetRegion(const BamRegion& region);
bool GetNextAlignment(BamAlignment& al);
bool GetNextAlignmentCore(BamAlignment& al);
bool HasOpenReaders(void);
- void SetSortOrder(const BamMultiReader::SortOrder& order);
// access auxiliary data
SamHeader GetHeader(void) const;
// 'internal' methods
public:
- IBamMultiMerger* CreateMergerForCurrentSortOrder(void) const;
- const std::string ExtractReadGroup(const std::string& headerLine) const;
- bool HasAlignmentData(void) const;
- bool LoadNextAlignment(BamAlignment& al);
- BamTools::BamReader* OpenReader(const std::string& filename);
+ IMultiMerger* CreateAlignmentCache(void) const;
+ bool PopNextCachedAlignment(BamAlignment& al, const bool needCharData);
bool RewindReaders(void);
- void SaveNextAlignment(BamTools::BamReader* reader, BamTools::BamAlignment* alignment);
- const std::vector<std::string> SplitHeaderText(const std::string& headerText) const;
- void UpdateAlignmentCache(void);
- void ValidateReaders(void) const;
+ void SaveNextAlignment(BamReader* reader, BamAlignment* alignment);
+ bool UpdateAlignmentCache(void);
+ bool ValidateReaders(void) const;
// data members
public:
- typedef std::pair<BamReader*, BamAlignment*> ReaderAlignment;
- std::vector<ReaderAlignment> m_readers;
-
- IBamMultiMerger* m_alignments;
- bool m_isCoreMode;
- BamMultiReader::SortOrder m_sortOrder;
+ std::vector<MergeItem> m_readers;
+ IMultiMerger* m_alignmentCache;
};
} // namespace Internal
alignments.push_back(al);
}
+ using namespace BamTools::Algorithms;
+
cerr << endl
<< "------------------------------" << endl
<< "Unsorted Alignments" << endl
<< "------------------------------" << endl
<< endl;
- std::stable_sort(alignments.begin(), alignments.end(), Algorithms::Unsorted());
+ std::stable_sort(alignments.begin(), alignments.end(), Sort::Unsorted());
printAlignments(alignments);
cerr << "------------------------------" << endl
<< endl;
<< "Sorted Alignments (by name)" << endl
<< "------------------------------" << endl
<< endl;
- std::sort(alignments.begin(), alignments.end(), Algorithms::SortByName<>());
+// std::sort(alignments.begin(), alignments.end(), Sort::ByName());
+ Algorithms::SortAlignments(alignments, Sort::ByName());
printAlignments(alignments);
cerr << endl
<< "------------------------------" << endl
<< "Sorted Alignments (by tag Aq)" << endl
<< "------------------------------" << endl
<< endl;
- std::sort(alignments.begin(), alignments.end(), Algorithms::SortByTag<int>("Aq"));
+// std::sort(alignments.begin(), alignments.end(), Sort::ByTag<int>("Aq"));
+ Algorithms::SortAlignments(alignments, Sort::ByTag<int>("Aq"));
printAlignments(alignments);
cerr << endl
<< "------------------------------" << endl
<< "Sorted Alignments (by tag Aq) desc" << endl
<< "------------------------------" << endl
<< endl;
- std::sort(alignments.begin(), alignments.end(), Algorithms::SortByTag<int, std::greater>("Aq"));
+ std::sort(alignments.begin(), alignments.end(), Sort::ByTag<int>("Aq", Sort::DescendingOrder));
printAlignments(alignments);
cerr << endl
<< "------------------------------" << endl
reader.LocateIndexes();
// if index data available for all BAM files, we can use SetRegion
- if ( reader.IsIndexLoaded() ) {
+ if ( reader.HasIndexes() ) {
vector<BamAlignment> alignments;
+ using namespace BamTools::Algorithms;
- alignments = Algorithms::SortReaderRegion(reader, region, Algorithms::SortByName<>() );
+ alignments = GetSortedRegion(reader, region, Sort::ByName() );
printAlignments(alignments);
cerr << "################################" << endl;
- alignments = Algorithms::SortReaderRegion(reader, region, Algorithms::SortByTag<int>("Aq"));
+ alignments = GetSortedRegion(reader, region, Sort::ByTag<int>("Aq"));
printAlignments(alignments);
// // attempt to set region on reader
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 14 June 2011 (DB)
+// Last modified: 3 October 2011 (DB)
// ---------------------------------------------------------------------------
// Sorts an input BAM file
// ***************************************************************************
vector<BamAlignment> buffer;
buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) );
bool bufferFull = false;
-
+
// if sorting by name, we need to generate full char data
// so can't use GetNextAlignmentCore()
if ( m_settings->IsSortingByName ) {
return false;
}
- // set sort order for merge
- if ( m_settings->IsSortingByName )
- multiReader.SetSortOrder(BamMultiReader::SortedByReadName);
- else
- multiReader.SetSortOrder(BamMultiReader::SortedByPosition);
-
// open writer for our completely sorted output BAM file
BamWriter mergedWriter;
if ( !mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references) ) {
while ( multiReader.GetNextAlignmentCore(al) )
mergedWriter.SaveAlignment(al);
- // close readers
+ // close files
multiReader.Close();
mergedWriter.Close();
remove(tempFilename.c_str());
}
+ // return success
return true;
}