From 83255b02911cc3e267386e20bcb8747478b4c239 Mon Sep 17 00:00:00 2001 From: derek Date: Thu, 23 Dec 2010 21:14:49 -0500 Subject: [PATCH] Implemented proper -byname sorting (finally). * BamMultiReader used to merge the "next" alignment based on (refID, position). Extracted this and generalized to support merging on either position OR alignment name. --- src/api/BamMultiReader.cpp | 402 ++---------------- src/api/BamMultiReader.h | 65 ++- src/api/CMakeLists.txt | 2 + src/api/internal/BamMultiMerger_p.h | 158 +++++++ src/api/internal/BamMultiReader_p.cpp | 565 ++++++++++++++++++++++++++ src/api/internal/BamMultiReader_p.h | 95 +++++ 6 files changed, 879 insertions(+), 408 deletions(-) create mode 100644 src/api/internal/BamMultiMerger_p.h create mode 100644 src/api/internal/BamMultiReader_p.cpp create mode 100644 src/api/internal/BamMultiReader_p.h diff --git a/src/api/BamMultiReader.cpp b/src/api/BamMultiReader.cpp index 3331c09..e5af282 100644 --- a/src/api/BamMultiReader.cpp +++ b/src/api/BamMultiReader.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 23 December 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -18,13 +18,9 @@ #include #include +#include using namespace BamTools; -#include -#include -#include -#include -#include #include #include using namespace std; @@ -33,418 +29,92 @@ using namespace std; // BamMultiReader implementation // ----------------------------------------------------- -// constructor BamMultiReader::BamMultiReader(void) - : CurrentRefID(0) - , CurrentLeft(0) + : d(new Internal::BamMultiReaderPrivate) { } -// destructor BamMultiReader::~BamMultiReader(void) { - Close(); + delete d; + d = 0; } -// close the BAM files void BamMultiReader::Close(void) { - - // close all BAM readers and clean up pointers - vector >::iterator readerIter = readers.begin(); - vector >::iterator readerEnd = readers.end(); - for ( ; readerIter != readerEnd; ++readerIter) { - - BamReader* reader = (*readerIter).first; - BamAlignment* alignment = (*readerIter).second; - - // close the reader - if ( reader) reader->Close(); - - // delete reader pointer - delete reader; - reader = 0; - - // delete alignment pointer - delete alignment; - alignment = 0; - } - - // clear out the container - readers.clear(); + d->Close(); } -// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail bool BamMultiReader::CreateIndexes(bool useStandardIndex) { - bool result = true; - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->CreateIndex(useStandardIndex); - } - return result; + return d->CreateIndexes(useStandardIndex); } -// sets the index caching mode on the readers void BamMultiReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - reader->SetIndexCacheMode(mode); - } -} - -// for debugging -void BamMultiReader::DumpAlignmentIndex(void) { - for (AlignmentIndex::const_iterator it = alignments.begin(); it != alignments.end(); ++it) { - cerr << it->first.first << ":" << it->first.second << " " << it->second.first->GetFilename() << endl; - } + d->SetIndexCacheMode(mode); } -// makes a virtual, unified header for all the bam files in the multireader const string BamMultiReader::GetHeaderText(void) const { - - string mergedHeader = ""; - map readGroups; - - // foreach extraction entry (each BAM file) - for (vector >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) { - - BamReader* reader = rs->first; - string headerText = reader->GetHeaderText(); - if ( headerText.empty() ) continue; - - map currentFileReadGroups; - stringstream header(headerText); - vector lines; - string item; - while (getline(header, item)) - lines.push_back(item); - - for (vector::const_iterator it = lines.begin(); it != lines.end(); ++it) { - - // get next line from header, skip if empty - string headerLine = *it; - if ( headerLine.empty() ) { continue; } - - // if first file, save HD & SQ entries - if ( rs == readers.begin() ) { - if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { - mergedHeader.append(headerLine.c_str()); - mergedHeader.append(1, '\n'); - } - } - - // (for all files) append RG entries if they are unique - if ( headerLine.find("@RG") == 0 ) { - stringstream headerLineSs(headerLine); - string part, readGroupPart, readGroup; - while(std::getline(headerLineSs, part, '\t')) { - stringstream partSs(part); - string subtag; - std::getline(partSs, subtag, ':'); - if (subtag == "ID") { - std::getline(partSs, readGroup, ':'); - break; - } - } - if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries - mergedHeader.append(headerLine.c_str() ); - mergedHeader.append(1, '\n'); - readGroups[readGroup] = true; - currentFileReadGroups[readGroup] = true; - } else { - // warn iff we are reading one file and discover duplicated @RG tags in the header - // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags - if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) { - cerr << "WARNING: duplicate @RG tag " << readGroup - << " entry in header of " << reader->GetFilename() << endl; - } - } - } - } - } - - // return merged header text - return mergedHeader; + return d->GetHeaderText(); } -// get next alignment among all files bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { - - // bail out if we are at EOF in all files, means no more alignments to process - if (!HasOpenReaders()) - return false; - - // when all alignments have stepped into a new target sequence, update our - // current reference sequence id - UpdateReferenceID(); - - // our lowest alignment and reader will be at the front of our alignment index - BamAlignment* alignment = alignments.begin()->second.second; - BamReader* reader = alignments.begin()->second.first; - - // now that we have the lowest alignment in the set, save it by copy to our argument - nextAlignment = BamAlignment(*alignment); - - // remove this alignment index entry from our alignment index - alignments.erase(alignments.begin()); - - // and add another entry if we can get another alignment from the reader - if (reader->GetNextAlignment(*alignment)) { - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { // do nothing - //cerr << "reached end of file " << lowestReader->GetFilename() << endl; - } - - return true; - + return d->GetNextAlignment(nextAlignment); } -// get next alignment among all files without parsing character data from alignments bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { - - // bail out if we are at EOF in all files, means no more alignments to process - if (!HasOpenReaders()) - return false; - - // when all alignments have stepped into a new target sequence, update our - // current reference sequence id - UpdateReferenceID(); - - // our lowest alignment and reader will be at the front of our alignment index - BamAlignment* alignment = alignments.begin()->second.second; - BamReader* reader = alignments.begin()->second.first; - - // now that we have the lowest alignment in the set, save it by copy to our argument - nextAlignment = BamAlignment(*alignment); - //memcpy(&nextAlignment, alignment, sizeof(BamAlignment)); - - // remove this alignment index entry from our alignment index - alignments.erase(alignments.begin()); - - // and add another entry if we can get another alignment from the reader - if (reader->GetNextAlignmentCore(*alignment)) { - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { // do nothing - //cerr << "reached end of file " << lowestReader->GetFilename() << endl; - } - - return true; - + return d->GetNextAlignmentCore(nextAlignment); } -// --------------------------------------------------------------------------------------- -// -// NB: The following GetReferenceX() functions assume that we have identical -// references for all BAM files. We enforce this by invoking the above -// validation function (ValidateReaders) to verify that our reference data -// is the same across all files on Open, so we will not encounter a situation -// in which there is a mismatch and we are still live. -// -// --------------------------------------------------------------------------------------- - -// returns the number of reference sequences const int BamMultiReader::GetReferenceCount(void) const { - return readers.front().first->GetReferenceCount(); + return d->GetReferenceCount(); } -// returns vector of reference objects const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { - return readers.front().first->GetReferenceData(); + return d->GetReferenceData(); } -// returns refID from reference name const int BamMultiReader::GetReferenceID(const string& refName) const { - return readers.front().first->GetReferenceID(refName); + return d->GetReferenceID(refName); } -// --------------------------------------------------------------------------------------- - -// checks if any readers still have alignments bool BamMultiReader::HasOpenReaders() { - return alignments.size() > 0; + return d->HasOpenReaders(); } -// returns whether underlying BAM readers ALL have an index loaded -// this is useful to indicate whether Jump() or SetRegion() are possible bool BamMultiReader::IsIndexLoaded(void) const { - bool ok = true; - vector >::const_iterator readerIter = readers.begin(); - vector >::const_iterator readerEnd = readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - const BamReader* reader = (*readerIter).first; - if ( reader ) ok &= reader->IsIndexLoaded(); - } - return ok; + return d->IsIndexLoaded(); } -// jumps to specified region(refID, leftBound) in BAM files, returns success/fail bool BamMultiReader::Jump(int refID, int position) { - - //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { - CurrentRefID = refID; - CurrentLeft = position; - - bool result = true; - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->Jump(refID, position); - if (!result) { - cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl; - exit(1); - } - } - if (result) UpdateAlignments(); - return result; + return d->Jump(refID, position); } -// opens BAM files -bool BamMultiReader::Open(const vector& filenames, bool openIndexes, bool coreMode, bool preferStandardIndex) { - - // for filename in filenames - fileNames = filenames; // save filenames in our multireader - for (vector::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { - - const string filename = *it; - BamReader* reader = new BamReader; - - bool openedOK = true; - openedOK = reader->Open(filename, "", openIndexes, preferStandardIndex); - - // if file opened ok, check that it can be read - if ( openedOK ) { - - bool fileOK = true; - BamAlignment* alignment = new BamAlignment; - fileOK &= ( coreMode ? reader->GetNextAlignmentCore(*alignment) : reader->GetNextAlignment(*alignment) ); - - if (fileOK) { - readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { - cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl; - // if only file available & could not be read, return failure - if ( filenames.size() == 1 ) return false; - } - } - - // TODO; any further error handling when openedOK is false ?? - else - return false; - } - - // files opened ok, at least one alignment could be read, - // now need to check that all files use same reference data - ValidateReaders(); - return true; +bool BamMultiReader::Open(const vector& filenames, + bool openIndexes, + bool coreMode, + bool preferStandardIndex) +{ + return d->Open(filenames, openIndexes, coreMode, preferStandardIndex); } -void BamMultiReader::PrintFilenames(void) { - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - cout << reader->GetFilename() << endl; - } +void BamMultiReader::PrintFilenames(void) const { + d->PrintFilenames(); } -// returns BAM file pointers to beginning of alignment data -bool BamMultiReader::Rewind(void) { - bool result = true; - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->Rewind(); - } - return result; +bool BamMultiReader::Rewind(void) { + return d->Rewind(); } -bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) { +bool BamMultiReader::SetRegion(const int& leftRefID, + const int& leftPosition, + const int& rightRefID, + const int& rightPosition) +{ BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition); - return SetRegion(region); + return d->SetRegion(region); } bool BamMultiReader::SetRegion(const BamRegion& region) { - - Region = region; - - // NB: While it may make sense to track readers in which we can - // successfully SetRegion, In practice a failure of SetRegion means "no - // alignments here." It makes sense to simply accept the failure, - // UpdateAlignments(), and continue. - - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - if (!it->first->SetRegion(region)) { - cerr << "ERROR: could not jump " << it->first->GetFilename() << " to " - << region.LeftRefID << ":" << region.LeftPosition - << ".." << region.RightRefID << ":" << region.RightPosition << endl; - } - } - - UpdateAlignments(); - return true; -} - -void BamMultiReader::UpdateAlignments(void) { - // Update Alignments - alignments.clear(); - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* br = it->first; - BamAlignment* ba = it->second; - if (br->GetNextAlignment(*ba)) { - alignments.insert(make_pair(make_pair(ba->RefID, ba->Position), - make_pair(br, ba))); - } else { - // assume BamReader end of region / EOF - } - } -} - -// updates the reference id stored in the BamMultiReader -// to reflect the current state of the readers -void BamMultiReader::UpdateReferenceID(void) { - // the alignments are sorted by position, so the first alignment will always have the lowest reference ID - if (alignments.begin()->second.second->RefID != CurrentRefID) { - // get the next reference id - // while there aren't any readers at the next ref id - // increment the ref id - int nextRefID = CurrentRefID; - while (alignments.begin()->second.second->RefID != nextRefID) { - ++nextRefID; - } - //cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl; - CurrentRefID = nextRefID; - } + return d->SetRegion(region); } -// ValidateReaders checks that all the readers point to BAM files representing -// alignments against the same set of reference sequences, and that the -// sequences are identically ordered. If these checks fail the operation of -// the multireader is undefined, so we force program exit. -void BamMultiReader::ValidateReaders(void) const { - int firstRefCount = readers.front().first->GetReferenceCount(); - BamTools::RefVector firstRefData = readers.front().first->GetReferenceData(); - for (vector >::const_iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - BamTools::RefVector currentRefData = reader->GetReferenceData(); - BamTools::RefVector::const_iterator f = firstRefData.begin(); - BamTools::RefVector::const_iterator c = currentRefData.begin(); - if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) { - cerr << "ERROR: mismatched number of references in " << reader->GetFilename() - << " expected " << firstRefCount - << " reference sequences but only found " << reader->GetReferenceCount() << endl; - exit(1); - } - // this will be ok; we just checked above that we have identically-sized sets of references - // here we simply check if they are all, in fact, equal in content - while (f != firstRefData.end()) { - if (f->RefName != c->RefName || f->RefLength != c->RefLength) { - cerr << "ERROR: mismatched references found in " << reader->GetFilename() - << " expected: " << endl; - for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a) - cerr << a->RefName << " " << a->RefLength << endl; - cerr << "but found: " << endl; - for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a) - cerr << a->RefName << " " << a->RefLength << endl; - exit(1); - } - ++f; ++c; - } - } +void BamMultiReader::SetSortOrder(const SortOrder& order) { + d->SetSortOrder(order); } diff --git a/src/api/BamMultiReader.h b/src/api/BamMultiReader.h index d30a0d3..6de7373 100644 --- a/src/api/BamMultiReader.h +++ b/src/api/BamMultiReader.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 23 December 2010 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // *************************************************************************** @@ -20,8 +20,9 @@ namespace BamTools { -// index mapping reference/position pairings to bamreaders and their alignments -typedef std::multimap, std::pair > AlignmentIndex; +namespace Internal { + class BamMultiReaderPrivate; +} // namespace Internal class API_EXPORT BamMultiReader { @@ -33,58 +34,56 @@ class API_EXPORT BamMultiReader { // public interface public: - // positioning - int CurrentRefID; - int CurrentLeft; - - // region under analysis, specified using SetRegion - BamRegion Region; - // ---------------------- // BAM file operations // ---------------------- // close BAM files void Close(void); - // opens BAM files (and optional BAM index files, if provided) // @openIndexes - triggers index opening, useful for suppressing // error messages during merging of files in which we may not have // indexes. // @coreMode - setup our first alignments using GetNextAlignmentCore(); // also useful for merging - // @preferStandardIndex - look for standard BAM index ".bai" first. If false, - // will look for BamTools index ".bti". - bool Open(const std::vector& filenames, bool openIndexes = true, bool coreMode = false, bool preferStandardIndex = false); - + // @preferStandardIndex - look for standard BAM index ".bai" first. If false, + // will look for BamTools index ".bti". + bool Open(const std::vector& filenames, + bool openIndexes = true, + bool coreMode = false, + bool preferStandardIndex = false); // returns whether underlying BAM readers ALL have an index loaded // this is useful to indicate whether Jump() or SetRegion() are possible bool IsIndexLoaded(void) const; - // performs random-access jump to reference, position bool Jump(int refID, int position = 0); - + // list files associated with this multireader + void PrintFilenames(void) const; // sets the target region bool SetRegion(const BamRegion& region); - bool SetRegion(const int&, const int&, const int&, const int&); // convenience function to above - + bool SetRegion(const int& leftRefID, + const int& leftBound, + const int& rightRefID, + const int& rightBound); // returns file pointers to beginning of alignments bool Rewind(void); // ---------------------- // access alignment data // ---------------------- - // updates the reference id marker to match the lower limit of our readers - void UpdateReferenceID(void); // retrieves next available alignment (returns success/fail) from all files - bool GetNextAlignment(BamAlignment&); + bool GetNextAlignment(BamAlignment& alignment); // retrieves next available alignment (returns success/fail) from all files // and populates the support data with information about the alignment // *** BUT DOES NOT PARSE CHARACTER DATA FROM THE ALIGNMENT - bool GetNextAlignmentCore(BamAlignment&); + bool GetNextAlignmentCore(BamAlignment& alignment); // ... should this be private? bool HasOpenReaders(void); + // set sort order for merging BAM files (i.e. which alignment from the files is 'next'?) + // default behavior is to sort by position, use this method to handle BAMs sorted by read name + enum SortOrder { SortedByPosition = 0, SortedByReadName}; + void SetSortOrder(const SortOrder& order); // ---------------------- // access auxiliary data @@ -98,8 +97,6 @@ class API_EXPORT BamMultiReader { const BamTools::RefVector GetReferenceData(void) const; // returns reference id (used for BamMultiReader::Jump()) for the given reference name const int GetReferenceID(const std::string& refName) const; - // validates that we have a congruent set of BAM files that are aligned against the same reference sequences - void ValidateReaders() const; // ---------------------- // BAM index operations @@ -107,28 +104,12 @@ class API_EXPORT BamMultiReader { // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai") bool CreateIndexes(bool useStandardIndex = true); - // sets the index caching mode for the readers void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); - //const int GetReferenceID(const string& refName) const; - - // utility - void PrintFilenames(void); - void DumpAlignmentIndex(void); - void UpdateAlignments(void); // updates our alignment cache - // private implementation private: - - // the set of readers and alignments which we operate on, maintained throughout the life of this class - std::vector > readers; - - // readers and alignments sorted by reference id and position, to keep track of the lowest (next) alignment - // when a reader reaches EOF, its entry is removed from this index - AlignmentIndex alignments; - - std::vector fileNames; + Internal::BamMultiReaderPrivate* d; }; } // namespace BamTools diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index db82f4e..1fa27d3 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -19,6 +19,7 @@ add_library( BamTools SHARED BamReader.cpp BamWriter.cpp BGZF.cpp + internal/BamMultiReader_p.cpp internal/BamReader_p.cpp internal/BamStandardIndex_p.cpp internal/BamToolsIndex_p.cpp @@ -36,6 +37,7 @@ add_library( BamTools-static STATIC BamReader.cpp BamWriter.cpp BGZF.cpp + internal/BamMultiReader_p.cpp internal/BamReader_p.cpp internal/BamStandardIndex_p.cpp internal/BamToolsIndex_p.cpp diff --git a/src/api/internal/BamMultiMerger_p.h b/src/api/internal/BamMultiMerger_p.h new file mode 100644 index 0000000..5d1e058 --- /dev/null +++ b/src/api/internal/BamMultiMerger_p.h @@ -0,0 +1,158 @@ +// *************************************************************************** +// BamMultiMerger_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides merging functionality for BamMultiReader. At this point, supports +// sorting results by (refId, position) or by read name. +// *************************************************************************** + +#ifndef BAMMULTIMERGER_P_H +#define BAMMULTIMERGER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include +#include +#include +#include +#include + +namespace BamTools { +namespace Internal { + +typedef std::pair ReaderAlignment; + +// generic MultiMerger interface +class IBamMultiMerger { + + public: + IBamMultiMerger(void) { } + virtual ~IBamMultiMerger(void) { } + + public: + virtual void Add(const ReaderAlignment& value) =0; + virtual void Clear(void) =0; + virtual const ReaderAlignment& First(void) const =0; + virtual const int Size(void) const =0; + virtual ReaderAlignment TakeFirst(void) =0; +}; + +// IBamMultiMerger implementation - sorted on BamAlignment: (RefId, Position) +class PositionMultiMerger : public IBamMultiMerger { + + public: + PositionMultiMerger(void) : IBamMultiMerger() { } + ~PositionMultiMerger(void) { } + + public: + void Add(const ReaderAlignment& value); + void Clear(void); + const ReaderAlignment& First(void) const; + const int Size(void) const; + ReaderAlignment TakeFirst(void); + + private: + typedef std::pair KeyType; + typedef std::multimap IndexType; + typedef std::pair KeyValueType; + typedef IndexType::iterator IndexIterator; + typedef IndexType::const_iterator IndexConstIterator; + + IndexType m_data; +}; + +// IBamMultiMerger implementation - sorted on BamAlignment: Name +class ReadNameMultiMerger : public IBamMultiMerger { + + public: + ReadNameMultiMerger(void) : IBamMultiMerger() { } + ~ReadNameMultiMerger(void) { } + + public: + void Add(const ReaderAlignment& value); + void Clear(void); + const ReaderAlignment& First(void) const; + const int Size(void) const; + ReaderAlignment TakeFirst(void); + + private: + typedef std::string KeyType; + typedef std::multimap IndexType; + typedef std::pair KeyValueType; + typedef IndexType::iterator IndexIterator; + typedef IndexType::const_iterator IndexConstIterator; + + IndexType m_data; +}; + +// ------------------------------------------ +// PositionMultiMerger implementation + +inline void PositionMultiMerger::Add(const ReaderAlignment& value) { + const KeyType key = std::make_pair( value.second->RefID, value.second->Position ); + m_data.insert( KeyValueType(key, value) ); +} + +inline void PositionMultiMerger::Clear(void) { + m_data.clear(); +} + +inline const ReaderAlignment& PositionMultiMerger::First(void) const { + const KeyValueType& entry = (*m_data.begin()); + return entry.second; +} + +inline const int PositionMultiMerger::Size(void) const { + return m_data.size(); +} + +inline ReaderAlignment PositionMultiMerger::TakeFirst(void) { + IndexIterator first = m_data.begin(); + ReaderAlignment next = (*first).second; + m_data.erase(first); + return next; +} + +// ------------------------------------------ +// ReadNameMultiMerger implementation + +inline void ReadNameMultiMerger::Add(const ReaderAlignment& value) { + const KeyType key = value.second->Name; + m_data.insert( KeyValueType(key, value) ); +} + +inline void ReadNameMultiMerger::Clear(void) { + m_data.clear(); +} + +inline const ReaderAlignment& ReadNameMultiMerger::First(void) const { + const KeyValueType& entry = (*m_data.begin()); + return entry.second; +} + +inline const int ReadNameMultiMerger::Size(void) const { + return m_data.size(); +} + +inline ReaderAlignment ReadNameMultiMerger::TakeFirst(void) { + IndexIterator first = m_data.begin(); + ReaderAlignment next = (*first).second; + m_data.erase(first); + return next; +} + +} // namespace Internal +} // namespace BamTools + +#endif // BAMMULTIMERGER_P_H diff --git a/src/api/internal/BamMultiReader_p.cpp b/src/api/internal/BamMultiReader_p.cpp new file mode 100644 index 0000000..01cdf8f --- /dev/null +++ b/src/api/internal/BamMultiReader_p.cpp @@ -0,0 +1,565 @@ +// *************************************************************************** +// BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Functionality for simultaneously reading multiple BAM files +// ************************************************************************* + +#include +#include +#include +#include +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +#include +#include +using namespace std; + +// ctor +BamMultiReaderPrivate::BamMultiReaderPrivate(void) + : m_alignments(0) + , m_isCoreMode(false) + , m_isSortedByPosition(true) +{ } + +// dtor +BamMultiReaderPrivate::~BamMultiReaderPrivate(void) { + + // close all open BAM readers + Close(); + + // clean up alignment cache + delete m_alignments; + m_alignments = 0; +} + +// close the BAM files +void BamMultiReaderPrivate::Close(void) { + + // clear out alignment cache + m_alignments->Clear(); + + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + + // close reader + BamReader* reader = (*readerIter).first; + BamAlignment* alignment = (*readerIter).second; + if ( reader ) reader->Close(); + + // delete pointers + delete reader; + reader = 0; + delete alignment; + alignment = 0; + } + + // clear out readers + m_readers.clear(); + + // reset flags + m_isCoreMode = false; + m_isSortedByPosition = true; +} + +// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail +bool BamMultiReaderPrivate::CreateIndexes(bool useStandardIndex) { + + bool result = true; + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + result &= reader->CreateIndex(useStandardIndex); + } + return result; +} + +const string BamMultiReaderPrivate::ExtractReadGroup(const string& headerLine) const { + + string readGroup(""); + stringstream headerLineSs(headerLine); + string part; + + // parse @RG header line, looking for the ID: tag + while( getline(headerLineSs, part, '\t') ) { + stringstream partSs(part); + string subtag; + getline(partSs, subtag, ':'); + if ( subtag == "ID" ) { + getline(partSs, readGroup, ':'); + break; + } + } + return readGroup; +} + +// makes a virtual, unified header for all the bam files in the multireader +const string BamMultiReaderPrivate::GetHeaderText(void) const { + + // just spit single header out if only have one reader open + if ( m_readers.size() == 1 ) { + + + vector::const_iterator readerBegin = m_readers.begin(); + const ReaderAlignment& entry = (*readerBegin); + const BamReader* reader = entry.first; + if ( reader == 0 ) return ""; + return reader->GetHeaderText(); + } + + string mergedHeader = ""; + map readGroups; + + // foreach extraction entry (each BAM file) + vector::const_iterator readerBegin = m_readers.begin(); + vector::const_iterator readerIter = readerBegin; + vector::const_iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + + // get header from reader + const BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + string headerText = reader->GetHeaderText(); + if ( headerText.empty() ) continue; + + // store header text in lines + map currentFileReadGroups; + const vector lines = SplitHeaderText(headerText); + + // iterate over header lines + vector::const_iterator linesIter = lines.begin(); + vector::const_iterator linesEnd = lines.end(); + for ( ; linesIter != linesEnd; ++linesIter ) { + + // get next line from header, skip if empty + const string headerLine = (*linesIter); + if ( headerLine.empty() ) { continue; } + + // if first file, save HD & SQ entries + if ( readerIter == readerBegin ) { + if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { + mergedHeader.append(headerLine.c_str()); + mergedHeader.append(1, '\n'); + } + } + + // (for all files) append RG entries if they are unique + if ( headerLine.find("@RG") == 0 ) { + + // extract read group name from line + const string readGroup = ExtractReadGroup(headerLine); + + // make sure not to duplicate @RG entries + if ( readGroups.find(readGroup) == readGroups.end() ) { + mergedHeader.append(headerLine.c_str() ); + mergedHeader.append(1, '\n'); + readGroups[readGroup] = true; + currentFileReadGroups[readGroup] = true; + } else { + // warn iff we are reading one file and discover duplicated @RG tags in the header + // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags + if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) { + cerr << "WARNING: duplicate @RG tag " << readGroup + << " entry in header of " << reader->GetFilename() << endl; + } + } + } + } + } + + // return merged header text + return mergedHeader; +} + +// get next alignment among all files +bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) { + return LoadNextAlignment(al, false); +} + +// get next alignment among all files without parsing character data from alignments +bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) { + return LoadNextAlignment(al, true); +} + +// --------------------------------------------------------------------------------------- +// +// NB: The following GetReferenceX() functions assume that we have identical +// references for all BAM files. We enforce this by invoking the above +// validation function (ValidateReaders) to verify that our reference data +// is the same across all files on Open, so we will not encounter a situation +// in which there is a mismatch and we are still live. +// +// --------------------------------------------------------------------------------------- + +// returns the number of reference sequences +const int BamMultiReaderPrivate::GetReferenceCount(void) const { + const ReaderAlignment& firstReader = m_readers.front(); + const BamReader* reader = firstReader.first; + if ( reader ) return reader->GetReferenceCount(); + else return 0; +} + +// returns vector of reference objects +const RefVector BamMultiReaderPrivate::GetReferenceData(void) const { + const ReaderAlignment& firstReader = m_readers.front(); + const BamReader* reader = firstReader.first; + if ( reader ) return reader->GetReferenceData(); + else return RefVector(); +} + +// returns refID from reference name +const int BamMultiReaderPrivate::GetReferenceID(const string& refName) const { + const ReaderAlignment& firstReader = m_readers.front(); + const BamReader* reader = firstReader.first; + if ( reader ) return reader->GetReferenceID(refName); + else return -1; // ERROR case - how to report +} + +// --------------------------------------------------------------------------------------- + +// checks if any readers still have alignments +bool BamMultiReaderPrivate::HasOpenReaders(void) { + return ( m_alignments->Size() > 0 ); +} + +// returns whether underlying BAM readers ALL have an index loaded +// this is useful to indicate whether Jump() or SetRegion() are possible +bool BamMultiReaderPrivate::IsIndexLoaded(void) const { + bool ok = true; + vector::const_iterator readerIter = m_readers.begin(); + vector::const_iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + const BamReader* reader = (*readerIter).first; + if ( reader ) ok &= reader->IsIndexLoaded(); + } + return ok; +} + +// jumps to specified region(refID, leftBound) in BAM files, returns success/fail +bool BamMultiReaderPrivate::Jump(int refID, int position) { + + bool ok = true; + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + + ok &= reader->Jump(refID, position); + if ( !ok ) { + cerr << "ERROR: could not jump " << reader->GetFilename() + << " to " << refID << ":" << position << endl; + exit(1); + } + } + + if (ok) UpdateAlignments(); + return ok; +} + +bool BamMultiReaderPrivate::LoadNextAlignment(BamAlignment& al, bool coreMode) { + + // bail out if no more data to process + if ( !HasOpenReaders() ) return false; + + // "pop" next alignment and reader + ReaderAlignment nextReaderAlignment = m_alignments->TakeFirst(); + BamReader* reader = nextReaderAlignment.first; + BamAlignment* alignment = nextReaderAlignment.second; + + // save it by copy to our argument + al = BamAlignment(*alignment); + + // peek to next alignment & store in cache + m_isCoreMode = coreMode; + SaveNextAlignment(reader,alignment); + + // return success + return true; +} + +// opens BAM files +bool BamMultiReaderPrivate::Open(const vector& filenames, + bool openIndexes, + bool coreMode, + bool preferStandardIndex) +{ + // store core mode flag + m_isCoreMode = coreMode; + + // first clear out any prior alignment cache prior data + if ( m_alignments ) { + m_alignments->Clear(); + delete m_alignments; + m_alignments = 0; + } + + // create alignment cache based on sorting mode + if ( m_isSortedByPosition ) + m_alignments = new PositionMultiMerger; + else + m_alignments = new ReadNameMultiMerger; + + // iterate over filenames + vector::const_iterator filenameIter = filenames.begin(); + vector::const_iterator filenameEnd = filenames.end(); + for ( ; filenameIter != filenameEnd; ++filenameIter ) { + const string filename = (*filenameIter); + + bool openedOk = true; + BamReader* reader = new BamReader; + openedOk = reader->Open(filename, "", openIndexes, preferStandardIndex); + + // if file opened ok + if ( openedOk ) { + + // try to read first alignment + bool fileOk = true; + BamAlignment* alignment = new BamAlignment; + fileOk &= ( coreMode ? reader->GetNextAlignmentCore(*alignment) + : reader->GetNextAlignment(*alignment) ); + + if ( fileOk ) { + + m_readers.push_back( make_pair(reader, alignment) ); + m_alignments->Add( make_pair(reader, alignment) ); + + } else { + cerr << "WARNING: could not read first alignment in " + << filename << ", ignoring file" << endl; + + // if only file available & could not be read, return failure + if ( filenames.size() == 1 ) + return false; + } + } + + // TODO; any further error handling when openedOK is false ?? + else return false; + } + + // files opened ok, at least one alignment could be read, + // now need to check that all files use same reference data + ValidateReaders(); + return true; +} + +// print associated filenames to stdout +void BamMultiReaderPrivate::PrintFilenames(void) const { + + vector::const_iterator readerIter = m_readers.begin(); + vector::const_iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + const BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + cout << reader->GetFilename() << endl; + } +} + +// returns BAM file pointers to beginning of alignment data +bool BamMultiReaderPrivate::Rewind(void) { + + bool result = true; + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + result &= reader->Rewind(); + } + return result; +} + +void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) { + + // must be in core mode && sorting by position to call GNACore() + if ( m_isCoreMode && m_isSortedByPosition ) { + if ( reader->GetNextAlignmentCore(*alignment) ) + m_alignments->Add( make_pair(reader, alignment) ); + } + + // not in core mode and/or sorting by readname, must call GNA() + else { + if ( reader->GetNextAlignment(*alignment) ) + m_alignments->Add( make_pair(reader, alignment) ); + } +} + +// sets the index caching mode on the readers +void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { + + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + reader->SetIndexCacheMode(mode); + } +} + +bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) { + + // NB: While it may make sense to track readers in which we can + // successfully SetRegion, In practice a failure of SetRegion means "no + // alignments here." It makes sense to simply accept the failure, + // UpdateAlignments(), and continue. + + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + if ( !reader->SetRegion(region) ) { + cerr << "ERROR: could not jump " << reader->GetFilename() << " to " + << region.LeftRefID << ":" << region.LeftPosition << ".." + << region.RightRefID << ":" << region.RightPosition << endl; + } + } + + UpdateAlignments(); + return true; +} + +void BamMultiReaderPrivate::SetSortOrder(const BamMultiReader::SortOrder& order) { + + const BamMultiReader::SortOrder oldSortOrder = ( m_isSortedByPosition ? BamMultiReader::SortedByPosition + : BamMultiReader::SortedByReadName ) ; + // skip if no change needed + if ( oldSortOrder == order ) return; + + // create new alignment cache + IBamMultiMerger* newAlignmentCache(0); + if ( order == BamMultiReader::SortedByPosition ) { + newAlignmentCache = new PositionMultiMerger; + m_isSortedByPosition = true; + } + else { + newAlignmentCache = new ReadNameMultiMerger; + m_isSortedByPosition = false; + } + + // copy old cache contents to new cache + while ( m_alignments->Size() > 0 ) { + ReaderAlignment value = m_alignments->TakeFirst(); + newAlignmentCache->Add(value); + } + + // remove old cache structure & point to new cache + delete m_alignments; + m_alignments = newAlignmentCache; +} + +// updates our alignment cache +void BamMultiReaderPrivate::UpdateAlignments(void) { + + // clear the cache + m_alignments->Clear(); + + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + BamAlignment* alignment = (*readerIter).second; + if ( reader == 0 ) continue; + SaveNextAlignment(reader, alignment); + } +} + +// splits the entire header into a list of strings +const vector BamMultiReaderPrivate::SplitHeaderText(const string& headerText) const { + stringstream header(headerText); + vector lines; + string item; + while ( getline(header, item) ) + lines.push_back(item); + return lines; +} + +// ValidateReaders checks that all the readers point to BAM files representing +// alignments against the same set of reference sequences, and that the +// sequences are identically ordered. If these checks fail the operation of +// the multireader is undefined, so we force program exit. +void BamMultiReaderPrivate::ValidateReaders(void) const { + + // retrieve first reader data + const BamReader* firstReader = m_readers.front().first; + if ( firstReader == 0 ) return; // signal error? + const RefVector firstReaderRefData = firstReader->GetReferenceData(); + const int firstReaderRefCount = firstReader->GetReferenceCount(); + const int firstReaderRefSize = firstReaderRefData.size(); + + // iterate over all readers + vector::const_iterator readerIter = m_readers.begin(); + vector::const_iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + + // get current reader data + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; // error? + const RefVector currentReaderRefData = reader->GetReferenceData(); + const int currentReaderRefCount = reader->GetReferenceCount(); + const int currentReaderRefSize = currentReaderRefData.size(); + + // init container iterators + RefVector::const_iterator firstRefIter = firstReaderRefData.begin(); + RefVector::const_iterator firstRefEnd = firstReaderRefData.end(); + RefVector::const_iterator currentRefIter = currentReaderRefData.begin(); + + // compare reference counts from BamReader ( & container size, in case of BR error) + if ( (currentReaderRefCount != firstReaderRefCount) || + (firstReaderRefSize != currentReaderRefSize) ) + { + cerr << "ERROR: mismatched number of references in " << reader->GetFilename() + << " expected " << firstReaderRefCount + << " reference sequences but only found " << currentReaderRefCount << endl; + exit(1); + } + + // this will be ok; we just checked above that we have identically-sized sets of references + // here we simply check if they are all, in fact, equal in content + while ( firstRefIter != firstRefEnd ) { + + const RefData& firstRef = (*firstRefIter); + const RefData& currentRef = (*currentRefIter); + + // compare reference name & length + if ( (firstRef.RefName != currentRef.RefName) || + (firstRef.RefLength != currentRef.RefLength) ) + { + cerr << "ERROR: mismatched references found in " << reader->GetFilename() + << " expected: " << endl; + RefVector::const_iterator refIter = firstReaderRefData.begin(); + RefVector::const_iterator refEnd = firstReaderRefData.end(); + for ( ; refIter != refEnd; ++refIter ) { + const RefData& entry = (*refIter); + cerr << entry.RefName << " " << entry.RefLength << endl; + } + + cerr << "but found: " << endl; + refIter = currentReaderRefData.begin(); + refEnd = currentReaderRefData.end(); + for ( ; refIter != refEnd; ++refIter ) { + const RefData& entry = (*refIter); + cerr << entry.RefName << " " << entry.RefLength << endl; + } + + exit(1); + } + + // update iterators + ++firstRefIter; + ++currentRefIter; + } + } +} diff --git a/src/api/internal/BamMultiReader_p.h b/src/api/internal/BamMultiReader_p.h new file mode 100644 index 0000000..971acda --- /dev/null +++ b/src/api/internal/BamMultiReader_p.h @@ -0,0 +1,95 @@ +// *************************************************************************** +// BamMultiReader_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Functionality for simultaneously reading multiple BAM files +// ************************************************************************* + +#ifndef BAMMULTIREADER_P_H +#define BAMMULTIREADER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include +#include +#include + +namespace BamTools { +namespace Internal { + +class IBamMultiMerger; + +class BamMultiReaderPrivate { + + // constructor / destructor + public: + BamMultiReaderPrivate(void); + ~BamMultiReaderPrivate(void); + + // public interface + public: + + // file operations + void Close(void); + bool Open(const std::vector& filenames, + bool openIndexes = true, + bool coreMode = false, + bool preferStandardIndex = false); + bool IsIndexLoaded(void) const; + bool Jump(int refID, int position = 0); + void PrintFilenames(void) const; + bool SetRegion(const BamRegion& region); + bool Rewind(void); + + // access alignment data + bool GetNextAlignment(BamAlignment& al); + bool GetNextAlignmentCore(BamAlignment& al); + bool HasOpenReaders(void); + void SetSortOrder(const BamMultiReader::SortOrder& order); + + // access auxiliary data + const std::string GetHeaderText(void) const; + const int GetReferenceCount(void) const; + const BamTools::RefVector GetReferenceData(void) const; + const int GetReferenceID(const std::string& refName) const; + + // BAM index operations + bool CreateIndexes(bool useStandardIndex = true); + void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); + + // internal methods + private: + const std::string ExtractReadGroup(const std::string& headerLine) const; + bool LoadNextAlignment(BamAlignment& al, bool coreMode); + void SaveNextAlignment(BamTools::BamReader* reader, BamTools::BamAlignment* alignment); + const std::vector SplitHeaderText(const std::string& headerText) const; + // updates our alignment cache + void UpdateAlignments(void); + // validates that we have a congruent set of BAM files that are aligned against the same reference sequences + void ValidateReaders(void) const; + + // data members + public: + typedef std::pair ReaderAlignment; + std::vector m_readers; + + IBamMultiMerger* m_alignments; + bool m_isCoreMode; + bool m_isSortedByPosition; +}; + +} // namesapce Internal +} // namespace BamTools + +#endif // BAMMULTIREADER_P_H -- 2.39.2