From 54b237d50703e5c5c2a541fe425637446354f769 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 10 Jun 2010 13:33:28 -0400 Subject: [PATCH 1/1] change merger to use GetNextAlignmentCore This provides a modest performance boost to the merger. A small change to the BamAlignment copy constructor was required (to copy BamAlignmentSupportData). --- BamAux.h | 1 + BamMultiReader.cpp | 57 ++++++++++++++++++++++++++++++++++++++-------- BamMultiReader.h | 17 +++++++++++--- bamtools_merge.cpp | 16 ++++--------- 4 files changed, 67 insertions(+), 24 deletions(-) diff --git a/BamAux.h b/BamAux.h index 5ef34fe..6777dbc 100644 --- a/BamAux.h +++ b/BamAux.h @@ -272,6 +272,7 @@ BamAlignment::BamAlignment(const BamAlignment& other) , MateRefID(other.MateRefID) , MatePosition(other.MatePosition) , InsertSize(other.InsertSize) + , SupportData(other.SupportData) { } inline diff --git a/BamMultiReader.cpp b/BamMultiReader.cpp index 51372c3..f0a8ced 100644 --- a/BamMultiReader.cpp +++ b/BamMultiReader.cpp @@ -80,7 +80,7 @@ bool BamMultiReader::HasOpenReaders() { return alignments.size() > 0; } -// get next alignment among all files (from specified region, if given) +// get next alignment among all files bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { // bail out if we are at EOF in all files, means no more alignments to process @@ -92,24 +92,59 @@ bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { UpdateReferenceID(); // our lowest alignment and reader will be at the front of our alignment index - BamAlignment* lowestAlignment = alignments.begin()->second.second; - BamReader* lowestReader = alignments.begin()->second.first; + BamAlignment* alignment = alignments.begin()->second.second; + BamReader* reader = alignments.begin()->second.first; // now that we have the lowest alignment in the set, save it by copy to our argument - nextAlignment = BamAlignment(*lowestAlignment); + nextAlignment = BamAlignment(*alignment); // remove this alignment index entry from our alignment index alignments.erase(alignments.begin()); // and add another entry if we can get another alignment from the reader - if (lowestReader->GetNextAlignment(*lowestAlignment)) { - alignments.insert(make_pair(make_pair(lowestAlignment->RefID, lowestAlignment->Position), - make_pair(lowestReader, lowestAlignment))); + if (reader->GetNextAlignment(*alignment)) { + alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), + make_pair(reader, alignment))); } else { // do nothing //cerr << "reached end of file " << lowestReader->GetFilename() << endl; } return true; + +} + +// get next alignment among all files without parsing character data from alignments +bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { + + // bail out if we are at EOF in all files, means no more alignments to process + if (!HasOpenReaders()) + return false; + + // when all alignments have stepped into a new target sequence, update our + // current reference sequence id + UpdateReferenceID(); + + // our lowest alignment and reader will be at the front of our alignment index + BamAlignment* alignment = alignments.begin()->second.second; + BamReader* reader = alignments.begin()->second.first; + + // now that we have the lowest alignment in the set, save it by copy to our argument + nextAlignment = BamAlignment(*alignment); + //memcpy(&nextAlignment, alignment, sizeof(BamAlignment)); + + // remove this alignment index entry from our alignment index + alignments.erase(alignments.begin()); + + // and add another entry if we can get another alignment from the reader + if (reader->GetNextAlignmentCore(*alignment)) { + alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), + make_pair(reader, alignment))); + } else { // do nothing + //cerr << "reached end of file " << lowestReader->GetFilename() << endl; + } + + return true; + } // jumps to specified region(refID, leftBound) in BAM files, returns success/fail @@ -146,7 +181,7 @@ bool BamMultiReader::Jump(int refID, int position) { } // opens BAM files -void BamMultiReader::Open(const vector filenames, bool openIndexes) { +void BamMultiReader::Open(const vector filenames, bool openIndexes, bool coreMode) { // for filename in filenames fileNames = filenames; // save filenames in our multireader for (vector::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { @@ -158,7 +193,11 @@ void BamMultiReader::Open(const vector filenames, bool openIndexes) { reader->Open(filename); // for merging, jumping is disallowed } BamAlignment* alignment = new BamAlignment; - reader->GetNextAlignment(*alignment); + if (coreMode) { + reader->GetNextAlignmentCore(*alignment); + } else { + reader->GetNextAlignment(*alignment); + } readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), make_pair(reader, alignment))); diff --git a/BamMultiReader.h b/BamMultiReader.h index 6d5a805..3d9024c 100644 --- a/BamMultiReader.h +++ b/BamMultiReader.h @@ -49,11 +49,18 @@ class BamMultiReader { // close BAM files void Close(void); + + // opens BAM files (and optional BAM index files, if provided) + // @openIndexes - triggers index opening, useful for suppressing + // error messages during merging of files in which we may not have + // indexes. + // @coreMode - setup our first alignments using GetNextAlignmentCore(); + // also useful for merging + void Open(const vector filenames, bool openIndexes = true, bool coreMode = false); + // performs random-access jump to reference, position bool Jump(int refID, int position = 0); - // opens BAM files (and optional BAM index files, if provided) - //void Open(const vector filenames, const vector indexFilenames); - void Open(const vector filenames, bool openIndexes = true); + // returns file pointers to beginning of alignments bool Rewind(void); @@ -65,6 +72,10 @@ class BamMultiReader { // retrieves next available alignment (returns success/fail) from all files bool GetNextAlignment(BamAlignment&); + // retrieves next available alignment (returns success/fail) from all files + // and populates the support data with information about the alignment + // *** BUT DOES NOT PARSE CHARACTER DATA FROM THE ALIGNMENT + bool GetNextAlignmentCore(BamAlignment&); // ... should this be private? bool HasOpenReaders(void); diff --git a/bamtools_merge.cpp b/bamtools_merge.cpp index bfb5de7..dcea172 100644 --- a/bamtools_merge.cpp +++ b/bamtools_merge.cpp @@ -88,11 +88,8 @@ int MergeTool::Run(int argc, char* argv[]) { if ( !m_settings->HasInputBamFilename ) m_settings->InputFiles.push_back(Options::StandardIn()); // opens the BAM files without checking for indexes -// BamMultiReader reader; -// reader.Open(m_settings->InputFiles, false); - - BamReader reader; - reader.Open(m_settings->InputFiles.at(0)); + BamMultiReader reader; + reader.Open(m_settings->InputFiles, false, true); // retrieve header & reference dictionary info std::string mergedHeader = reader.GetHeaderText(); @@ -103,16 +100,11 @@ int MergeTool::Run(int argc, char* argv[]) { writer.Open(m_settings->OutputFilename, mergedHeader, references); // store alignments to output file -// BamAlignment bAlignment; -// while (reader.GetNextAlignment(bAlignment)) { -// writer.SaveAlignment(bAlignment); -// } - BamAlignment bAlignment; - while (reader.GetNextAlignment(bAlignment)) { + while (reader.GetNextAlignmentCore(bAlignment)) { writer.SaveAlignment(bAlignment); } - + // clean & exit reader.Close(); writer.Close(); -- 2.39.2