X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Fapi%2Finternal%2FBamMultiReader_p.cpp;h=076f1bba9ac7e5a9e70d79a8e488f87e8f4fe2c2;hb=9f1ce8c47aeadb6dc1320b52ee671c3341b97935;hp=01cdf8f9d2c38a8a5d40946c3897424bfc045e63;hpb=83255b02911cc3e267386e20bcb8747478b4c239;p=bamtools.git diff --git a/src/api/internal/BamMultiReader_p.cpp b/src/api/internal/BamMultiReader_p.cpp index 01cdf8f..076f1bb 100644 --- a/src/api/internal/BamMultiReader_p.cpp +++ b/src/api/internal/BamMultiReader_p.cpp @@ -1,17 +1,17 @@ // *************************************************************************** // BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College -// All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) +// Last modified: 10 October 2011 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // ************************************************************************* -#include -#include -#include -#include +#include "api/BamAlignment.h" +#include "api/BamMultiReader.h" +#include "api/SamConstants.h" +#include "api/algorithms/Sort.h" +#include "api/internal/BamMultiReader_p.h" using namespace BamTools; using namespace BamTools::Internal; @@ -24,385 +24,617 @@ using namespace std; // ctor BamMultiReaderPrivate::BamMultiReaderPrivate(void) - : m_alignments(0) - , m_isCoreMode(false) - , m_isSortedByPosition(true) + : m_alignmentCache(0) { } // dtor BamMultiReaderPrivate::~BamMultiReaderPrivate(void) { - - // close all open BAM readers Close(); - - // clean up alignment cache - delete m_alignments; - m_alignments = 0; } -// close the BAM files -void BamMultiReaderPrivate::Close(void) { +// close all BAM files +bool BamMultiReaderPrivate::Close(void) { - // clear out alignment cache - m_alignments->Clear(); + m_errorString.clear(); - // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { + if ( CloseFiles(Filenames()) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("error encountered while closing all files: \n\t") + currentError; + SetErrorString("BamMultiReader::Close", message); + return false; + } +} + +// close requested BAM file +bool BamMultiReaderPrivate::CloseFile(const string& filename) { - // close reader - BamReader* reader = (*readerIter).first; - BamAlignment* alignment = (*readerIter).second; - if ( reader ) reader->Close(); + m_errorString.clear(); - // delete pointers - delete reader; - reader = 0; - delete alignment; - alignment = 0; + vector filenames(1, filename); + if ( CloseFiles(filenames) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("error while closing file: ") + filename + "\n" + currentError; + SetErrorString("BamMultiReader::CloseFile", message); + return false; } +} - // clear out readers - m_readers.clear(); +// close requested BAM files +bool BamMultiReaderPrivate::CloseFiles(const vector& filenames) { - // reset flags - m_isCoreMode = false; - m_isSortedByPosition = true; -} + bool errorsEncountered = false; + m_errorString.clear(); -// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail -bool BamMultiReaderPrivate::CreateIndexes(bool useStandardIndex) { + // iterate over filenames + vector::const_iterator filesIter = filenames.begin(); + vector::const_iterator filesEnd = filenames.end(); + for ( ; filesIter != filesEnd; ++filesIter ) { + const string& filename = (*filesIter); + if ( filename.empty() ) continue; + + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // if reader matches requested filename + if ( reader->GetFilename() == filename ) { + + // remove reader's entry from alignment cache + m_alignmentCache->Remove(reader); + + // clean up reader & its alignment + if ( !reader->Close() ) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + delete reader; + reader = 0; - bool result = true; - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; - result &= reader->CreateIndex(useStandardIndex); + // delete reader's alignment entry + BamAlignment* alignment = item.Alignment; + delete alignment; + alignment = 0; + + // remove reader from reader list + m_readers.erase(readerIter); + + // on match, just go on to next filename + // (no need to keep looking and item iterator is invalid now anyway) + break; + } + } } - return result; + + // make sure alignment cache is cleaned up if all readers closed + if ( m_readers.empty() && m_alignmentCache ) { + m_alignmentCache->Clear(); + delete m_alignmentCache; + m_alignmentCache = 0; + } + + // return whether all readers closed OK + return !errorsEncountered; } -const string BamMultiReaderPrivate::ExtractReadGroup(const string& headerLine) const { +// creates index files for BAM files that don't have them +bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) { - string readGroup(""); - stringstream headerLineSs(headerLine); - string part; + bool errorsEncountered = false; + m_errorString.clear(); - // parse @RG header line, looking for the ID: tag - while( getline(headerLineSs, part, '\t') ) { - stringstream partSs(part); - string subtag; - getline(partSs, subtag, ':'); - if ( subtag == "ID" ) { - getline(partSs, readGroup, ':'); - break; + // iterate over readers + vector::iterator itemIter = m_readers.begin(); + vector::iterator itemEnd = m_readers.end(); + for ( ; itemIter != itemEnd; ++itemIter ) { + MergeItem& item = (*itemIter); + BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // if reader doesn't have an index, create one + if ( !reader->HasIndex() ) { + if ( !reader->CreateIndex(type) ) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } } } - return readGroup; + + // check for errors encountered before returning success/fail + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("error while creating index files: ") + "\n" + currentError; + SetErrorString("BamMultiReader::CreateIndexes", message); + return false; + } else + return true; } -// makes a virtual, unified header for all the bam files in the multireader -const string BamMultiReaderPrivate::GetHeaderText(void) const { +IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) const { + + // fetch SamHeader + SamHeader header = GetHeader(); - // just spit single header out if only have one reader open - if ( m_readers.size() == 1 ) { + // if BAM files are sorted by position + if ( header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ) + return new MultiMerger(); + // if BAM files are sorted by read name + if ( header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ) + return new MultiMerger(); + + // otherwise "unknown" or "unsorted", use unsorted merger and just read in + return new MultiMerger(); +} - vector::const_iterator readerBegin = m_readers.begin(); - const ReaderAlignment& entry = (*readerBegin); - const BamReader* reader = entry.first; - if ( reader == 0 ) return ""; - return reader->GetHeaderText(); +const vector BamMultiReaderPrivate::Filenames(void) const { + + // init filename container + vector filenames; + filenames.reserve( m_readers.size() ); + + // iterate over readers + vector::const_iterator itemIter = m_readers.begin(); + vector::const_iterator itemEnd = m_readers.end(); + for ( ; itemIter != itemEnd; ++itemIter ) { + const MergeItem& item = (*itemIter); + const BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // store filename if not empty + const string& filename = reader->GetFilename(); + if ( !filename.empty() ) + filenames.push_back(filename); } - string mergedHeader = ""; - map readGroups; + // return result + return filenames; +} - // foreach extraction entry (each BAM file) - vector::const_iterator readerBegin = m_readers.begin(); - vector::const_iterator readerIter = readerBegin; - vector::const_iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { +string BamMultiReaderPrivate::GetErrorString(void) const { + return m_errorString; +} + +SamHeader BamMultiReaderPrivate::GetHeader(void) const { + const string& text = GetHeaderText(); + return SamHeader(text); +} - // get header from reader - const BamReader* reader = (*readerIter).first; +// makes a virtual, unified header for all the bam files in the multireader +string BamMultiReaderPrivate::GetHeaderText(void) const { + + // N.B. - right now, simply copies all header data from first BAM, + // and then appends RG's from other BAM files + // TODO: make this more intelligent wrt other header lines/fields + + // if no readers open + const size_t numReaders = m_readers.size(); + if ( numReaders == 0 ) return string(); + + // retrieve first reader's header + const MergeItem& firstItem = m_readers.front(); + const BamReader* reader = firstItem.Reader; + if ( reader == 0 ) return string(); + SamHeader mergedHeader = reader->GetHeader(); + + // iterate over any remaining readers (skipping the first) + for ( size_t i = 1; i < numReaders; ++i ) { + const MergeItem& item = m_readers.at(i); + const BamReader* reader = item.Reader; if ( reader == 0 ) continue; - string headerText = reader->GetHeaderText(); - if ( headerText.empty() ) continue; - - // store header text in lines - map currentFileReadGroups; - const vector lines = SplitHeaderText(headerText); - - // iterate over header lines - vector::const_iterator linesIter = lines.begin(); - vector::const_iterator linesEnd = lines.end(); - for ( ; linesIter != linesEnd; ++linesIter ) { - - // get next line from header, skip if empty - const string headerLine = (*linesIter); - if ( headerLine.empty() ) { continue; } - - // if first file, save HD & SQ entries - if ( readerIter == readerBegin ) { - if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { - mergedHeader.append(headerLine.c_str()); - mergedHeader.append(1, '\n'); - } - } - // (for all files) append RG entries if they are unique - if ( headerLine.find("@RG") == 0 ) { - - // extract read group name from line - const string readGroup = ExtractReadGroup(headerLine); - - // make sure not to duplicate @RG entries - if ( readGroups.find(readGroup) == readGroups.end() ) { - mergedHeader.append(headerLine.c_str() ); - mergedHeader.append(1, '\n'); - readGroups[readGroup] = true; - currentFileReadGroups[readGroup] = true; - } else { - // warn iff we are reading one file and discover duplicated @RG tags in the header - // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags - if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) { - cerr << "WARNING: duplicate @RG tag " << readGroup - << " entry in header of " << reader->GetFilename() << endl; - } - } - } - } + // retrieve current reader's header + const SamHeader currentHeader = reader->GetHeader(); + + // append current reader's RG entries to merged header + // N.B. - SamReadGroupDictionary handles duplicate-checking + mergedHeader.ReadGroups.Add(currentHeader.ReadGroups); + + // TODO: merge anything else?? } - // return merged header text - return mergedHeader; + // return stringified header + return mergedHeader.ToString(); } // get next alignment among all files bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) { - return LoadNextAlignment(al, false); + return PopNextCachedAlignment(al, true); } // get next alignment among all files without parsing character data from alignments bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) { - return LoadNextAlignment(al, true); + return PopNextCachedAlignment(al, false); } // --------------------------------------------------------------------------------------- // // NB: The following GetReferenceX() functions assume that we have identical -// references for all BAM files. We enforce this by invoking the above -// validation function (ValidateReaders) to verify that our reference data -// is the same across all files on Open, so we will not encounter a situation -// in which there is a mismatch and we are still live. +// references for all BAM files. We enforce this by invoking the +// ValidateReaders() method to verify that our reference data is the same +// across all files on Open - so we will not encounter a situation in which +// there is a mismatch and we are still live. // // --------------------------------------------------------------------------------------- // returns the number of reference sequences -const int BamMultiReaderPrivate::GetReferenceCount(void) const { - const ReaderAlignment& firstReader = m_readers.front(); - const BamReader* reader = firstReader.first; - if ( reader ) return reader->GetReferenceCount(); - else return 0; +int BamMultiReaderPrivate::GetReferenceCount(void) const { + + // handle empty multireader + if ( m_readers.empty() ) return 0; + + // return reference count from first reader + const MergeItem& item = m_readers.front(); + const BamReader* reader = item.Reader; + if ( reader == 0 ) return 0; + else + return reader->GetReferenceCount(); } // returns vector of reference objects const RefVector BamMultiReaderPrivate::GetReferenceData(void) const { - const ReaderAlignment& firstReader = m_readers.front(); - const BamReader* reader = firstReader.first; - if ( reader ) return reader->GetReferenceData(); - else return RefVector(); + + // handle empty multireader + if ( m_readers.empty() ) return RefVector(); + + // return reference data from first BamReader + const MergeItem& item = m_readers.front(); + const BamReader* reader = item.Reader; + if ( reader == 0 ) return RefVector(); + else + return reader->GetReferenceData(); } // returns refID from reference name -const int BamMultiReaderPrivate::GetReferenceID(const string& refName) const { - const ReaderAlignment& firstReader = m_readers.front(); - const BamReader* reader = firstReader.first; - if ( reader ) return reader->GetReferenceID(refName); - else return -1; // ERROR case - how to report -} +int BamMultiReaderPrivate::GetReferenceID(const string& refName) const { -// --------------------------------------------------------------------------------------- + // handle empty multireader + if ( m_readers.empty() ) return -1; -// checks if any readers still have alignments -bool BamMultiReaderPrivate::HasOpenReaders(void) { - return ( m_alignments->Size() > 0 ); + // return reference ID from first BamReader + const MergeItem& item = m_readers.front(); + const BamReader* reader = item.Reader; + if ( reader == 0 ) return -1; + else + return reader->GetReferenceID(refName); } +// --------------------------------------------------------------------------------------- -// returns whether underlying BAM readers ALL have an index loaded +// returns true if all readers have index data available // this is useful to indicate whether Jump() or SetRegion() are possible -bool BamMultiReaderPrivate::IsIndexLoaded(void) const { - bool ok = true; - vector::const_iterator readerIter = m_readers.begin(); - vector::const_iterator readerEnd = m_readers.end(); +bool BamMultiReaderPrivate::HasIndexes(void) const { + + // handle empty multireader + if ( m_readers.empty() ) + return false; + + bool result = true; + + // iterate over readers + vector::const_iterator readerIter = m_readers.begin(); + vector::const_iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - const BamReader* reader = (*readerIter).first; - if ( reader ) ok &= reader->IsIndexLoaded(); + const MergeItem& item = (*readerIter); + const BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // see if current reader has index data + result &= reader->HasIndex(); } - return ok; + + return result; } -// jumps to specified region(refID, leftBound) in BAM files, returns success/fail -bool BamMultiReaderPrivate::Jump(int refID, int position) { +// returns true if multireader has open readers +bool BamMultiReaderPrivate::HasOpenReaders(void) { - bool ok = true; - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + // iterate over readers + vector::const_iterator readerIter = m_readers.begin(); + vector::const_iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; + const MergeItem& item = (*readerIter); + const BamReader* reader = item.Reader; if ( reader == 0 ) continue; - ok &= reader->Jump(refID, position); - if ( !ok ) { - cerr << "ERROR: could not jump " << reader->GetFilename() - << " to " << refID << ":" << position << endl; - exit(1); - } + // return true whenever an open reader is found + if ( reader->IsOpen() ) return true; } - if (ok) UpdateAlignments(); - return ok; + // no readers open + return false; } -bool BamMultiReaderPrivate::LoadNextAlignment(BamAlignment& al, bool coreMode) { +// performs random-access jump using (refID, position) as a left-bound +bool BamMultiReaderPrivate::Jump(int refID, int position) { - // bail out if no more data to process - if ( !HasOpenReaders() ) return false; + // NB: While it may make sense to track readers in which we can + // successfully Jump, in practice a failure of Jump means "no + // alignments here." It makes sense to simply accept the failure, + // UpdateAlignments(), and continue. + + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if ( reader == 0 ) continue; - // "pop" next alignment and reader - ReaderAlignment nextReaderAlignment = m_alignments->TakeFirst(); - BamReader* reader = nextReaderAlignment.first; - BamAlignment* alignment = nextReaderAlignment.second; + // jump in each BamReader to position of interest + reader->Jump(refID, position); + } - // save it by copy to our argument - al = BamAlignment(*alignment); + // returns status of cache update + return UpdateAlignmentCache(); +} - // peek to next alignment & store in cache - m_isCoreMode = coreMode; - SaveNextAlignment(reader,alignment); +// locate (& load) index files for BAM readers that don't already have one loaded +bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) { - // return success - return true; + bool errorsEncountered = false; + m_errorString.clear(); + + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // if reader has no index, try to locate one + if ( !reader->HasIndex() ) { + if ( !reader->LocateIndex(preferredType) ) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } + } + + // check for errors encountered before returning success/fail + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("error while locating index files: ") + "\n" + currentError; + SetErrorString("BamMultiReader::LocatingIndexes", message); + return false; + } else + return true; } // opens BAM files -bool BamMultiReaderPrivate::Open(const vector& filenames, - bool openIndexes, - bool coreMode, - bool preferStandardIndex) -{ - // store core mode flag - m_isCoreMode = coreMode; - - // first clear out any prior alignment cache prior data - if ( m_alignments ) { - m_alignments->Clear(); - delete m_alignments; - m_alignments = 0; - } +bool BamMultiReaderPrivate::Open(const vector& filenames) { - // create alignment cache based on sorting mode - if ( m_isSortedByPosition ) - m_alignments = new PositionMultiMerger; - else - m_alignments = new ReadNameMultiMerger; + m_errorString.clear(); + + // put all current readers back at beginning (refreshes alignment cache) + if ( !Rewind() ) { + const string currentError = m_errorString; + const string message = string("unable to rewind existing readers: \n\t") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } // iterate over filenames + bool errorsEncountered = false; vector::const_iterator filenameIter = filenames.begin(); vector::const_iterator filenameEnd = filenames.end(); for ( ; filenameIter != filenameEnd; ++filenameIter ) { - const string filename = (*filenameIter); + const string& filename = (*filenameIter); + if ( filename.empty() ) continue; - bool openedOk = true; + // attempt to open BamReader BamReader* reader = new BamReader; - openedOk = reader->Open(filename, "", openIndexes, preferStandardIndex); + const bool readerOpened = reader->Open(filename); + + // if opened OK, store it + if ( readerOpened ) + m_readers.push_back( MergeItem(reader, new BamAlignment) ); - // if file opened ok - if ( openedOk ) { + // otherwise store error & clean up invalid reader + else { + m_errorString.append(1, '\t'); + m_errorString += string("unable to open file: ") + filename; + m_errorString.append(1, '\n'); + errorsEncountered = true; - // try to read first alignment - bool fileOk = true; - BamAlignment* alignment = new BamAlignment; - fileOk &= ( coreMode ? reader->GetNextAlignmentCore(*alignment) - : reader->GetNextAlignment(*alignment) ); + delete reader; + reader = 0; + } + } - if ( fileOk ) { + // check for errors while opening + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("unable to open all files: \t\n") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } - m_readers.push_back( make_pair(reader, alignment) ); - m_alignments->Add( make_pair(reader, alignment) ); + // check for BAM file consistency + if ( !ValidateReaders() ) { + const string currentError = m_errorString; + const string message = string("unable to open inconsistent files: \t\n") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } - } else { - cerr << "WARNING: could not read first alignment in " - << filename << ", ignoring file" << endl; + // update alignment cache + return UpdateAlignmentCache(); +} + +bool BamMultiReaderPrivate::OpenFile(const std::string& filename) { + vector filenames(1, filename); + if ( Open(filenames) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("could not open file: ") + filename + "\n\t" + currentError; + SetErrorString("BamMultiReader::OpenFile", message); + return false; + } +} - // if only file available & could not be read, return failure - if ( filenames.size() == 1 ) - return false; +bool BamMultiReaderPrivate::OpenIndexes(const vector& indexFilenames) { + + // TODO: This needs to be cleaner - should not assume same order. + // And either way, shouldn't start at first reader. Should start at + // first reader without an index? + + // make sure same number of index filenames as readers + if ( m_readers.size() != indexFilenames.size() ) { + const string message("size of index file list does not match current BAM file count"); + SetErrorString("BamMultiReader::OpenIndexes", message); + return false; + } + + bool errorsEncountered = false; + m_errorString.clear(); + + // iterate over BamReaders + vector::const_iterator indexFilenameIter = indexFilenames.begin(); + vector::const_iterator indexFilenameEnd = indexFilenames.end(); + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + + // open index filename on reader + if ( reader ) { + const string& indexFilename = (*indexFilenameIter); + if ( !reader->OpenIndex(indexFilename) ) { + m_errorString.append(1, '\t'); + m_errorString += reader->GetErrorString(); + m_errorString.append(1, '\n'); + errorsEncountered = true; } } - // TODO; any further error handling when openedOK is false ?? - else return false; + // increment filename iterator, skip if no more index files to open + if ( ++indexFilenameIter == indexFilenameEnd ) + break; } - // files opened ok, at least one alignment could be read, - // now need to check that all files use same reference data - ValidateReaders(); + // return success/fail + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("could not open all index files: \n\t") + currentError; + SetErrorString("BamMultiReader::OpenIndexes", message); + return false; + } else + return true; +} + +bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) { + + // skip if no alignments available + if ( m_alignmentCache == 0 || m_alignmentCache->IsEmpty() ) + return false; + + // pop next merge item entry from cache + MergeItem item = m_alignmentCache->TakeFirst(); + BamReader* reader = item.Reader; + BamAlignment* alignment = item.Alignment; + if ( reader == 0 || alignment == 0 ) + return false; + + // set char data if requested + if ( needCharData ) { + alignment->BuildCharData(); + alignment->Filename = reader->GetFilename(); + } + + // store cached alignment into destination parameter (by copy) + al = *alignment; + + // load next alignment from reader & store in cache + SaveNextAlignment(reader, alignment); return true; } -// print associated filenames to stdout -void BamMultiReaderPrivate::PrintFilenames(void) const { +// returns BAM file pointers to beginning of alignment data & resets alignment cache +bool BamMultiReaderPrivate::Rewind(void) { - vector::const_iterator readerIter = m_readers.begin(); - vector::const_iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - const BamReader* reader = (*readerIter).first; - if ( reader == 0 ) continue; - cout << reader->GetFilename() << endl; + // attempt to rewind files + if ( !RewindReaders() ) { + const string currentError = m_errorString; + const string message = string("could not rewind readers: \n\t") + currentError; + SetErrorString("BamMultiReader::Rewind", message); + return false; } + + // return status of cache update + return UpdateAlignmentCache(); } // returns BAM file pointers to beginning of alignment data -bool BamMultiReaderPrivate::Rewind(void) { +bool BamMultiReaderPrivate::RewindReaders(void) { - bool result = true; - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + m_errorString.clear(); + bool errorsEncountered = false; + + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; if ( reader == 0 ) continue; - result &= reader->Rewind(); + + // attempt rewind on BamReader + if ( !reader->Rewind() ) { + m_errorString.append(1, '\t'); + m_errorString.append( reader->GetErrorString() ); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } } - return result; + + return !errorsEncountered; } void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) { - // must be in core mode && sorting by position to call GNACore() - if ( m_isCoreMode && m_isSortedByPosition ) { - if ( reader->GetNextAlignmentCore(*alignment) ) - m_alignments->Add( make_pair(reader, alignment) ); - } + // if can read alignment from reader, store in cache + // + // N.B. - lazy building of alignment's char data - populated only: + // automatically by alignment cache to maintain its sorting OR + // on demand from client call to future call to GetNextAlignment() - // not in core mode and/or sorting by readname, must call GNA() - else { - if ( reader->GetNextAlignment(*alignment) ) - m_alignments->Add( make_pair(reader, alignment) ); - } + if ( reader->GetNextAlignmentCore(*alignment) ) + m_alignmentCache->Add( MergeItem(reader, alignment) ); +} + +void BamMultiReaderPrivate::SetErrorString(const string& where, const string& what) const { + static const string SEPARATOR = ": "; + m_errorString = where + SEPARATOR + what; } // sets the index caching mode on the readers -void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { +void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode mode) { - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; if ( reader == 0 ) continue; + + // set reader's index cache mode reader->SetIndexCacheMode(mode); } } @@ -414,104 +646,107 @@ bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) { // alignments here." It makes sense to simply accept the failure, // UpdateAlignments(), and continue. - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + // iterate over alignments + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; if ( reader == 0 ) continue; - if ( !reader->SetRegion(region) ) { - cerr << "ERROR: could not jump " << reader->GetFilename() << " to " - << region.LeftRefID << ":" << region.LeftPosition << ".." - << region.RightRefID << ":" << region.RightPosition << endl; - } - } - - UpdateAlignments(); - return true; -} - -void BamMultiReaderPrivate::SetSortOrder(const BamMultiReader::SortOrder& order) { - - const BamMultiReader::SortOrder oldSortOrder = ( m_isSortedByPosition ? BamMultiReader::SortedByPosition - : BamMultiReader::SortedByReadName ) ; - // skip if no change needed - if ( oldSortOrder == order ) return; - - // create new alignment cache - IBamMultiMerger* newAlignmentCache(0); - if ( order == BamMultiReader::SortedByPosition ) { - newAlignmentCache = new PositionMultiMerger; - m_isSortedByPosition = true; - } - else { - newAlignmentCache = new ReadNameMultiMerger; - m_isSortedByPosition = false; - } - // copy old cache contents to new cache - while ( m_alignments->Size() > 0 ) { - ReaderAlignment value = m_alignments->TakeFirst(); - newAlignmentCache->Add(value); + // set region of interest + reader->SetRegion(region); } - // remove old cache structure & point to new cache - delete m_alignments; - m_alignments = newAlignmentCache; + // return status of cache update + return UpdateAlignmentCache(); } // updates our alignment cache -void BamMultiReaderPrivate::UpdateAlignments(void) { +bool BamMultiReaderPrivate::UpdateAlignmentCache(void) { + + // create alignment cache if not created yet + if ( m_alignmentCache == 0 ) { + m_alignmentCache = CreateAlignmentCache(); + if ( m_alignmentCache == 0 ) { + SetErrorString("BamMultiReader::UpdateAlignmentCache", "unable to create new alignment cache"); + return false; + } + } - // clear the cache - m_alignments->Clear(); + // clear any prior cache data + m_alignmentCache->Clear(); // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; - BamAlignment* alignment = (*readerIter).second; - if ( reader == 0 ) continue; + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + BamAlignment* alignment = item.Alignment; + if ( reader == 0 || alignment == 0 ) continue; + + // save next alignment from each reader in cache SaveNextAlignment(reader, alignment); } -} -// splits the entire header into a list of strings -const vector BamMultiReaderPrivate::SplitHeaderText(const string& headerText) const { - stringstream header(headerText); - vector lines; - string item; - while ( getline(header, item) ) - lines.push_back(item); - return lines; + // if we get here, ok + return true; } // ValidateReaders checks that all the readers point to BAM files representing // alignments against the same set of reference sequences, and that the // sequences are identically ordered. If these checks fail the operation of // the multireader is undefined, so we force program exit. -void BamMultiReaderPrivate::ValidateReaders(void) const { +bool BamMultiReaderPrivate::ValidateReaders(void) const { + + m_errorString.clear(); + + // skip if 0 or 1 readers opened + if ( m_readers.empty() || (m_readers.size() == 1) ) + return true; - // retrieve first reader data - const BamReader* firstReader = m_readers.front().first; - if ( firstReader == 0 ) return; // signal error? - const RefVector firstReaderRefData = firstReader->GetReferenceData(); + // retrieve first reader + const MergeItem& firstItem = m_readers.front(); + const BamReader* firstReader = firstItem.Reader; + if ( firstReader == 0 ) return false; + + // retrieve first reader's header data + const SamHeader& firstReaderHeader = firstReader->GetHeader(); + const string& firstReaderSortOrder = firstReaderHeader.SortOrder; + + // retrieve first reader's reference data + const RefVector& firstReaderRefData = firstReader->GetReferenceData(); const int firstReaderRefCount = firstReader->GetReferenceCount(); const int firstReaderRefSize = firstReaderRefData.size(); // iterate over all readers - vector::const_iterator readerIter = m_readers.begin(); - vector::const_iterator readerEnd = m_readers.end(); + vector::const_iterator readerIter = m_readers.begin(); + vector::const_iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { + const MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // get current reader's header data + const SamHeader& currentReaderHeader = reader->GetHeader(); + const string& currentReaderSortOrder = currentReaderHeader.SortOrder; + + // check compatible sort order + if ( currentReaderSortOrder != firstReaderSortOrder ) { + const string message = string("mismatched sort order in ") + reader->GetFilename() + + ", expected " + firstReaderSortOrder + + ", but found " + currentReaderSortOrder; + SetErrorString("BamMultiReader::ValidateReaders", message); + return false; + } - // get current reader data - BamReader* reader = (*readerIter).first; - if ( reader == 0 ) continue; // error? + // get current reader's reference data const RefVector currentReaderRefData = reader->GetReferenceData(); const int currentReaderRefCount = reader->GetReferenceCount(); const int currentReaderRefSize = currentReaderRefData.size(); - // init container iterators + // init reference data iterators RefVector::const_iterator firstRefIter = firstReaderRefData.begin(); RefVector::const_iterator firstRefEnd = firstReaderRefData.end(); RefVector::const_iterator currentRefIter = currentReaderRefData.begin(); @@ -520,16 +755,17 @@ void BamMultiReaderPrivate::ValidateReaders(void) const { if ( (currentReaderRefCount != firstReaderRefCount) || (firstReaderRefSize != currentReaderRefSize) ) { - cerr << "ERROR: mismatched number of references in " << reader->GetFilename() - << " expected " << firstReaderRefCount - << " reference sequences but only found " << currentReaderRefCount << endl; - exit(1); + stringstream s(""); + s << "mismatched reference count in " << reader->GetFilename() + << ", expected " << firstReaderRefCount + << ", but found " << currentReaderRefCount; + SetErrorString("BamMultiReader::ValidateReaders", s.str()); + return false; } // this will be ok; we just checked above that we have identically-sized sets of references // here we simply check if they are all, in fact, equal in content while ( firstRefIter != firstRefEnd ) { - const RefData& firstRef = (*firstRefIter); const RefData& currentRef = (*currentRefIter); @@ -537,24 +773,31 @@ void BamMultiReaderPrivate::ValidateReaders(void) const { if ( (firstRef.RefName != currentRef.RefName) || (firstRef.RefLength != currentRef.RefLength) ) { - cerr << "ERROR: mismatched references found in " << reader->GetFilename() - << " expected: " << endl; + stringstream s(""); + s << "mismatched references found in" << reader->GetFilename() + << "expected: " << endl; + + // print first reader's reference data RefVector::const_iterator refIter = firstReaderRefData.begin(); RefVector::const_iterator refEnd = firstReaderRefData.end(); for ( ; refIter != refEnd; ++refIter ) { const RefData& entry = (*refIter); - cerr << entry.RefName << " " << entry.RefLength << endl; + stringstream s(""); + s << entry.RefName << " " << endl; } - cerr << "but found: " << endl; + s << "but found: " << endl; + + // print current reader's reference data refIter = currentReaderRefData.begin(); refEnd = currentReaderRefData.end(); for ( ; refIter != refEnd; ++refIter ) { const RefData& entry = (*refIter); - cerr << entry.RefName << " " << entry.RefLength << endl; + s << entry.RefName << " " << entry.RefLength << endl; } - exit(1); + SetErrorString("BamMultiReader::ValidateReaders", s.str()); + return false; } // update iterators @@ -562,4 +805,7 @@ void BamMultiReaderPrivate::ValidateReaders(void) const { ++currentRefIter; } } + + // if we get here, everything checks out + return true; }