X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Fapi%2Finternal%2FBamMultiReader_p.cpp;h=f92258ddb856839d8c68721ae0af77990a8d101a;hb=2e049ed7f28881bce09653e60f5aea54bfd7afbf;hp=583085c7cd11782b68337f483113cd8ef702110a;hpb=8c80d760637f8df39262683cd2570f0589423d36;p=bamtools.git diff --git a/src/api/internal/BamMultiReader_p.cpp b/src/api/internal/BamMultiReader_p.cpp index 583085c..f92258d 100644 --- a/src/api/internal/BamMultiReader_p.cpp +++ b/src/api/internal/BamMultiReader_p.cpp @@ -1,16 +1,16 @@ // *************************************************************************** // BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College -// All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // ************************************************************************* #include #include -#include +#include +#include #include using namespace BamTools; using namespace BamTools::Internal; @@ -24,35 +24,50 @@ using namespace std; // ctor BamMultiReaderPrivate::BamMultiReaderPrivate(void) - : m_alignments(0) - , m_isCoreMode(false) - , m_sortOrder(BamMultiReader::SortedByPosition) + : m_alignmentCache(0) { } // dtor BamMultiReaderPrivate::~BamMultiReaderPrivate(void) { - - // close all open BAM readers Close(); - - // clean up alignment cache - delete m_alignments; - m_alignments = 0; } // close all BAM files -void BamMultiReaderPrivate::Close(void) { - CloseFiles( Filenames() ); +bool BamMultiReaderPrivate::Close(void) { + + m_errorString.clear(); + + if ( CloseFiles(Filenames()) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("error encountered while closing all files: \n\t") + currentError; + SetErrorString("BamMultiReader::Close", message); + return false; + } } // close requested BAM file -void BamMultiReaderPrivate::CloseFile(const string& filename) { +bool BamMultiReaderPrivate::CloseFile(const string& filename) { + + m_errorString.clear(); + vector filenames(1, filename); - CloseFiles(filenames); + if ( CloseFiles(filenames) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("error while closing file: ") + filename + "\n" + currentError; + SetErrorString("BamMultiReader::CloseFile", message); + return false; + } } // close requested BAM files -void BamMultiReaderPrivate::CloseFiles(const vector& filenames) { +bool BamMultiReaderPrivate::CloseFiles(const vector& filenames) { + + bool errorsEncountered = false; + m_errorString.clear(); // iterate over filenames vector::const_iterator filesIter = filenames.begin(); @@ -62,91 +77,105 @@ void BamMultiReaderPrivate::CloseFiles(const vector& filenames) { if ( filename.empty() ) continue; // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; if ( reader == 0 ) continue; // if reader matches requested filename if ( reader->GetFilename() == filename ) { - // remove reader/alignment from alignment cache - m_alignments->Remove(reader); + // remove reader's entry from alignment cache + m_alignmentCache->Remove(reader); - // close & delete reader - reader->Close(); + // clean up reader & its alignment + if ( !reader->Close() ) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } delete reader; reader = 0; // delete reader's alignment entry - BamAlignment* alignment = (*readerIter).second; + BamAlignment* alignment = item.Alignment; delete alignment; alignment = 0; - // remove reader from container + // remove reader from reader list m_readers.erase(readerIter); // on match, just go on to next filename - // (no need to keep looking and iterator is invalid now anyway) + // (no need to keep looking and item iterator is invalid now anyway) break; } } } - // make sure alignment cache is cleared if all readers are now closed - if ( m_readers.empty() && m_alignments != 0 ) - m_alignments->Clear(); + // make sure alignment cache is cleaned up if all readers closed + if ( m_readers.empty() && m_alignmentCache ) { + m_alignmentCache->Clear(); + delete m_alignmentCache; + m_alignmentCache = 0; + } + + // return whether all readers closed OK + return !errorsEncountered; } // creates index files for BAM files that don't have them bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) { - bool result = true; + bool errorsEncountered = false; + m_errorString.clear(); // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; + vector::iterator itemIter = m_readers.begin(); + vector::iterator itemEnd = m_readers.end(); + for ( ; itemIter != itemEnd; ++itemIter ) { + MergeItem& item = (*itemIter); + BamReader* reader = item.Reader; if ( reader == 0 ) continue; // if reader doesn't have an index, create one - if ( !reader->HasIndex() ) - result &= reader->CreateIndex(type); + if ( !reader->HasIndex() ) { + if ( !reader->CreateIndex(type) ) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } } - return result; + // check for errors encountered before returning success/fail + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("error while creating index files: ") + "\n" + currentError; + SetErrorString("BamMultiReader::CreateIndexes", message); + return false; + } else + return true; } -IBamMultiMerger* BamMultiReaderPrivate::CreateMergerForCurrentSortOrder(void) const { - switch ( m_sortOrder ) { - case ( BamMultiReader::SortedByPosition ) : return new PositionMultiMerger; - case ( BamMultiReader::SortedByReadName ) : return new ReadNameMultiMerger; - case ( BamMultiReader::Unsorted ) : return new UnsortedMultiMerger; - default : - cerr << "BamMultiReader ERROR: requested sort order is unknown" << endl; - return 0; - } -} +IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) const { -const string BamMultiReaderPrivate::ExtractReadGroup(const string& headerLine) const { + // fetch SamHeader + SamHeader header = GetHeader(); - string readGroup(""); - stringstream headerLineSs(headerLine); - string part; + // if BAM files are sorted by position + if ( header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ) + return new MultiMerger(); - // parse @RG header line, looking for the ID: tag - while( getline(headerLineSs, part, '\t') ) { - stringstream partSs(part); - string subtag; - getline(partSs, subtag, ':'); - if ( subtag == "ID" ) { - getline(partSs, readGroup, ':'); - break; - } - } - return readGroup; + // if BAM files are sorted by read name + if ( header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ) + return new MultiMerger(); + + // otherwise "unknown" or "unsorted", use unsorted merger and just read in + return new MultiMerger(); } const vector BamMultiReaderPrivate::Filenames(void) const { @@ -156,117 +185,77 @@ const vector BamMultiReaderPrivate::Filenames(void) const { filenames.reserve( m_readers.size() ); // iterate over readers - vector::const_iterator readerIter = m_readers.begin(); - vector::const_iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - const BamReader* reader = (*readerIter).first; + vector::const_iterator itemIter = m_readers.begin(); + vector::const_iterator itemEnd = m_readers.end(); + for ( ; itemIter != itemEnd; ++itemIter ) { + const MergeItem& item = (*itemIter); + const BamReader* reader = item.Reader; if ( reader == 0 ) continue; // store filename if not empty - const string filename = reader->GetFilename(); + const string& filename = reader->GetFilename(); if ( !filename.empty() ) - filenames.push_back( reader->GetFilename() ); + filenames.push_back(filename); } // return result return filenames; } +string BamMultiReaderPrivate::GetErrorString(void) const { + return m_errorString; +} + SamHeader BamMultiReaderPrivate::GetHeader(void) const { - string text = GetHeaderText(); + const string& text = GetHeaderText(); return SamHeader(text); } // makes a virtual, unified header for all the bam files in the multireader string BamMultiReaderPrivate::GetHeaderText(void) const { - // if only one reader is open - if ( m_readers.size() == 1 ) { + // N.B. - right now, simply copies all header data from first BAM, + // and then appends RG's from other BAM files + // TODO: make this more intelligent wrt other header lines/fields - // just return reader's header text - const ReaderAlignment& ra = m_readers.front(); - const BamReader* reader = ra.first; - if ( reader ) return reader->GetHeaderText(); + // if no readers open + const size_t numReaders = m_readers.size(); + if ( numReaders == 0 ) return string(); - // invalid reader - return string(); - } + // retrieve first reader's header + const MergeItem& firstItem = m_readers.front(); + const BamReader* reader = firstItem.Reader; + if ( reader == 0 ) return string(); + SamHeader mergedHeader = reader->GetHeader(); - string mergedHeader(""); - map readGroups; - - // foreach extraction entry (each BAM file) - vector::const_iterator readerBegin = m_readers.begin(); - vector::const_iterator readerIter = readerBegin; - vector::const_iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - const BamReader* reader = (*readerIter).first; + // iterate over any remaining readers (skipping the first) + for ( size_t i = 1; i < numReaders; ++i ) { + const MergeItem& item = m_readers.at(i); + const BamReader* reader = item.Reader; if ( reader == 0 ) continue; - // get header from reader - string headerText = reader->GetHeaderText(); - if ( headerText.empty() ) continue; - - // store header text in lines - map currentFileReadGroups; - const vector lines = SplitHeaderText(headerText); - - // iterate over header lines - vector::const_iterator linesIter = lines.begin(); - vector::const_iterator linesEnd = lines.end(); - for ( ; linesIter != linesEnd; ++linesIter ) { - - // get next line from header, skip if empty - const string headerLine = (*linesIter); - if ( headerLine.empty() ) continue; - - // if first file, save HD & SQ entries - // TODO: what if first file has empty header, should just check for empty 'mergedHeader' instead ? - if ( readerIter == readerBegin ) { - if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { - mergedHeader.append(headerLine.c_str()); - mergedHeader.append(1, '\n'); - } - } + // retrieve current reader's header + const SamHeader currentHeader = reader->GetHeader(); - // (for all files) append RG entries if they are unique - if ( headerLine.find("@RG") == 0 ) { - - // extract read group name from line - const string readGroup = ExtractReadGroup(headerLine); - - // make sure not to duplicate @RG entries - if ( readGroups.find(readGroup) == readGroups.end() ) { - mergedHeader.append(headerLine.c_str() ); - mergedHeader.append(1, '\n'); - readGroups[readGroup] = true; - currentFileReadGroups[readGroup] = true; - } else { - // warn iff we are reading one file and discover duplicated @RG tags in the header - // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags - if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) { - cerr << "BamMultiReader WARNING: duplicate @RG tag " << readGroup - << " entry in header of " << reader->GetFilename() << endl; - } - } - } - } + // append current reader's RG entries to merged header + // N.B. - SamReadGroupDictionary handles duplicate-checking + mergedHeader.ReadGroups.Add(currentHeader.ReadGroups); + + // TODO: merge anything else?? } - // return merged header text - return mergedHeader; + // return stringified header + return mergedHeader.ToString(); } // get next alignment among all files bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) { - m_isCoreMode = false; - return LoadNextAlignment(al); + return PopNextCachedAlignment(al, true); } // get next alignment among all files without parsing character data from alignments bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) { - m_isCoreMode = true; - return LoadNextAlignment(al); + return PopNextCachedAlignment(al, false); } // --------------------------------------------------------------------------------------- @@ -283,58 +272,45 @@ bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) { int BamMultiReaderPrivate::GetReferenceCount(void) const { // handle empty multireader - if ( m_readers.empty() ) - return 0; + if ( m_readers.empty() ) return 0; // return reference count from first reader - const ReaderAlignment& ra = m_readers.front(); - const BamReader* reader = ra.first; - if ( reader ) return reader->GetReferenceCount(); - - // invalid reader - return 0; + const MergeItem& item = m_readers.front(); + const BamReader* reader = item.Reader; + if ( reader == 0 ) return 0; + else + return reader->GetReferenceCount(); } // returns vector of reference objects const RefVector BamMultiReaderPrivate::GetReferenceData(void) const { // handle empty multireader - if ( m_readers.empty() ) - return RefVector(); + if ( m_readers.empty() ) return RefVector(); // return reference data from first BamReader - const ReaderAlignment& ra = m_readers.front(); - const BamReader* reader = ra.first; - if ( reader ) return reader->GetReferenceData(); - - // invalid reader - return RefVector(); + const MergeItem& item = m_readers.front(); + const BamReader* reader = item.Reader; + if ( reader == 0 ) return RefVector(); + else + return reader->GetReferenceData(); } // returns refID from reference name int BamMultiReaderPrivate::GetReferenceID(const string& refName) const { // handle empty multireader - if ( m_readers.empty() ) - return -1; + if ( m_readers.empty() ) return -1; // return reference ID from first BamReader - const ReaderAlignment& ra = m_readers.front(); - const BamReader* reader = ra.first; - if ( reader ) return reader->GetReferenceID(refName); - - // invalid reader - return -1; + const MergeItem& item = m_readers.front(); + const BamReader* reader = item.Reader; + if ( reader == 0 ) return -1; + else + return reader->GetReferenceID(refName); } // --------------------------------------------------------------------------------------- -// checks if any readers still have alignments -bool BamMultiReaderPrivate::HasAlignmentData(void) const { - if ( m_alignments == 0 ) - return false; - return !m_alignments->IsEmpty(); -} - // returns true if all readers have index data available // this is useful to indicate whether Jump() or SetRegion() are possible bool BamMultiReaderPrivate::HasIndexes(void) const { @@ -346,10 +322,11 @@ bool BamMultiReaderPrivate::HasIndexes(void) const { bool result = true; // iterate over readers - vector::const_iterator readerIter = m_readers.begin(); - vector::const_iterator readerEnd = m_readers.end(); + vector::const_iterator readerIter = m_readers.begin(); + vector::const_iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - const BamReader* reader = (*readerIter).first; + const MergeItem& item = (*readerIter); + const BamReader* reader = item.Reader; if ( reader == 0 ) continue; // see if current reader has index data @@ -363,10 +340,11 @@ bool BamMultiReaderPrivate::HasIndexes(void) const { bool BamMultiReaderPrivate::HasOpenReaders(void) { // iterate over readers - vector::const_iterator readerIter = m_readers.begin(); - vector::const_iterator readerEnd = m_readers.end(); + vector::const_iterator readerIter = m_readers.begin(); + vector::const_iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - const BamReader* reader = (*readerIter).first; + const MergeItem& item = (*readerIter); + const BamReader* reader = item.Reader; if ( reader == 0 ) continue; // return true whenever an open reader is found @@ -386,102 +364,127 @@ bool BamMultiReaderPrivate::Jump(int refID, int position) { // UpdateAlignments(), and continue. // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; if ( reader == 0 ) continue; - // attempt jump() on each - if ( !reader->Jump(refID, position) ) { - cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename() - << " to " << refID << ":" << position << endl; - } + // jump in each BamReader to position of interest + reader->Jump(refID, position); } - // update alignment cache & return success - UpdateAlignmentCache(); - return true; -} - -bool BamMultiReaderPrivate::LoadNextAlignment(BamAlignment& al) { - - // bail out if no more data to process - if ( !HasAlignmentData() ) - return false; - - // "pop" next alignment and reader - ReaderAlignment nextReaderAlignment = m_alignments->TakeFirst(); - BamReader* reader = nextReaderAlignment.first; - BamAlignment* alignment = nextReaderAlignment.second; - - // store cached alignment into destination parameter (by copy) - al = *alignment; - - // peek to next alignment & store in cache - SaveNextAlignment(reader, alignment); - - // return success - return true; + // returns status of cache update + return UpdateAlignmentCache(); } // locate (& load) index files for BAM readers that don't already have one loaded bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) { - bool result = true; + bool errorsEncountered = false; + m_errorString.clear(); // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; if ( reader == 0 ) continue; // if reader has no index, try to locate one - if ( !reader->HasIndex() ) - result &= reader->LocateIndex(preferredType); + if ( !reader->HasIndex() ) { + if ( !reader->LocateIndex(preferredType) ) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } } - return result; + // check for errors encountered before returning success/fail + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("error while locating index files: ") + "\n" + currentError; + SetErrorString("BamMultiReader::LocatingIndexes", message); + return false; + } else + return true; } // opens BAM files bool BamMultiReaderPrivate::Open(const vector& filenames) { - // create alignment cache if neccessary - if ( m_alignments == 0 ) { - m_alignments = CreateMergerForCurrentSortOrder(); - if ( m_alignments == 0 ) return false; + m_errorString.clear(); + + // put all current readers back at beginning (refreshes alignment cache) + if ( !Rewind() ) { + const string currentError = m_errorString; + const string message = string("unable to rewind existing readers: \n\t") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; } // iterate over filenames + bool errorsEncountered = false; vector::const_iterator filenameIter = filenames.begin(); vector::const_iterator filenameEnd = filenames.end(); for ( ; filenameIter != filenameEnd; ++filenameIter ) { const string& filename = (*filenameIter); if ( filename.empty() ) continue; - // attempt to open BamReader on filename - BamReader* reader = OpenReader(filename); - if ( reader == 0 ) continue; + // attempt to open BamReader + BamReader* reader = new BamReader; + const bool readerOpened = reader->Open(filename); + + // if opened OK, store it + if ( readerOpened ) + m_readers.push_back( MergeItem(reader, new BamAlignment) ); - // store reader with new alignment - m_readers.push_back( make_pair(reader, new BamAlignment) ); + // otherwise store error & clean up invalid reader + else { + m_errorString.append(1, '\t'); + m_errorString += string("unable to open file: ") + filename; + m_errorString.append(1, '\n'); + errorsEncountered = true; + + delete reader; + reader = 0; + } } - // validate & rewind any opened readers, also refreshes alignment cache - if ( !m_readers.empty() ) { - ValidateReaders(); - Rewind(); + // check for errors while opening + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("unable to open all files: \t\n") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; } - // return success - return true; + // check for BAM file consistency + if ( !ValidateReaders() ) { + const string currentError = m_errorString; + const string message = string("unable to open inconsistent files: \t\n") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } + + // update alignment cache + return UpdateAlignmentCache(); } bool BamMultiReaderPrivate::OpenFile(const std::string& filename) { vector filenames(1, filename); - return Open(filenames); + if ( Open(filenames) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("could not open file: ") + filename + "\n\t" + currentError; + SetErrorString("BamMultiReader::OpenFile", message); + return false; + } } bool BamMultiReaderPrivate::OpenIndexes(const vector& indexFilenames) { @@ -491,24 +494,33 @@ bool BamMultiReaderPrivate::OpenIndexes(const vector& indexFilenames) { // first reader without an index? // make sure same number of index filenames as readers - if ( m_readers.size() != indexFilenames.size() || !indexFilenames.empty() ) + if ( m_readers.size() != indexFilenames.size() ) { + const string message("size of index file list does not match current BAM file count"); + SetErrorString("BamMultiReader::OpenIndexes", message); return false; + } - // init result flag - bool result = true; + bool errorsEncountered = false; + m_errorString.clear(); // iterate over BamReaders vector::const_iterator indexFilenameIter = indexFilenames.begin(); vector::const_iterator indexFilenameEnd = indexFilenames.end(); - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; // open index filename on reader if ( reader ) { const string& indexFilename = (*indexFilenameIter); - result &= reader->OpenIndex(indexFilename); + if ( !reader->OpenIndex(indexFilename) ) { + m_errorString.append(1, '\t'); + m_errorString += reader->GetErrorString(); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } } // increment filename iterator, skip if no more index files to open @@ -516,113 +528,110 @@ bool BamMultiReaderPrivate::OpenIndexes(const vector& indexFilenames) { break; } - // TODO: validation ?? - // return success/fail - return result; + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("could not open all index files: \n\t") + currentError; + SetErrorString("BamMultiReader::OpenIndexes", message); + return false; + } else + return true; } -BamReader* BamMultiReaderPrivate::OpenReader(const std::string& filename) { - - // create new BamReader - BamReader* reader = new BamReader; +bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) { - // if reader opens OK - if ( reader->Open(filename) ) { - - // attempt to read first alignment (sanity check) - // if ok, then return BamReader pointer - BamAlignment al; - if ( reader->GetNextAlignmentCore(al) ) - return reader; + // skip if no alignments available + if ( m_alignmentCache == 0 || m_alignmentCache->IsEmpty() ) + return false; - // could not read alignment - else { - cerr << "BamMultiReader WARNING: Could not read first alignment from " - << filename << ", ignoring file" << endl; - } - } + // pop next merge item entry from cache + MergeItem item = m_alignmentCache->TakeFirst(); + BamReader* reader = item.Reader; + BamAlignment* alignment = item.Alignment; + if ( reader == 0 || alignment == 0 ) + return false; - // reader could not open - else { - cerr << "BamMultiReader WARNING: Could not open: " - << filename << ", ignoring file" << endl; + // set char data if requested + if ( needCharData ) { + alignment->BuildCharData(); + alignment->Filename = reader->GetFilename(); } - // if we get here, there was a problem with this BAM file (opening or reading) - // clean up memory allocation & return null pointer - delete reader; - return 0; -} + // store cached alignment into destination parameter (by copy) + al = *alignment; -// print associated filenames to stdout -void BamMultiReaderPrivate::PrintFilenames(void) const { - const vector& filenames = Filenames(); - vector::const_iterator filenameIter = filenames.begin(); - vector::const_iterator filenameEnd = filenames.end(); - for ( ; filenameIter != filenameEnd; ++filenameIter ) - cout << (*filenameIter) << endl; + // load next alignment from reader & store in cache + SaveNextAlignment(reader, alignment); + return true; } // returns BAM file pointers to beginning of alignment data & resets alignment cache bool BamMultiReaderPrivate::Rewind(void) { - // clear out alignment cache - m_alignments->Clear(); - // attempt to rewind files if ( !RewindReaders() ) { - cerr << "BamMultiReader ERROR: could not rewind file(s) successfully"; + const string currentError = m_errorString; + const string message = string("could not rewind readers: \n\t") + currentError; + SetErrorString("BamMultiReader::Rewind", message); return false; } - // reset cache & return success - UpdateAlignmentCache(); - return true; + // return status of cache update + return UpdateAlignmentCache(); } // returns BAM file pointers to beginning of alignment data bool BamMultiReaderPrivate::RewindReaders(void) { - bool result = true; + m_errorString.clear(); + bool errorsEncountered = false; // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; if ( reader == 0 ) continue; // attempt rewind on BamReader - result &= reader->Rewind(); + if ( !reader->Rewind() ) { + m_errorString.append(1, '\t'); + m_errorString.append( reader->GetErrorString() ); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } } - return result; + return !errorsEncountered; } void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) { - // must be in core mode && NOT sorting by read name to call GNACore() - if ( m_isCoreMode && m_sortOrder != BamMultiReader::SortedByReadName ) { - if ( reader->GetNextAlignmentCore(*alignment) ) - m_alignments->Add( make_pair(reader, alignment) ); - } + // if can read alignment from reader, store in cache + // + // N.B. - lazy building of alignment's char data - populated only: + // automatically by alignment cache to maintain its sorting OR + // on demand from client call to future call to GetNextAlignment() - // not in core mode and/or sorting by readname, must call GNA() - else { - if ( reader->GetNextAlignment(*alignment) ) - m_alignments->Add( make_pair(reader, alignment) ); - } + if ( reader->GetNextAlignmentCore(*alignment) ) + m_alignmentCache->Add( MergeItem(reader, alignment) ); +} + +void BamMultiReaderPrivate::SetErrorString(const string& where, const string& what) const { + static const string SEPARATOR = ": "; + m_errorString = where + SEPARATOR + what; } // sets the index caching mode on the readers void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode mode) { // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; if ( reader == 0 ) continue; // set reader's index cache mode @@ -638,112 +647,106 @@ bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) { // UpdateAlignments(), and continue. // iterate over alignments - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; if ( reader == 0 ) continue; - // attempt to set BamReader's region of interest - if ( !reader->SetRegion(region) ) { - cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename() << " to " - << region.LeftRefID << ":" << region.LeftPosition << ".." - << region.RightRefID << ":" << region.RightPosition << endl; - } - } - - // update alignment cache & return success - UpdateAlignmentCache(); - return true; -} - -void BamMultiReaderPrivate::SetSortOrder(const BamMultiReader::SortOrder& order) { - - // skip if no change needed - if ( m_sortOrder == order ) return; - - // set new sort order - m_sortOrder = order; - - // create new alignment cache based on sort order - IBamMultiMerger* newAlignmentCache = CreateMergerForCurrentSortOrder(); - if ( newAlignmentCache == 0 ) return; // print error? - - // copy old cache contents to new cache - while ( m_alignments->Size() > 0 ) { - ReaderAlignment value = m_alignments->TakeFirst(); // retrieves & 'pops' - newAlignmentCache->Add(value); + // set region of interest + reader->SetRegion(region); } - // remove old cache structure & point to new cache - delete m_alignments; - m_alignments = newAlignmentCache; -} - -// splits the entire header into a list of strings -const vector BamMultiReaderPrivate::SplitHeaderText(const string& headerText) const { - - stringstream header(headerText); - string item; - - vector lines; - while ( getline(header, item) ) - lines.push_back(item); - return lines; + // return status of cache update + return UpdateAlignmentCache(); } // updates our alignment cache -void BamMultiReaderPrivate::UpdateAlignmentCache(void) { - - // skip if invalid alignment cache - if ( m_alignments == 0 ) return; - - // clear the cache - m_alignments->Clear(); +bool BamMultiReaderPrivate::UpdateAlignmentCache(void) { + + // create alignment cache if not created yet + if ( m_alignmentCache == 0 ) { + m_alignmentCache = CreateAlignmentCache(); + if ( m_alignmentCache == 0 ) { + SetErrorString("BamMultiReader::UpdateAlignmentCache", "unable to create new alignment cache"); + return false; + } + } - // seed cache with fully-populated alignments - // further updates will fill with full/core-only as requested - m_isCoreMode = false; + // clear any prior cache data + m_alignmentCache->Clear(); // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - BamReader* reader = (*readerIter).first; - BamAlignment* alignment = (*readerIter).second; + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + BamAlignment* alignment = item.Alignment; if ( reader == 0 || alignment == 0 ) continue; // save next alignment from each reader in cache SaveNextAlignment(reader, alignment); } + + // if we get here, ok + return true; } // ValidateReaders checks that all the readers point to BAM files representing // alignments against the same set of reference sequences, and that the // sequences are identically ordered. If these checks fail the operation of // the multireader is undefined, so we force program exit. -void BamMultiReaderPrivate::ValidateReaders(void) const { +bool BamMultiReaderPrivate::ValidateReaders(void) const { + + m_errorString.clear(); - // retrieve first reader data - const BamReader* firstReader = m_readers.front().first; - if ( firstReader == 0 ) return; - const RefVector firstReaderRefData = firstReader->GetReferenceData(); + // skip if 0 or 1 readers opened + if ( m_readers.empty() || (m_readers.size() == 1) ) + return true; + + // retrieve first reader + const MergeItem& firstItem = m_readers.front(); + const BamReader* firstReader = firstItem.Reader; + if ( firstReader == 0 ) return false; + + // retrieve first reader's header data + const SamHeader& firstReaderHeader = firstReader->GetHeader(); + const string& firstReaderSortOrder = firstReaderHeader.SortOrder; + + // retrieve first reader's reference data + const RefVector& firstReaderRefData = firstReader->GetReferenceData(); const int firstReaderRefCount = firstReader->GetReferenceCount(); const int firstReaderRefSize = firstReaderRefData.size(); // iterate over all readers - vector::const_iterator readerIter = m_readers.begin(); - vector::const_iterator readerEnd = m_readers.end(); + vector::const_iterator readerIter = m_readers.begin(); + vector::const_iterator readerEnd = m_readers.end(); for ( ; readerIter != readerEnd; ++readerIter ) { - - // get current reader data - BamReader* reader = (*readerIter).first; + const MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; if ( reader == 0 ) continue; + + // get current reader's header data + const SamHeader& currentReaderHeader = reader->GetHeader(); + const string& currentReaderSortOrder = currentReaderHeader.SortOrder; + + // check compatible sort order + if ( currentReaderSortOrder != firstReaderSortOrder ) { + const string message = string("mismatched sort order in ") + reader->GetFilename() + + ", expected " + firstReaderSortOrder + + ", but found " + currentReaderSortOrder; + SetErrorString("BamMultiReader::ValidateReaders", message); + return false; + } + + // get current reader's reference data const RefVector currentReaderRefData = reader->GetReferenceData(); const int currentReaderRefCount = reader->GetReferenceCount(); const int currentReaderRefSize = currentReaderRefData.size(); - // init container iterators + // init reference data iterators RefVector::const_iterator firstRefIter = firstReaderRefData.begin(); RefVector::const_iterator firstRefEnd = firstReaderRefData.end(); RefVector::const_iterator currentRefIter = currentReaderRefData.begin(); @@ -752,10 +755,12 @@ void BamMultiReaderPrivate::ValidateReaders(void) const { if ( (currentReaderRefCount != firstReaderRefCount) || (firstReaderRefSize != currentReaderRefSize) ) { - cerr << "BamMultiReader ERROR: mismatched number of references in " << reader->GetFilename() - << " expected " << firstReaderRefCount - << " reference sequences but only found " << currentReaderRefCount << endl; - exit(1); + stringstream s(""); + s << "mismatched reference count in " << reader->GetFilename() + << ", expected " << firstReaderRefCount + << ", but found " << currentReaderRefCount; + SetErrorString("BamMultiReader::ValidateReaders", s.str()); + return false; } // this will be ok; we just checked above that we have identically-sized sets of references @@ -768,28 +773,31 @@ void BamMultiReaderPrivate::ValidateReaders(void) const { if ( (firstRef.RefName != currentRef.RefName) || (firstRef.RefLength != currentRef.RefLength) ) { - cerr << "BamMultiReader ERROR: mismatched references found in " << reader->GetFilename() - << " expected: " << endl; + stringstream s(""); + s << "mismatched references found in" << reader->GetFilename() + << "expected: " << endl; // print first reader's reference data RefVector::const_iterator refIter = firstReaderRefData.begin(); RefVector::const_iterator refEnd = firstReaderRefData.end(); for ( ; refIter != refEnd; ++refIter ) { const RefData& entry = (*refIter); - cerr << entry.RefName << " " << entry.RefLength << endl; + stringstream s(""); + s << entry.RefName << " " << endl; } - cerr << "but found: " << endl; + s << "but found: " << endl; // print current reader's reference data refIter = currentReaderRefData.begin(); refEnd = currentReaderRefData.end(); for ( ; refIter != refEnd; ++refIter ) { const RefData& entry = (*refIter); - cerr << entry.RefName << " " << entry.RefLength << endl; + s << entry.RefName << " " << entry.RefLength << endl; } - exit(1); + SetErrorString("BamMultiReader::ValidateReaders", s.str()); + return false; } // update iterators @@ -797,4 +805,7 @@ void BamMultiReaderPrivate::ValidateReaders(void) const { ++currentRefIter; } } + + // if we get here, everything checks out + return true; }