// ***************************************************************************
// BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison
// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 21 March 2011 (DB)
+// Last modified: 14 October 2011 (DB)
// ---------------------------------------------------------------------------
// Functionality for simultaneously reading multiple BAM files
// *************************************************************************
-#include <api/BamAlignment.h>
-#include <api/BamMultiReader.h>
-#include <api/internal/BamMultiMerger_p.h>
-#include <api/internal/BamMultiReader_p.h>
+#include "api/BamAlignment.h"
+#include "api/BamMultiReader.h"
+#include "api/SamConstants.h"
+#include "api/algorithms/Sort.h"
+#include "api/internal/BamMultiReader_p.h"
using namespace BamTools;
using namespace BamTools::Internal;
// ctor
BamMultiReaderPrivate::BamMultiReaderPrivate(void)
- : m_alignments(0)
- , m_isCoreMode(false)
- , m_sortOrder(BamMultiReader::SortedByPosition)
+ : m_alignmentCache(0)
{ }
// dtor
BamMultiReaderPrivate::~BamMultiReaderPrivate(void) {
-
- // close all open BAM readers
Close();
-
- // clean up alignment cache
- delete m_alignments;
- m_alignments = 0;
}
// close all BAM files
-void BamMultiReaderPrivate::Close(void) {
- CloseFiles( Filenames() );
+bool BamMultiReaderPrivate::Close(void) {
+
+ m_errorString.clear();
+
+ if ( CloseFiles(Filenames()) )
+ return true;
+ else {
+ const string currentError = m_errorString;
+ const string message = string("error encountered while closing all files: \n\t") + currentError;
+ SetErrorString("BamMultiReader::Close", message);
+ return false;
+ }
}
// close requested BAM file
-void BamMultiReaderPrivate::CloseFile(const string& filename) {
+bool BamMultiReaderPrivate::CloseFile(const string& filename) {
+
+ m_errorString.clear();
+
vector<string> filenames(1, filename);
- CloseFiles(filenames);
+ if ( CloseFiles(filenames) )
+ return true;
+ else {
+ const string currentError = m_errorString;
+ const string message = string("error while closing file: ") + filename + "\n" + currentError;
+ SetErrorString("BamMultiReader::CloseFile", message);
+ return false;
+ }
}
// close requested BAM files
-void BamMultiReaderPrivate::CloseFiles(const vector<string>& filenames) {
+bool BamMultiReaderPrivate::CloseFiles(const vector<string>& filenames) {
+
+ bool errorsEncountered = false;
+ m_errorString.clear();
// iterate over filenames
vector<string>::const_iterator filesIter = filenames.begin();
if ( filename.empty() ) continue;
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// if reader matches requested filename
if ( reader->GetFilename() == filename ) {
- // remove reader/alignment from alignment cache
- m_alignments->Remove(reader);
+ // remove reader's entry from alignment cache
+ m_alignmentCache->Remove(reader);
- // close & delete reader
- reader->Close();
+ // clean up reader & its alignment
+ if ( !reader->Close() ) {
+ m_errorString.append(1, '\t');
+ m_errorString.append(reader->GetErrorString());
+ m_errorString.append(1, '\n');
+ errorsEncountered = true;
+ }
delete reader;
reader = 0;
// delete reader's alignment entry
- BamAlignment* alignment = (*readerIter).second;
+ BamAlignment* alignment = item.Alignment;
delete alignment;
alignment = 0;
- // remove reader from container
+ // remove reader from reader list
m_readers.erase(readerIter);
// on match, just go on to next filename
- // (no need to keep looking and iterator is invalid now anyway)
+ // (no need to keep looking and item iterator is invalid now anyway)
break;
}
}
}
- // make sure alignment cache is cleared if all readers are now closed
- if ( m_readers.empty() && m_alignments != 0 )
- m_alignments->Clear();
+ // make sure alignment cache is cleaned up if all readers closed
+ if ( m_readers.empty() && m_alignmentCache ) {
+ m_alignmentCache->Clear();
+ delete m_alignmentCache;
+ m_alignmentCache = 0;
+ }
+
+ // return whether all readers closed OK
+ return !errorsEncountered;
}
// creates index files for BAM files that don't have them
bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) {
- bool result = true;
+ bool errorsEncountered = false;
+ m_errorString.clear();
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ vector<MergeItem>::iterator itemIter = m_readers.begin();
+ vector<MergeItem>::iterator itemEnd = m_readers.end();
+ for ( ; itemIter != itemEnd; ++itemIter ) {
+ MergeItem& item = (*itemIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// if reader doesn't have an index, create one
- if ( !reader->HasIndex() )
- result &= reader->CreateIndex(type);
+ if ( !reader->HasIndex() ) {
+ if ( !reader->CreateIndex(type) ) {
+ m_errorString.append(1, '\t');
+ m_errorString.append(reader->GetErrorString());
+ m_errorString.append(1, '\n');
+ errorsEncountered = true;
+ }
+ }
}
- return result;
+ // check for errors encountered before returning success/fail
+ if ( errorsEncountered ) {
+ const string currentError = m_errorString;
+ const string message = string("error while creating index files: ") + "\n" + currentError;
+ SetErrorString("BamMultiReader::CreateIndexes", message);
+ return false;
+ } else
+ return true;
}
-IBamMultiMerger* BamMultiReaderPrivate::CreateMergerForCurrentSortOrder(void) const {
- switch ( m_sortOrder ) {
- case ( BamMultiReader::SortedByPosition ) : return new PositionMultiMerger;
- case ( BamMultiReader::SortedByReadName ) : return new ReadNameMultiMerger;
- case ( BamMultiReader::Unsorted ) : return new UnsortedMultiMerger;
- default :
- cerr << "BamMultiReader ERROR: requested sort order is unknown" << endl;
- return 0;
- }
-}
+IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) const {
-const string BamMultiReaderPrivate::ExtractReadGroup(const string& headerLine) const {
+ // fetch SamHeader
+ SamHeader header = GetHeader();
- string readGroup("");
- stringstream headerLineSs(headerLine);
- string part;
+ // if BAM files are sorted by position
+ if ( header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE )
+ return new MultiMerger<Algorithms::Sort::ByPosition>();
- // parse @RG header line, looking for the ID: tag
- while( getline(headerLineSs, part, '\t') ) {
- stringstream partSs(part);
- string subtag;
- getline(partSs, subtag, ':');
- if ( subtag == "ID" ) {
- getline(partSs, readGroup, ':');
- break;
- }
- }
- return readGroup;
+ // if BAM files are sorted by read name
+ if ( header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME )
+ return new MultiMerger<Algorithms::Sort::ByName>();
+
+ // otherwise "unknown" or "unsorted", use unsorted merger and just read in
+ return new MultiMerger<Algorithms::Sort::Unsorted>();
}
const vector<string> BamMultiReaderPrivate::Filenames(void) const {
filenames.reserve( m_readers.size() );
// iterate over readers
- vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- const BamReader* reader = (*readerIter).first;
+ vector<MergeItem>::const_iterator itemIter = m_readers.begin();
+ vector<MergeItem>::const_iterator itemEnd = m_readers.end();
+ for ( ; itemIter != itemEnd; ++itemIter ) {
+ const MergeItem& item = (*itemIter);
+ const BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// store filename if not empty
- const string filename = reader->GetFilename();
+ const string& filename = reader->GetFilename();
if ( !filename.empty() )
- filenames.push_back( reader->GetFilename() );
+ filenames.push_back(filename);
}
// return result
return filenames;
}
+string BamMultiReaderPrivate::GetErrorString(void) const {
+ return m_errorString;
+}
+
SamHeader BamMultiReaderPrivate::GetHeader(void) const {
- string text = GetHeaderText();
+ const string& text = GetHeaderText();
return SamHeader(text);
}
// makes a virtual, unified header for all the bam files in the multireader
string BamMultiReaderPrivate::GetHeaderText(void) const {
- // if only one reader is open
- if ( m_readers.size() == 1 ) {
+ // N.B. - right now, simply copies all header data from first BAM,
+ // and then appends RG's from other BAM files
+ // TODO: make this more intelligent wrt other header lines/fields
- // just return reader's header text
- const ReaderAlignment& ra = m_readers.front();
- const BamReader* reader = ra.first;
- if ( reader ) return reader->GetHeaderText();
-
- // invalid reader
- return string();
- }
+ // if no readers open
+ const size_t numReaders = m_readers.size();
+ if ( numReaders == 0 ) return string();
- string mergedHeader("");
- map<string, bool> readGroups;
+ // retrieve first reader's header
+ const MergeItem& firstItem = m_readers.front();
+ const BamReader* reader = firstItem.Reader;
+ if ( reader == 0 ) return string();
+ SamHeader mergedHeader = reader->GetHeader();
- // foreach extraction entry (each BAM file)
- vector<ReaderAlignment>::const_iterator readerBegin = m_readers.begin();
- vector<ReaderAlignment>::const_iterator readerIter = readerBegin;
- vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- const BamReader* reader = (*readerIter).first;
+ // iterate over any remaining readers (skipping the first)
+ for ( size_t i = 1; i < numReaders; ++i ) {
+ const MergeItem& item = m_readers.at(i);
+ const BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
- // get header from reader
- string headerText = reader->GetHeaderText();
- if ( headerText.empty() ) continue;
-
- // store header text in lines
- map<string, bool> currentFileReadGroups;
- const vector<string> lines = SplitHeaderText(headerText);
-
- // iterate over header lines
- vector<string>::const_iterator linesIter = lines.begin();
- vector<string>::const_iterator linesEnd = lines.end();
- for ( ; linesIter != linesEnd; ++linesIter ) {
-
- // get next line from header, skip if empty
- const string headerLine = (*linesIter);
- if ( headerLine.empty() ) continue;
-
- // if first file, save HD & SQ entries
- // TODO: what if first file has empty header, should just check for empty 'mergedHeader' instead ?
- if ( readerIter == readerBegin ) {
- if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
- mergedHeader.append(headerLine.c_str());
- mergedHeader.append(1, '\n');
- }
- }
+ // retrieve current reader's header
+ const SamHeader currentHeader = reader->GetHeader();
- // (for all files) append RG entries if they are unique
- if ( headerLine.find("@RG") == 0 ) {
-
- // extract read group name from line
- const string readGroup = ExtractReadGroup(headerLine);
-
- // make sure not to duplicate @RG entries
- if ( readGroups.find(readGroup) == readGroups.end() ) {
- mergedHeader.append(headerLine.c_str() );
- mergedHeader.append(1, '\n');
- readGroups[readGroup] = true;
- currentFileReadGroups[readGroup] = true;
- } else {
- // warn iff we are reading one file and discover duplicated @RG tags in the header
- // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
- if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) {
- cerr << "BamMultiReader WARNING: duplicate @RG tag " << readGroup
- << " entry in header of " << reader->GetFilename() << endl;
- }
- }
- }
- }
+ // append current reader's RG entries to merged header
+ // N.B. - SamReadGroupDictionary handles duplicate-checking
+ mergedHeader.ReadGroups.Add(currentHeader.ReadGroups);
+
+ // TODO: merge anything else??
}
- // return merged header text
- return mergedHeader;
+ // return stringified header
+ return mergedHeader.ToString();
}
// get next alignment among all files
bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) {
- m_isCoreMode = false;
- return LoadNextAlignment(al);
+ return PopNextCachedAlignment(al, true);
}
// get next alignment among all files without parsing character data from alignments
bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) {
- m_isCoreMode = true;
- return LoadNextAlignment(al);
+ return PopNextCachedAlignment(al, false);
}
// ---------------------------------------------------------------------------------------
int BamMultiReaderPrivate::GetReferenceCount(void) const {
// handle empty multireader
- if ( m_readers.empty() )
- return 0;
+ if ( m_readers.empty() ) return 0;
// return reference count from first reader
- const ReaderAlignment& ra = m_readers.front();
- const BamReader* reader = ra.first;
- if ( reader ) return reader->GetReferenceCount();
-
- // invalid reader
- return 0;
+ const MergeItem& item = m_readers.front();
+ const BamReader* reader = item.Reader;
+ if ( reader == 0 ) return 0;
+ else
+ return reader->GetReferenceCount();
}
// returns vector of reference objects
const RefVector BamMultiReaderPrivate::GetReferenceData(void) const {
// handle empty multireader
- if ( m_readers.empty() )
- return RefVector();
+ if ( m_readers.empty() ) return RefVector();
// return reference data from first BamReader
- const ReaderAlignment& ra = m_readers.front();
- const BamReader* reader = ra.first;
- if ( reader ) return reader->GetReferenceData();
-
- // invalid reader
- return RefVector();
+ const MergeItem& item = m_readers.front();
+ const BamReader* reader = item.Reader;
+ if ( reader == 0 ) return RefVector();
+ else
+ return reader->GetReferenceData();
}
// returns refID from reference name
int BamMultiReaderPrivate::GetReferenceID(const string& refName) const {
// handle empty multireader
- if ( m_readers.empty() )
- return -1;
+ if ( m_readers.empty() ) return -1;
// return reference ID from first BamReader
- const ReaderAlignment& ra = m_readers.front();
- const BamReader* reader = ra.first;
- if ( reader ) return reader->GetReferenceID(refName);
-
- // invalid reader
- return -1;
+ const MergeItem& item = m_readers.front();
+ const BamReader* reader = item.Reader;
+ if ( reader == 0 ) return -1;
+ else
+ return reader->GetReferenceID(refName);
}
// ---------------------------------------------------------------------------------------
-// checks if any readers still have alignments
-bool BamMultiReaderPrivate::HasAlignmentData(void) const {
- if ( m_alignments == 0 )
- return false;
- return !m_alignments->IsEmpty();
-}
-
// returns true if all readers have index data available
// this is useful to indicate whether Jump() or SetRegion() are possible
bool BamMultiReaderPrivate::HasIndexes(void) const {
bool result = true;
// iterate over readers
- vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
+ vector<MergeItem>::const_iterator readerIter = m_readers.begin();
+ vector<MergeItem>::const_iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- const BamReader* reader = (*readerIter).first;
+ const MergeItem& item = (*readerIter);
+ const BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// see if current reader has index data
bool BamMultiReaderPrivate::HasOpenReaders(void) {
// iterate over readers
- vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
+ vector<MergeItem>::const_iterator readerIter = m_readers.begin();
+ vector<MergeItem>::const_iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- const BamReader* reader = (*readerIter).first;
+ const MergeItem& item = (*readerIter);
+ const BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// return true whenever an open reader is found
// UpdateAlignments(), and continue.
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
- // attempt jump() on each
- if ( !reader->Jump(refID, position) ) {
- cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename()
- << " to " << refID << ":" << position << endl;
- }
+ // jump in each BamReader to position of interest
+ reader->Jump(refID, position);
}
- // update alignment cache & return success
- UpdateAlignmentCache();
- return true;
-}
-
-bool BamMultiReaderPrivate::LoadNextAlignment(BamAlignment& al) {
-
- // bail out if no more data to process
- if ( !HasAlignmentData() )
- return false;
-
- // "pop" next alignment and reader
- ReaderAlignment nextReaderAlignment = m_alignments->TakeFirst();
- BamReader* reader = nextReaderAlignment.first;
- BamAlignment* alignment = nextReaderAlignment.second;
-
- // store cached alignment into destination parameter (by copy)
- al = *alignment;
-
- // peek to next alignment & store in cache
- SaveNextAlignment(reader, alignment);
-
- // return success
- return true;
+ // returns status of cache update
+ return UpdateAlignmentCache();
}
// locate (& load) index files for BAM readers that don't already have one loaded
bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) {
- bool result = true;
+ bool errorsEncountered = false;
+ m_errorString.clear();
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// if reader has no index, try to locate one
- if ( !reader->HasIndex() )
- result &= reader->LocateIndex(preferredType);
+ if ( !reader->HasIndex() ) {
+ if ( !reader->LocateIndex(preferredType) ) {
+ m_errorString.append(1, '\t');
+ m_errorString.append(reader->GetErrorString());
+ m_errorString.append(1, '\n');
+ errorsEncountered = true;
+ }
+ }
}
- return result;
+ // check for errors encountered before returning success/fail
+ if ( errorsEncountered ) {
+ const string currentError = m_errorString;
+ const string message = string("error while locating index files: ") + "\n" + currentError;
+ SetErrorString("BamMultiReader::LocatingIndexes", message);
+ return false;
+ } else
+ return true;
}
// opens BAM files
bool BamMultiReaderPrivate::Open(const vector<string>& filenames) {
- // create alignment cache if neccessary
- if ( m_alignments == 0 ) {
- m_alignments = CreateMergerForCurrentSortOrder();
- if ( m_alignments == 0 ) return false;
+ m_errorString.clear();
+
+ // put all current readers back at beginning (refreshes alignment cache)
+ if ( !Rewind() ) {
+ const string currentError = m_errorString;
+ const string message = string("unable to rewind existing readers: \n\t") + currentError;
+ SetErrorString("BamMultiReader::Open", message);
+ return false;
}
// iterate over filenames
+ bool errorsEncountered = false;
vector<string>::const_iterator filenameIter = filenames.begin();
vector<string>::const_iterator filenameEnd = filenames.end();
for ( ; filenameIter != filenameEnd; ++filenameIter ) {
const string& filename = (*filenameIter);
if ( filename.empty() ) continue;
- // attempt to open BamReader on filename
- BamReader* reader = OpenReader(filename);
- if ( reader == 0 ) continue;
+ // attempt to open BamReader
+ BamReader* reader = new BamReader;
+ const bool readerOpened = reader->Open(filename);
+
+ // if opened OK, store it
+ if ( readerOpened )
+ m_readers.push_back( MergeItem(reader, new BamAlignment) );
+
+ // otherwise store error & clean up invalid reader
+ else {
+ m_errorString.append(1, '\t');
+ m_errorString += string("unable to open file: ") + filename;
+ m_errorString.append(1, '\n');
+ errorsEncountered = true;
- // store reader with new alignment
- m_readers.push_back( make_pair(reader, new BamAlignment) );
+ delete reader;
+ reader = 0;
+ }
}
- // validate & rewind any opened readers, also refreshes alignment cache
- if ( !m_readers.empty() ) {
- ValidateReaders();
- Rewind();
+ // check for errors while opening
+ if ( errorsEncountered ) {
+ const string currentError = m_errorString;
+ const string message = string("unable to open all files: \t\n") + currentError;
+ SetErrorString("BamMultiReader::Open", message);
+ return false;
}
- // return success
- return true;
+ // check for BAM file consistency
+ if ( !ValidateReaders() ) {
+ const string currentError = m_errorString;
+ const string message = string("unable to open inconsistent files: \t\n") + currentError;
+ SetErrorString("BamMultiReader::Open", message);
+ return false;
+ }
+
+ // update alignment cache
+ return UpdateAlignmentCache();
}
bool BamMultiReaderPrivate::OpenFile(const std::string& filename) {
vector<string> filenames(1, filename);
- return Open(filenames);
+ if ( Open(filenames) )
+ return true;
+ else {
+ const string currentError = m_errorString;
+ const string message = string("could not open file: ") + filename + "\n\t" + currentError;
+ SetErrorString("BamMultiReader::OpenFile", message);
+ return false;
+ }
}
bool BamMultiReaderPrivate::OpenIndexes(const vector<string>& indexFilenames) {
// first reader without an index?
// make sure same number of index filenames as readers
- if ( m_readers.size() != indexFilenames.size() || !indexFilenames.empty() )
+ if ( m_readers.size() != indexFilenames.size() ) {
+ const string message("size of index file list does not match current BAM file count");
+ SetErrorString("BamMultiReader::OpenIndexes", message);
return false;
+ }
- // init result flag
- bool result = true;
+ bool errorsEncountered = false;
+ m_errorString.clear();
// iterate over BamReaders
vector<string>::const_iterator indexFilenameIter = indexFilenames.begin();
vector<string>::const_iterator indexFilenameEnd = indexFilenames.end();
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
// open index filename on reader
if ( reader ) {
const string& indexFilename = (*indexFilenameIter);
- result &= reader->OpenIndex(indexFilename);
+ if ( !reader->OpenIndex(indexFilename) ) {
+ m_errorString.append(1, '\t');
+ m_errorString += reader->GetErrorString();
+ m_errorString.append(1, '\n');
+ errorsEncountered = true;
+ }
}
// increment filename iterator, skip if no more index files to open
break;
}
- // TODO: validation ??
-
// return success/fail
- return result;
+ if ( errorsEncountered ) {
+ const string currentError = m_errorString;
+ const string message = string("could not open all index files: \n\t") + currentError;
+ SetErrorString("BamMultiReader::OpenIndexes", message);
+ return false;
+ } else
+ return true;
}
-BamReader* BamMultiReaderPrivate::OpenReader(const std::string& filename) {
+bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) {
- // create new BamReader
- BamReader* reader = new BamReader;
-
- // if reader opens OK
- if ( reader->Open(filename) ) {
-
- // attempt to read first alignment (sanity check)
- // if ok, then return BamReader pointer
- BamAlignment al;
- if ( reader->GetNextAlignmentCore(al) )
- return reader;
+ // skip if no alignments available
+ if ( m_alignmentCache == 0 || m_alignmentCache->IsEmpty() )
+ return false;
- // could not read alignment
- else {
- cerr << "BamMultiReader WARNING: Could not read first alignment from "
- << filename << ", ignoring file" << endl;
- }
- }
+ // pop next merge item entry from cache
+ MergeItem item = m_alignmentCache->TakeFirst();
+ BamReader* reader = item.Reader;
+ BamAlignment* alignment = item.Alignment;
+ if ( reader == 0 || alignment == 0 )
+ return false;
- // reader could not open
- else {
- cerr << "BamMultiReader WARNING: Could not open: "
- << filename << ", ignoring file" << endl;
+ // set char data if requested
+ if ( needCharData ) {
+ alignment->BuildCharData();
+ alignment->Filename = reader->GetFilename();
}
- // if we get here, there was a problem with this BAM file (opening or reading)
- // clean up memory allocation & return null pointer
- delete reader;
- return 0;
-}
+ // store cached alignment into destination parameter (by copy)
+ al = *alignment;
-// print associated filenames to stdout
-void BamMultiReaderPrivate::PrintFilenames(void) const {
- const vector<string>& filenames = Filenames();
- vector<string>::const_iterator filenameIter = filenames.begin();
- vector<string>::const_iterator filenameEnd = filenames.end();
- for ( ; filenameIter != filenameEnd; ++filenameIter )
- cout << (*filenameIter) << endl;
+ // load next alignment from reader & store in cache
+ SaveNextAlignment(reader, alignment);
+ return true;
}
// returns BAM file pointers to beginning of alignment data & resets alignment cache
bool BamMultiReaderPrivate::Rewind(void) {
- // clear out alignment cache
- m_alignments->Clear();
+ // skip if no readers open
+ if ( m_readers.empty() )
+ return true;
// attempt to rewind files
if ( !RewindReaders() ) {
- cerr << "BamMultiReader ERROR: could not rewind file(s) successfully";
+ const string currentError = m_errorString;
+ const string message = string("could not rewind readers: \n\t") + currentError;
+ SetErrorString("BamMultiReader::Rewind", message);
return false;
}
- // reset cache & return success
- UpdateAlignmentCache();
- return true;
+ // return status of cache update
+ return UpdateAlignmentCache();
}
// returns BAM file pointers to beginning of alignment data
bool BamMultiReaderPrivate::RewindReaders(void) {
- bool result = true;
+ m_errorString.clear();
+ bool errorsEncountered = false;
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
// attempt rewind on BamReader
- result &= reader->Rewind();
+ if ( !reader->Rewind() ) {
+ m_errorString.append(1, '\t');
+ m_errorString.append( reader->GetErrorString() );
+ m_errorString.append(1, '\n');
+ errorsEncountered = true;
+ }
}
- return result;
+ return !errorsEncountered;
}
void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) {
- // must be in core mode && NOT sorting by read name to call GNACore()
- if ( m_isCoreMode && m_sortOrder != BamMultiReader::SortedByReadName ) {
- if ( reader->GetNextAlignmentCore(*alignment) )
- m_alignments->Add( make_pair(reader, alignment) );
- }
+ // if can read alignment from reader, store in cache
+ //
+ // N.B. - lazy building of alignment's char data - populated only:
+ // automatically by alignment cache to maintain its sorting OR
+ // on demand from client call to future call to GetNextAlignment()
- // not in core mode and/or sorting by readname, must call GNA()
- else {
- if ( reader->GetNextAlignment(*alignment) )
- m_alignments->Add( make_pair(reader, alignment) );
- }
+ if ( reader->GetNextAlignmentCore(*alignment) )
+ m_alignmentCache->Add( MergeItem(reader, alignment) );
}
-// sets the index caching mode on the readers
-void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode mode) {
-
- // iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
- if ( reader == 0 ) continue;
-
- // set reader's index cache mode
- reader->SetIndexCacheMode(mode);
- }
+void BamMultiReaderPrivate::SetErrorString(const string& where, const string& what) const {
+ static const string SEPARATOR = ": ";
+ m_errorString = where + SEPARATOR + what;
}
bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) {
// UpdateAlignments(), and continue.
// iterate over alignments
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
- // attempt to set BamReader's region of interest
- if ( !reader->SetRegion(region) ) {
- cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename() << " to "
- << region.LeftRefID << ":" << region.LeftPosition << ".."
- << region.RightRefID << ":" << region.RightPosition << endl;
- }
+ // set region of interest
+ reader->SetRegion(region);
}
- // update alignment cache & return success
- UpdateAlignmentCache();
- return true;
-}
-
-void BamMultiReaderPrivate::SetSortOrder(const BamMultiReader::SortOrder& order) {
-
- // skip if no change needed
- if ( m_sortOrder == order ) return;
-
- // set new sort order
- m_sortOrder = order;
-
- // create new alignment cache based on sort order
- IBamMultiMerger* newAlignmentCache = CreateMergerForCurrentSortOrder();
- if ( newAlignmentCache == 0 ) return; // print error?
-
- // copy old cache contents to new cache
- while ( m_alignments->Size() > 0 ) {
- ReaderAlignment value = m_alignments->TakeFirst(); // retrieves & 'pops'
- newAlignmentCache->Add(value);
- }
-
- // remove old cache structure & point to new cache
- delete m_alignments;
- m_alignments = newAlignmentCache;
-}
-
-// splits the entire header into a list of strings
-const vector<string> BamMultiReaderPrivate::SplitHeaderText(const string& headerText) const {
-
- stringstream header(headerText);
- string item;
-
- vector<string> lines;
- while ( getline(header, item) )
- lines.push_back(item);
- return lines;
+ // return status of cache update
+ return UpdateAlignmentCache();
}
// updates our alignment cache
-void BamMultiReaderPrivate::UpdateAlignmentCache(void) {
-
- // skip if invalid alignment cache
- if ( m_alignments == 0 ) return;
-
- // clear the cache
- m_alignments->Clear();
+bool BamMultiReaderPrivate::UpdateAlignmentCache(void) {
+
+ // create alignment cache if not created yet
+ if ( m_alignmentCache == 0 ) {
+ m_alignmentCache = CreateAlignmentCache();
+ if ( m_alignmentCache == 0 ) {
+ SetErrorString("BamMultiReader::UpdateAlignmentCache", "unable to create new alignment cache");
+ return false;
+ }
+ }
- // seed cache with fully-populated alignments
- // further updates will fill with full/core-only as requested
- m_isCoreMode = false;
+ // clear any prior cache data
+ m_alignmentCache->Clear();
// iterate over readers
- vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+ vector<MergeItem>::iterator readerIter = m_readers.begin();
+ vector<MergeItem>::iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
- BamReader* reader = (*readerIter).first;
- BamAlignment* alignment = (*readerIter).second;
+ MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
+ BamAlignment* alignment = item.Alignment;
if ( reader == 0 || alignment == 0 ) continue;
// save next alignment from each reader in cache
SaveNextAlignment(reader, alignment);
}
+
+ // if we get here, ok
+ return true;
}
// ValidateReaders checks that all the readers point to BAM files representing
// alignments against the same set of reference sequences, and that the
// sequences are identically ordered. If these checks fail the operation of
// the multireader is undefined, so we force program exit.
-void BamMultiReaderPrivate::ValidateReaders(void) const {
+bool BamMultiReaderPrivate::ValidateReaders(void) const {
+
+ m_errorString.clear();
- // retrieve first reader data
- const BamReader* firstReader = m_readers.front().first;
- if ( firstReader == 0 ) return;
- const RefVector firstReaderRefData = firstReader->GetReferenceData();
+ // skip if 0 or 1 readers opened
+ if ( m_readers.empty() || (m_readers.size() == 1) )
+ return true;
+
+ // retrieve first reader
+ const MergeItem& firstItem = m_readers.front();
+ const BamReader* firstReader = firstItem.Reader;
+ if ( firstReader == 0 ) return false;
+
+ // retrieve first reader's header data
+ const SamHeader& firstReaderHeader = firstReader->GetHeader();
+ const string& firstReaderSortOrder = firstReaderHeader.SortOrder;
+
+ // retrieve first reader's reference data
+ const RefVector& firstReaderRefData = firstReader->GetReferenceData();
const int firstReaderRefCount = firstReader->GetReferenceCount();
const int firstReaderRefSize = firstReaderRefData.size();
// iterate over all readers
- vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
- vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
+ vector<MergeItem>::const_iterator readerIter = m_readers.begin();
+ vector<MergeItem>::const_iterator readerEnd = m_readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
-
- // get current reader data
- BamReader* reader = (*readerIter).first;
+ const MergeItem& item = (*readerIter);
+ BamReader* reader = item.Reader;
if ( reader == 0 ) continue;
+
+ // get current reader's header data
+ const SamHeader& currentReaderHeader = reader->GetHeader();
+ const string& currentReaderSortOrder = currentReaderHeader.SortOrder;
+
+ // check compatible sort order
+ if ( currentReaderSortOrder != firstReaderSortOrder ) {
+ const string message = string("mismatched sort order in ") + reader->GetFilename() +
+ ", expected " + firstReaderSortOrder +
+ ", but found " + currentReaderSortOrder;
+ SetErrorString("BamMultiReader::ValidateReaders", message);
+ return false;
+ }
+
+ // get current reader's reference data
const RefVector currentReaderRefData = reader->GetReferenceData();
const int currentReaderRefCount = reader->GetReferenceCount();
const int currentReaderRefSize = currentReaderRefData.size();
- // init container iterators
+ // init reference data iterators
RefVector::const_iterator firstRefIter = firstReaderRefData.begin();
RefVector::const_iterator firstRefEnd = firstReaderRefData.end();
RefVector::const_iterator currentRefIter = currentReaderRefData.begin();
if ( (currentReaderRefCount != firstReaderRefCount) ||
(firstReaderRefSize != currentReaderRefSize) )
{
- cerr << "BamMultiReader ERROR: mismatched number of references in " << reader->GetFilename()
- << " expected " << firstReaderRefCount
- << " reference sequences but only found " << currentReaderRefCount << endl;
- exit(1);
+ stringstream s("");
+ s << "mismatched reference count in " << reader->GetFilename()
+ << ", expected " << firstReaderRefCount
+ << ", but found " << currentReaderRefCount;
+ SetErrorString("BamMultiReader::ValidateReaders", s.str());
+ return false;
}
// this will be ok; we just checked above that we have identically-sized sets of references
if ( (firstRef.RefName != currentRef.RefName) ||
(firstRef.RefLength != currentRef.RefLength) )
{
- cerr << "BamMultiReader ERROR: mismatched references found in " << reader->GetFilename()
- << " expected: " << endl;
+ stringstream s("");
+ s << "mismatched references found in" << reader->GetFilename()
+ << "expected: " << endl;
// print first reader's reference data
RefVector::const_iterator refIter = firstReaderRefData.begin();
RefVector::const_iterator refEnd = firstReaderRefData.end();
for ( ; refIter != refEnd; ++refIter ) {
const RefData& entry = (*refIter);
- cerr << entry.RefName << " " << entry.RefLength << endl;
+ stringstream s("");
+ s << entry.RefName << " " << endl;
}
- cerr << "but found: " << endl;
+ s << "but found: " << endl;
// print current reader's reference data
refIter = currentReaderRefData.begin();
refEnd = currentReaderRefData.end();
for ( ; refIter != refEnd; ++refIter ) {
const RefData& entry = (*refIter);
- cerr << entry.RefName << " " << entry.RefLength << endl;
+ s << entry.RefName << " " << entry.RefLength << endl;
}
- exit(1);
+ SetErrorString("BamMultiReader::ValidateReaders", s.str());
+ return false;
}
// update iterators
++currentRefIter;
}
}
+
+ // if we get here, everything checks out
+ return true;
}