1 // ***************************************************************************
2 // BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 17 January 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Functionality for simultaneously reading multiple BAM files
9 // *************************************************************************
11 #include <api/BamAlignment.h>
12 #include <api/BamMultiReader.h>
13 #include <api/internal/BamMultiMerger_p.h>
14 #include <api/internal/BamMultiReader_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
26 BamMultiReaderPrivate::BamMultiReaderPrivate(void)
29 , m_sortOrder(BamMultiReader::SortedByPosition)
33 BamMultiReaderPrivate::~BamMultiReaderPrivate(void) {
35 // close all open BAM readers
38 // clean up alignment cache
43 // close the BAM files
44 void BamMultiReaderPrivate::Close(void) {
46 // clear out alignment cache
47 m_alignments->Clear();
49 // iterate over readers
50 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
51 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
52 for ( ; readerIter != readerEnd; ++readerIter ) {
55 BamReader* reader = (*readerIter).first;
56 BamAlignment* alignment = (*readerIter).second;
57 if ( reader ) reader->Close();
69 // reset default flags
71 m_sortOrder = BamMultiReader::SortedByPosition;
74 // saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail
75 bool BamMultiReaderPrivate::CreateIndexes(bool useStandardIndex) {
78 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
79 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
80 for ( ; readerIter != readerEnd; ++readerIter ) {
81 BamReader* reader = (*readerIter).first;
82 result &= reader->CreateIndex(useStandardIndex);
87 IBamMultiMerger* BamMultiReaderPrivate::CreateMergerForCurrentSortOrder(void) const {
88 switch ( m_sortOrder ) {
89 case ( BamMultiReader::SortedByPosition ) : return new PositionMultiMerger;
90 case ( BamMultiReader::SortedByReadName ) : return new ReadNameMultiMerger;
91 case ( BamMultiReader::Unsorted ) : return new UnsortedMultiMerger;
92 default : //print error
97 const string BamMultiReaderPrivate::ExtractReadGroup(const string& headerLine) const {
100 stringstream headerLineSs(headerLine);
103 // parse @RG header line, looking for the ID: tag
104 while( getline(headerLineSs, part, '\t') ) {
105 stringstream partSs(part);
107 getline(partSs, subtag, ':');
108 if ( subtag == "ID" ) {
109 getline(partSs, readGroup, ':');
116 // makes a virtual, unified header for all the bam files in the multireader
117 const string BamMultiReaderPrivate::GetHeaderText(void) const {
119 // just spit single header out if only have one reader open
120 if ( m_readers.size() == 1 ) {
123 vector<ReaderAlignment>::const_iterator readerBegin = m_readers.begin();
124 const ReaderAlignment& entry = (*readerBegin);
125 const BamReader* reader = entry.first;
126 if ( reader == 0 ) return "";
127 return reader->GetHeaderText();
130 string mergedHeader = "";
131 map<string, bool> readGroups;
133 // foreach extraction entry (each BAM file)
134 vector<ReaderAlignment>::const_iterator readerBegin = m_readers.begin();
135 vector<ReaderAlignment>::const_iterator readerIter = readerBegin;
136 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
137 for ( ; readerIter != readerEnd; ++readerIter ) {
139 // get header from reader
140 const BamReader* reader = (*readerIter).first;
141 if ( reader == 0 ) continue;
142 string headerText = reader->GetHeaderText();
143 if ( headerText.empty() ) continue;
145 // store header text in lines
146 map<string, bool> currentFileReadGroups;
147 const vector<string> lines = SplitHeaderText(headerText);
149 // iterate over header lines
150 vector<string>::const_iterator linesIter = lines.begin();
151 vector<string>::const_iterator linesEnd = lines.end();
152 for ( ; linesIter != linesEnd; ++linesIter ) {
154 // get next line from header, skip if empty
155 const string headerLine = (*linesIter);
156 if ( headerLine.empty() ) { continue; }
158 // if first file, save HD & SQ entries
159 if ( readerIter == readerBegin ) {
160 if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
161 mergedHeader.append(headerLine.c_str());
162 mergedHeader.append(1, '\n');
166 // (for all files) append RG entries if they are unique
167 if ( headerLine.find("@RG") == 0 ) {
169 // extract read group name from line
170 const string readGroup = ExtractReadGroup(headerLine);
172 // make sure not to duplicate @RG entries
173 if ( readGroups.find(readGroup) == readGroups.end() ) {
174 mergedHeader.append(headerLine.c_str() );
175 mergedHeader.append(1, '\n');
176 readGroups[readGroup] = true;
177 currentFileReadGroups[readGroup] = true;
179 // warn iff we are reading one file and discover duplicated @RG tags in the header
180 // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
181 if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) {
182 cerr << "WARNING: duplicate @RG tag " << readGroup
183 << " entry in header of " << reader->GetFilename() << endl;
190 // return merged header text
194 // get next alignment among all files
195 bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) {
196 return LoadNextAlignment(al, false);
199 // get next alignment among all files without parsing character data from alignments
200 bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) {
201 return LoadNextAlignment(al, true);
204 // ---------------------------------------------------------------------------------------
206 // NB: The following GetReferenceX() functions assume that we have identical
207 // references for all BAM files. We enforce this by invoking the above
208 // validation function (ValidateReaders) to verify that our reference data
209 // is the same across all files on Open, so we will not encounter a situation
210 // in which there is a mismatch and we are still live.
212 // ---------------------------------------------------------------------------------------
214 // returns the number of reference sequences
215 const int BamMultiReaderPrivate::GetReferenceCount(void) const {
216 const ReaderAlignment& firstReader = m_readers.front();
217 const BamReader* reader = firstReader.first;
218 if ( reader ) return reader->GetReferenceCount();
222 // returns vector of reference objects
223 const RefVector BamMultiReaderPrivate::GetReferenceData(void) const {
224 const ReaderAlignment& firstReader = m_readers.front();
225 const BamReader* reader = firstReader.first;
226 if ( reader ) return reader->GetReferenceData();
227 else return RefVector();
230 // returns refID from reference name
231 const int BamMultiReaderPrivate::GetReferenceID(const string& refName) const {
232 const ReaderAlignment& firstReader = m_readers.front();
233 const BamReader* reader = firstReader.first;
234 if ( reader ) return reader->GetReferenceID(refName);
235 else return -1; // ERROR case - how to report
238 // ---------------------------------------------------------------------------------------
240 // checks if any readers still have alignments
241 bool BamMultiReaderPrivate::HasOpenReaders(void) {
242 return ( m_alignments->Size() > 0 );
245 // returns whether underlying BAM readers ALL have an index loaded
246 // this is useful to indicate whether Jump() or SetRegion() are possible
247 bool BamMultiReaderPrivate::IsIndexLoaded(void) const {
249 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
250 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
251 for ( ; readerIter != readerEnd; ++readerIter ) {
252 const BamReader* reader = (*readerIter).first;
253 if ( reader ) ok &= reader->IsIndexLoaded();
258 // jumps to specified region(refID, leftBound) in BAM files, returns success/fail
259 bool BamMultiReaderPrivate::Jump(int refID, int position) {
262 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
263 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
264 for ( ; readerIter != readerEnd; ++readerIter ) {
265 BamReader* reader = (*readerIter).first;
266 if ( reader == 0 ) continue;
268 ok &= reader->Jump(refID, position);
270 cerr << "ERROR: could not jump " << reader->GetFilename()
271 << " to " << refID << ":" << position << endl;
276 if (ok) UpdateAlignments();
280 bool BamMultiReaderPrivate::LoadNextAlignment(BamAlignment& al, bool coreMode) {
282 // bail out if no more data to process
283 if ( !HasOpenReaders() ) return false;
285 // "pop" next alignment and reader
286 ReaderAlignment nextReaderAlignment = m_alignments->TakeFirst();
287 BamReader* reader = nextReaderAlignment.first;
288 BamAlignment* alignment = nextReaderAlignment.second;
290 // save it by copy to our argument
291 al = BamAlignment(*alignment);
293 // peek to next alignment & store in cache
294 m_isCoreMode = coreMode;
295 SaveNextAlignment(reader,alignment);
302 bool BamMultiReaderPrivate::Open(const vector<string>& filenames,
305 bool preferStandardIndex)
307 // store core mode flag
308 m_isCoreMode = coreMode;
310 // first clear out any prior alignment cache prior data
311 if ( m_alignments ) {
312 m_alignments->Clear();
317 // create alignment cache based on sorting mode
318 m_alignments = CreateMergerForCurrentSortOrder();
319 if ( m_alignments == 0 ) return false;
321 // iterate over filenames
322 vector<string>::const_iterator filenameIter = filenames.begin();
323 vector<string>::const_iterator filenameEnd = filenames.end();
324 for ( ; filenameIter != filenameEnd; ++filenameIter ) {
325 const string filename = (*filenameIter);
327 bool openedOk = true;
328 BamReader* reader = new BamReader;
329 openedOk = reader->Open(filename, "", openIndexes, preferStandardIndex);
334 // try to read first alignment
336 BamAlignment* alignment = new BamAlignment;
337 fileOk &= ( coreMode ? reader->GetNextAlignmentCore(*alignment)
338 : reader->GetNextAlignment(*alignment) );
342 m_readers.push_back( make_pair(reader, alignment) );
343 m_alignments->Add( make_pair(reader, alignment) );
346 cerr << "WARNING: could not read first alignment in "
347 << filename << ", ignoring file" << endl;
349 // if only file available & could not be read, return failure
350 if ( filenames.size() == 1 )
355 // TODO; any further error handling when openedOK is false ??
359 // files opened ok, at least one alignment could be read,
360 // now need to check that all files use same reference data
365 // print associated filenames to stdout
366 void BamMultiReaderPrivate::PrintFilenames(void) const {
368 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
369 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
370 for ( ; readerIter != readerEnd; ++readerIter ) {
371 const BamReader* reader = (*readerIter).first;
372 if ( reader == 0 ) continue;
373 cout << reader->GetFilename() << endl;
377 // returns BAM file pointers to beginning of alignment data
378 bool BamMultiReaderPrivate::Rewind(void) {
381 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
382 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
383 for ( ; readerIter != readerEnd; ++readerIter ) {
384 BamReader* reader = (*readerIter).first;
385 if ( reader == 0 ) continue;
386 result &= reader->Rewind();
391 void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) {
393 // must be in core mode && NOT sorting by read name to call GNACore()
394 if ( m_isCoreMode && m_sortOrder != BamMultiReader::SortedByReadName ) {
395 if ( reader->GetNextAlignmentCore(*alignment) )
396 m_alignments->Add( make_pair(reader, alignment) );
399 // not in core mode and/or sorting by readname, must call GNA()
401 if ( reader->GetNextAlignment(*alignment) )
402 m_alignments->Add( make_pair(reader, alignment) );
406 // sets the index caching mode on the readers
407 void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) {
409 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
410 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
411 for ( ; readerIter != readerEnd; ++readerIter ) {
412 BamReader* reader = (*readerIter).first;
413 if ( reader == 0 ) continue;
414 reader->SetIndexCacheMode(mode);
418 bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) {
420 // NB: While it may make sense to track readers in which we can
421 // successfully SetRegion, In practice a failure of SetRegion means "no
422 // alignments here." It makes sense to simply accept the failure,
423 // UpdateAlignments(), and continue.
425 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
426 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
427 for ( ; readerIter != readerEnd; ++readerIter ) {
428 BamReader* reader = (*readerIter).first;
429 if ( reader == 0 ) continue;
430 if ( !reader->SetRegion(region) ) {
431 cerr << "ERROR: could not jump " << reader->GetFilename() << " to "
432 << region.LeftRefID << ":" << region.LeftPosition << ".."
433 << region.RightRefID << ":" << region.RightPosition << endl;
441 void BamMultiReaderPrivate::SetSortOrder(const BamMultiReader::SortOrder& order) {
443 // skip if no change needed
444 if ( m_sortOrder == order ) return;
446 // set new sort order
449 // create new alignment cache based on sort order
450 IBamMultiMerger* newAlignmentCache = CreateMergerForCurrentSortOrder();
451 if ( newAlignmentCache == 0 ) return; // print error?
453 // copy old cache contents to new cache
454 while ( m_alignments->Size() > 0 ) {
455 ReaderAlignment value = m_alignments->TakeFirst();
456 newAlignmentCache->Add(value);
459 // remove old cache structure & point to new cache
461 m_alignments = newAlignmentCache;
464 // updates our alignment cache
465 void BamMultiReaderPrivate::UpdateAlignments(void) {
468 m_alignments->Clear();
470 // iterate over readers
471 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
472 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
473 for ( ; readerIter != readerEnd; ++readerIter ) {
474 BamReader* reader = (*readerIter).first;
475 BamAlignment* alignment = (*readerIter).second;
476 if ( reader == 0 ) continue;
477 SaveNextAlignment(reader, alignment);
481 // splits the entire header into a list of strings
482 const vector<string> BamMultiReaderPrivate::SplitHeaderText(const string& headerText) const {
483 stringstream header(headerText);
484 vector<string> lines;
486 while ( getline(header, item) )
487 lines.push_back(item);
491 // ValidateReaders checks that all the readers point to BAM files representing
492 // alignments against the same set of reference sequences, and that the
493 // sequences are identically ordered. If these checks fail the operation of
494 // the multireader is undefined, so we force program exit.
495 void BamMultiReaderPrivate::ValidateReaders(void) const {
497 // retrieve first reader data
498 const BamReader* firstReader = m_readers.front().first;
499 if ( firstReader == 0 ) return; // signal error?
500 const RefVector firstReaderRefData = firstReader->GetReferenceData();
501 const int firstReaderRefCount = firstReader->GetReferenceCount();
502 const int firstReaderRefSize = firstReaderRefData.size();
504 // iterate over all readers
505 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
506 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
507 for ( ; readerIter != readerEnd; ++readerIter ) {
509 // get current reader data
510 BamReader* reader = (*readerIter).first;
511 if ( reader == 0 ) continue; // error?
512 const RefVector currentReaderRefData = reader->GetReferenceData();
513 const int currentReaderRefCount = reader->GetReferenceCount();
514 const int currentReaderRefSize = currentReaderRefData.size();
516 // init container iterators
517 RefVector::const_iterator firstRefIter = firstReaderRefData.begin();
518 RefVector::const_iterator firstRefEnd = firstReaderRefData.end();
519 RefVector::const_iterator currentRefIter = currentReaderRefData.begin();
521 // compare reference counts from BamReader ( & container size, in case of BR error)
522 if ( (currentReaderRefCount != firstReaderRefCount) ||
523 (firstReaderRefSize != currentReaderRefSize) )
525 cerr << "ERROR: mismatched number of references in " << reader->GetFilename()
526 << " expected " << firstReaderRefCount
527 << " reference sequences but only found " << currentReaderRefCount << endl;
531 // this will be ok; we just checked above that we have identically-sized sets of references
532 // here we simply check if they are all, in fact, equal in content
533 while ( firstRefIter != firstRefEnd ) {
535 const RefData& firstRef = (*firstRefIter);
536 const RefData& currentRef = (*currentRefIter);
538 // compare reference name & length
539 if ( (firstRef.RefName != currentRef.RefName) ||
540 (firstRef.RefLength != currentRef.RefLength) )
542 cerr << "ERROR: mismatched references found in " << reader->GetFilename()
543 << " expected: " << endl;
544 RefVector::const_iterator refIter = firstReaderRefData.begin();
545 RefVector::const_iterator refEnd = firstReaderRefData.end();
546 for ( ; refIter != refEnd; ++refIter ) {
547 const RefData& entry = (*refIter);
548 cerr << entry.RefName << " " << entry.RefLength << endl;
551 cerr << "but found: " << endl;
552 refIter = currentReaderRefData.begin();
553 refEnd = currentReaderRefData.end();
554 for ( ; refIter != refEnd; ++refIter ) {
555 const RefData& entry = (*refIter);
556 cerr << entry.RefName << " " << entry.RefLength << endl;