1 // ***************************************************************************
2 // BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 3 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Functionality for simultaneously reading multiple BAM files
8 // *************************************************************************
10 #include <api/BamAlignment.h>
11 #include <api/BamMultiReader.h>
12 #include <api/SamConstants.h>
13 #include <api/algorithms/Sort.h>
14 #include <api/internal/BamMultiReader_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
26 BamMultiReaderPrivate::BamMultiReaderPrivate(void)
31 BamMultiReaderPrivate::~BamMultiReaderPrivate(void) {
33 // close all open BAM readers (& clean up cache)
37 // close all BAM files
38 void BamMultiReaderPrivate::Close(void) {
39 CloseFiles( Filenames() );
42 // close requested BAM file
43 void BamMultiReaderPrivate::CloseFile(const string& filename) {
44 vector<string> filenames(1, filename);
45 CloseFiles(filenames);
48 // close requested BAM files
49 void BamMultiReaderPrivate::CloseFiles(const vector<string>& filenames) {
51 // iterate over filenames
52 vector<string>::const_iterator filesIter = filenames.begin();
53 vector<string>::const_iterator filesEnd = filenames.end();
54 for ( ; filesIter != filesEnd; ++filesIter ) {
55 const string& filename = (*filesIter);
56 if ( filename.empty() ) continue;
58 // iterate over readers
59 vector<MergeItem>::iterator readerIter = m_readers.begin();
60 vector<MergeItem>::iterator readerEnd = m_readers.end();
61 for ( ; readerIter != readerEnd; ++readerIter ) {
62 MergeItem& item = (*readerIter);
63 BamReader* reader = item.Reader;
64 if ( reader == 0 ) continue;
66 // if reader matches requested filename
67 if ( reader->GetFilename() == filename ) {
69 // remove reader's entry from alignment cache
70 m_alignmentCache->Remove(reader);
72 // clean up reader & its alignment
77 // delete reader's alignment entry
78 BamAlignment* alignment = item.Alignment;
82 // remove reader from reader list
83 m_readers.erase(readerIter);
85 // on match, just go on to next filename
86 // (no need to keep looking and item iterator is invalid now anyway)
92 // make sure alignment cache is cleaned up if all readers closed
93 if ( m_readers.empty() && m_alignmentCache ) {
94 m_alignmentCache->Clear();
95 delete m_alignmentCache;
100 // creates index files for BAM files that don't have them
101 bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) {
105 // iterate over readers
106 vector<MergeItem>::iterator itemIter = m_readers.begin();
107 vector<MergeItem>::iterator itemEnd = m_readers.end();
108 for ( ; itemIter != itemEnd; ++itemIter ) {
109 MergeItem& item = (*itemIter);
110 BamReader* reader = item.Reader;
111 if ( reader == 0 ) continue;
113 // if reader doesn't have an index, create one
114 if ( !reader->HasIndex() )
115 result &= reader->CreateIndex(type);
121 IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) const {
124 SamHeader header = GetHeader();
126 // if BAM files are sorted by position
127 if ( header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE )
128 return new MultiMerger<Algorithms::Sort::ByPosition>();
130 // if BAM files are sorted by read name
131 if ( header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME )
132 return new MultiMerger<Algorithms::Sort::ByName>();
134 // otherwise "unknown" or "unsorted", use unsorted merger and just read in
135 return new MultiMerger<Algorithms::Sort::Unsorted>();
138 const vector<string> BamMultiReaderPrivate::Filenames(void) const {
140 // init filename container
141 vector<string> filenames;
142 filenames.reserve( m_readers.size() );
144 // iterate over readers
145 vector<MergeItem>::const_iterator itemIter = m_readers.begin();
146 vector<MergeItem>::const_iterator itemEnd = m_readers.end();
147 for ( ; itemIter != itemEnd; ++itemIter ) {
148 const MergeItem& item = (*itemIter);
149 const BamReader* reader = item.Reader;
150 if ( reader == 0 ) continue;
152 // store filename if not empty
153 const string& filename = reader->GetFilename();
154 if ( !filename.empty() )
155 filenames.push_back(filename);
162 SamHeader BamMultiReaderPrivate::GetHeader(void) const {
163 const string& text = GetHeaderText();
164 return SamHeader(text);
167 // makes a virtual, unified header for all the bam files in the multireader
168 string BamMultiReaderPrivate::GetHeaderText(void) const {
170 // N.B. - right now, simply copies all header data from first BAM,
171 // and then appends RG's from other BAM files
172 // TODO: make this more intelligent wrt other header lines/fields
174 // if no readers open
175 const size_t numReaders = m_readers.size();
176 if ( numReaders == 0 ) return string();
178 // retrieve first reader's header
179 const MergeItem& firstItem = m_readers.front();
180 const BamReader* reader = firstItem.Reader;
181 if ( reader == 0 ) return string();
182 SamHeader mergedHeader = reader->GetHeader();
184 // iterate over any remaining readers (skipping the first)
185 for ( size_t i = 1; i < numReaders; ++i ) {
186 const MergeItem& item = m_readers.at(i);
187 const BamReader* reader = item.Reader;
188 if ( reader == 0 ) continue;
190 // retrieve current reader's header
191 const SamHeader currentHeader = reader->GetHeader();
193 // append current reader's RG entries to merged header
194 // N.B. - SamReadGroupDictionary handles duplicate-checking
195 mergedHeader.ReadGroups.Add(currentHeader.ReadGroups);
197 // TODO: merge anything else??
200 // return stringified header
201 return mergedHeader.ToString();
204 // get next alignment among all files
205 bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) {
206 return PopNextCachedAlignment(al, true);
209 // get next alignment among all files without parsing character data from alignments
210 bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) {
211 return PopNextCachedAlignment(al, false);
214 // ---------------------------------------------------------------------------------------
216 // NB: The following GetReferenceX() functions assume that we have identical
217 // references for all BAM files. We enforce this by invoking the
218 // ValidateReaders() method to verify that our reference data is the same
219 // across all files on Open - so we will not encounter a situation in which
220 // there is a mismatch and we are still live.
222 // ---------------------------------------------------------------------------------------
224 // returns the number of reference sequences
225 int BamMultiReaderPrivate::GetReferenceCount(void) const {
227 // handle empty multireader
228 if ( m_readers.empty() )
231 // return reference count from first reader
232 const MergeItem& item = m_readers.front();
233 const BamReader* reader = item.Reader;
234 if ( reader ) return reader->GetReferenceCount();
240 // returns vector of reference objects
241 const RefVector BamMultiReaderPrivate::GetReferenceData(void) const {
243 // handle empty multireader
244 if ( m_readers.empty() )
247 // return reference data from first BamReader
248 const MergeItem& item = m_readers.front();
249 const BamReader* reader = item.Reader;
250 if ( reader ) return reader->GetReferenceData();
256 // returns refID from reference name
257 int BamMultiReaderPrivate::GetReferenceID(const string& refName) const {
259 // handle empty multireader
260 if ( m_readers.empty() )
263 // return reference ID from first BamReader
264 const MergeItem& item = m_readers.front();
265 const BamReader* reader = item.Reader;
266 if ( reader ) return reader->GetReferenceID(refName);
271 // ---------------------------------------------------------------------------------------
273 // returns true if all readers have index data available
274 // this is useful to indicate whether Jump() or SetRegion() are possible
275 bool BamMultiReaderPrivate::HasIndexes(void) const {
277 // handle empty multireader
278 if ( m_readers.empty() )
283 // iterate over readers
284 vector<MergeItem>::const_iterator readerIter = m_readers.begin();
285 vector<MergeItem>::const_iterator readerEnd = m_readers.end();
286 for ( ; readerIter != readerEnd; ++readerIter ) {
287 const MergeItem& item = (*readerIter);
288 const BamReader* reader = item.Reader;
289 if ( reader == 0 ) continue;
291 // see if current reader has index data
292 result &= reader->HasIndex();
298 // returns true if multireader has open readers
299 bool BamMultiReaderPrivate::HasOpenReaders(void) {
301 // iterate over readers
302 vector<MergeItem>::const_iterator readerIter = m_readers.begin();
303 vector<MergeItem>::const_iterator readerEnd = m_readers.end();
304 for ( ; readerIter != readerEnd; ++readerIter ) {
305 const MergeItem& item = (*readerIter);
306 const BamReader* reader = item.Reader;
307 if ( reader == 0 ) continue;
309 // return true whenever an open reader is found
310 if ( reader->IsOpen() ) return true;
317 // performs random-access jump using (refID, position) as a left-bound
318 bool BamMultiReaderPrivate::Jump(int refID, int position) {
320 // NB: While it may make sense to track readers in which we can
321 // successfully Jump, in practice a failure of Jump means "no
322 // alignments here." It makes sense to simply accept the failure,
323 // UpdateAlignments(), and continue.
325 // iterate over readers
326 vector<MergeItem>::iterator readerIter = m_readers.begin();
327 vector<MergeItem>::iterator readerEnd = m_readers.end();
328 for ( ; readerIter != readerEnd; ++readerIter ) {
329 MergeItem& item = (*readerIter);
330 BamReader* reader = item.Reader;
331 if ( reader == 0 ) continue;
333 // attempt jump() on each
334 if ( !reader->Jump(refID, position) ) {
335 cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename()
336 << " to " << refID << ":" << position << endl;
340 // returns status of cache update
341 return UpdateAlignmentCache();
344 // locate (& load) index files for BAM readers that don't already have one loaded
345 bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) {
349 // iterate over readers
350 vector<MergeItem>::iterator readerIter = m_readers.begin();
351 vector<MergeItem>::iterator readerEnd = m_readers.end();
352 for ( ; readerIter != readerEnd; ++readerIter ) {
353 MergeItem& item = (*readerIter);
354 BamReader* reader = item.Reader;
355 if ( reader == 0 ) continue;
357 // if reader has no index, try to locate one
358 if ( !reader->HasIndex() )
359 result &= reader->LocateIndex(preferredType);
366 bool BamMultiReaderPrivate::Open(const vector<string>& filenames) {
368 bool openedOk = true;
370 // put all current readers back at beginning
371 openedOk &= Rewind();
373 // put all current readers back at beginning (refreshes alignment cache)
376 // iterate over filenames
377 vector<string>::const_iterator filenameIter = filenames.begin();
378 vector<string>::const_iterator filenameEnd = filenames.end();
379 for ( ; filenameIter != filenameEnd; ++filenameIter ) {
380 const string& filename = (*filenameIter);
381 if ( filename.empty() ) continue;
383 // attempt to open BamReader
384 BamReader* reader = new BamReader;
385 const bool readerOpened = reader->Open(filename);
387 // if opened OK, store it
389 m_readers.push_back( MergeItem(reader, new BamAlignment) );
391 // otherwise clean up invalid reader
394 // update method return status
395 openedOk &= readerOpened;
398 // if more than one reader open, check for consistency
399 if ( m_readers.size() > 1 )
400 openedOk &= ValidateReaders();
402 // update alignment cache
403 openedOk &= UpdateAlignmentCache();
409 bool BamMultiReaderPrivate::OpenFile(const std::string& filename) {
410 vector<string> filenames(1, filename);
411 return Open(filenames);
414 bool BamMultiReaderPrivate::OpenIndexes(const vector<string>& indexFilenames) {
416 // TODO: This needs to be cleaner - should not assume same order.
417 // And either way, shouldn't start at first reader. Should start at
418 // first reader without an index?
420 // make sure same number of index filenames as readers
421 if ( m_readers.size() != indexFilenames.size() )
427 // iterate over BamReaders
428 vector<string>::const_iterator indexFilenameIter = indexFilenames.begin();
429 vector<string>::const_iterator indexFilenameEnd = indexFilenames.end();
430 vector<MergeItem>::iterator readerIter = m_readers.begin();
431 vector<MergeItem>::iterator readerEnd = m_readers.end();
432 for ( ; readerIter != readerEnd; ++readerIter ) {
433 MergeItem& item = (*readerIter);
434 BamReader* reader = item.Reader;
436 // open index filename on reader
438 const string& indexFilename = (*indexFilenameIter);
439 result &= reader->OpenIndex(indexFilename);
442 // increment filename iterator, skip if no more index files to open
443 if ( ++indexFilenameIter == indexFilenameEnd )
447 // TODO: any validation needed here??
449 // return success/fail
454 bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) {
456 // skip if no alignments available
457 if ( m_alignmentCache == 0 || m_alignmentCache->IsEmpty() )
460 // pop next merge item entry from cache
461 MergeItem item = m_alignmentCache->TakeFirst();
462 BamReader* reader = item.Reader;
463 BamAlignment* alignment = item.Alignment;
464 if ( reader == 0 || alignment == 0 )
467 // set char data if requested
468 if ( needCharData ) {
469 alignment->BuildCharData();
470 alignment->Filename = reader->GetFilename();
473 // store cached alignment into destination parameter (by copy)
476 // load next alignment from reader & store in cache
477 SaveNextAlignment(reader, alignment);
483 bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) {
485 // bail out if no more data to process
486 if ( !HasAlignmentData() )
489 // pop next reader/alignment pair
490 ReaderAlignment nextReaderAlignment = m_alignments->TakeFirst();
491 BamReader* reader = nextReaderAlignment.first;
492 BamAlignment* alignment = nextReaderAlignment.second;
494 // store cached alignment into destination parameter (by copy)
497 // set char data if requested
498 if ( needCharData ) {
500 al.Filename = reader->GetFilename();
503 // load next alignment from reader & store in cache
504 SaveNextAlignment(reader, alignment);
510 // returns BAM file pointers to beginning of alignment data & resets alignment cache
511 bool BamMultiReaderPrivate::Rewind(void) {
513 // attempt to rewind files
514 if ( !RewindReaders() ) {
515 cerr << "BamMultiReader ERROR: could not rewind file(s) successfully";
519 // return status of cache update
520 return UpdateAlignmentCache();
523 // returns BAM file pointers to beginning of alignment data
524 bool BamMultiReaderPrivate::RewindReaders(void) {
528 // iterate over readers
529 vector<MergeItem>::iterator readerIter = m_readers.begin();
530 vector<MergeItem>::iterator readerEnd = m_readers.end();
531 for ( ; readerIter != readerEnd; ++readerIter ) {
532 MergeItem& item = (*readerIter);
533 BamReader* reader = item.Reader;
534 if ( reader == 0 ) continue;
536 // attempt rewind on BamReader
537 result &= reader->Rewind();
543 void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) {
545 // if can read alignment from reader, store in cache
546 // N.B. - lazy building of alignment's char data,
547 // only populated on demand by sorting merger or client call to GetNextAlignment()
548 if ( reader->GetNextAlignmentCore(*alignment) )
549 m_alignmentCache->Add(MergeItem(reader, alignment));
552 // sets the index caching mode on the readers
553 void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode mode) {
555 // iterate over readers
556 vector<MergeItem>::iterator readerIter = m_readers.begin();
557 vector<MergeItem>::iterator readerEnd = m_readers.end();
558 for ( ; readerIter != readerEnd; ++readerIter ) {
559 MergeItem& item = (*readerIter);
560 BamReader* reader = item.Reader;
561 if ( reader == 0 ) continue;
563 // set reader's index cache mode
564 reader->SetIndexCacheMode(mode);
568 bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) {
570 // NB: While it may make sense to track readers in which we can
571 // successfully SetRegion, In practice a failure of SetRegion means "no
572 // alignments here." It makes sense to simply accept the failure,
573 // UpdateAlignments(), and continue.
575 // iterate over alignments
576 vector<MergeItem>::iterator readerIter = m_readers.begin();
577 vector<MergeItem>::iterator readerEnd = m_readers.end();
578 for ( ; readerIter != readerEnd; ++readerIter ) {
579 MergeItem& item = (*readerIter);
580 BamReader* reader = item.Reader;
581 if ( reader == 0 ) continue;
583 // attempt to set BamReader's region of interest
584 if ( !reader->SetRegion(region) ) {
585 cerr << "BamMultiReader WARNING: could not jump " << reader->GetFilename() << " to "
586 << region.LeftRefID << ":" << region.LeftPosition << ".."
587 << region.RightRefID << ":" << region.RightPosition << endl;
591 // return status of cache update
592 return UpdateAlignmentCache();
595 // updates our alignment cache
596 bool BamMultiReaderPrivate::UpdateAlignmentCache(void) {
598 // create alignment cache if not created yet
599 if ( m_alignmentCache == 0 ) {
600 m_alignmentCache = CreateAlignmentCache();
601 if ( m_alignmentCache == 0 ) {
607 // clear any prior cache data
608 m_alignmentCache->Clear();
610 // iterate over readers
611 vector<MergeItem>::iterator readerIter = m_readers.begin();
612 vector<MergeItem>::iterator readerEnd = m_readers.end();
613 for ( ; readerIter != readerEnd; ++readerIter ) {
614 MergeItem& item = (*readerIter);
615 BamReader* reader = item.Reader;
616 BamAlignment* alignment = item.Alignment;
617 if ( reader == 0 || alignment == 0 ) continue;
619 // save next alignment from each reader in cache
620 SaveNextAlignment(reader, alignment);
623 // if we get here, ok
627 // ValidateReaders checks that all the readers point to BAM files representing
628 // alignments against the same set of reference sequences, and that the
629 // sequences are identically ordered. If these checks fail the operation of
630 // the multireader is undefined, so we force program exit.
631 bool BamMultiReaderPrivate::ValidateReaders(void) const {
633 // skip if no readers opened
634 if ( m_readers.empty() )
637 // retrieve first reader
638 const MergeItem& firstItem = m_readers.front();
639 const BamReader* firstReader = firstItem.Reader;
640 if ( firstReader == 0 ) return false;
642 // retrieve first reader's header data
643 const SamHeader& firstReaderHeader = firstReader->GetHeader();
644 const string& firstReaderSortOrder = firstReaderHeader.SortOrder;
646 // retrieve first reader's reference data
647 const RefVector& firstReaderRefData = firstReader->GetReferenceData();
648 const int firstReaderRefCount = firstReader->GetReferenceCount();
649 const int firstReaderRefSize = firstReaderRefData.size();
651 // iterate over all readers
652 vector<MergeItem>::const_iterator readerIter = m_readers.begin();
653 vector<MergeItem>::const_iterator readerEnd = m_readers.end();
654 for ( ; readerIter != readerEnd; ++readerIter ) {
655 const MergeItem& item = (*readerIter);
656 BamReader* reader = item.Reader;
657 if ( reader == 0 ) continue;
659 // get current reader's header data
660 const SamHeader& currentReaderHeader = reader->GetHeader();
661 const string& currentReaderSortOrder = currentReaderHeader.SortOrder;
663 // check compatible sort order
664 if ( currentReaderSortOrder != firstReaderSortOrder ) {
666 cerr << "BamMultiReader ERROR: mismatched sort order in " << reader->GetFilename()
667 << ", expected " << firstReaderSortOrder
668 << ", but found " << currentReaderSortOrder << endl;
672 // get current reader's reference data
673 const RefVector currentReaderRefData = reader->GetReferenceData();
674 const int currentReaderRefCount = reader->GetReferenceCount();
675 const int currentReaderRefSize = currentReaderRefData.size();
677 // init reference data iterators
678 RefVector::const_iterator firstRefIter = firstReaderRefData.begin();
679 RefVector::const_iterator firstRefEnd = firstReaderRefData.end();
680 RefVector::const_iterator currentRefIter = currentReaderRefData.begin();
682 // compare reference counts from BamReader ( & container size, in case of BR error)
683 if ( (currentReaderRefCount != firstReaderRefCount) ||
684 (firstReaderRefSize != currentReaderRefSize) )
686 cerr << "BamMultiReader ERROR: mismatched number of references in " << reader->GetFilename()
687 << " expected " << firstReaderRefCount
688 << " reference sequences but only found " << currentReaderRefCount << endl;
692 // this will be ok; we just checked above that we have identically-sized sets of references
693 // here we simply check if they are all, in fact, equal in content
694 while ( firstRefIter != firstRefEnd ) {
695 const RefData& firstRef = (*firstRefIter);
696 const RefData& currentRef = (*currentRefIter);
698 // compare reference name & length
699 if ( (firstRef.RefName != currentRef.RefName) ||
700 (firstRef.RefLength != currentRef.RefLength) )
702 cerr << "BamMultiReader ERROR: mismatched references found in " << reader->GetFilename()
703 << " expected: " << endl;
705 // print first reader's reference data
706 RefVector::const_iterator refIter = firstReaderRefData.begin();
707 RefVector::const_iterator refEnd = firstReaderRefData.end();
708 for ( ; refIter != refEnd; ++refIter ) {
709 const RefData& entry = (*refIter);
710 cerr << entry.RefName << " " << entry.RefLength << endl;
713 cerr << "but found: " << endl;
715 // print current reader's reference data
716 refIter = currentReaderRefData.begin();
717 refEnd = currentReaderRefData.end();
718 for ( ; refIter != refEnd; ++refIter ) {
719 const RefData& entry = (*refIter);
720 cerr << entry.RefName << " " << entry.RefLength << endl;
732 // if we get here, everything checks out