1 // ***************************************************************************
2 // BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 9 September 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Functionality for simultaneously reading multiple BAM files
9 // *************************************************************************
11 #include <api/BamAlignment.h>
12 #include <api/BamMultiReader.h>
13 #include <api/internal/BamMultiMerger_p.h>
14 #include <api/internal/BamMultiReader_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
26 BamMultiReaderPrivate::BamMultiReaderPrivate(void)
28 , m_sortOrder(BamMultiReader::SortedByPosition)
32 BamMultiReaderPrivate::~BamMultiReaderPrivate(void) {
34 // close all open BAM readers
37 // clean up alignment cache
42 // close all BAM files
43 void BamMultiReaderPrivate::Close(void) {
44 CloseFiles( Filenames() );
47 // close requested BAM file
48 void BamMultiReaderPrivate::CloseFile(const string& filename) {
49 vector<string> filenames(1, filename);
50 CloseFiles(filenames);
53 // close requested BAM files
54 void BamMultiReaderPrivate::CloseFiles(const vector<string>& filenames) {
56 // iterate over filenames
57 vector<string>::const_iterator filesIter = filenames.begin();
58 vector<string>::const_iterator filesEnd = filenames.end();
59 for ( ; filesIter != filesEnd; ++filesIter ) {
60 const string& filename = (*filesIter);
61 if ( filename.empty() ) continue;
63 // iterate over readers
64 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
65 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
66 for ( ; readerIter != readerEnd; ++readerIter ) {
67 BamReader* reader = (*readerIter).first;
68 if ( reader == 0 ) continue;
70 // if reader matches requested filename
71 if ( reader->GetFilename() == filename ) {
73 // remove reader/alignment from alignment cache
74 m_alignments->Remove(reader);
76 // close & delete reader
81 // delete reader's alignment entry
82 BamAlignment* alignment = (*readerIter).second;
86 // remove reader from container
87 m_readers.erase(readerIter);
89 // on match, just go on to next filename
90 // (no need to keep looking and iterator is invalid now anyway)
96 // make sure alignment cache is cleared if all readers are now closed
97 if ( m_readers.empty() && m_alignments != 0 )
98 m_alignments->Clear();
101 // creates index files for BAM files that don't have them
102 bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) {
106 // iterate over readers
107 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
108 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
109 for ( ; readerIter != readerEnd; ++readerIter ) {
110 BamReader* reader = (*readerIter).first;
111 if ( reader == 0 ) continue;
113 // if reader doesn't have an index, create one
114 if ( !reader->HasIndex() )
115 result &= reader->CreateIndex(type);
121 IBamMultiMerger* BamMultiReaderPrivate::CreateMergerForCurrentSortOrder(void) const {
122 switch ( m_sortOrder ) {
123 case ( BamMultiReader::SortedByPosition ) : return new PositionMultiMerger;
124 case ( BamMultiReader::SortedByReadName ) : return new ReadNameMultiMerger;
125 case ( BamMultiReader::Unsorted ) : return new UnsortedMultiMerger;
127 cerr << "BamMultiReader ERROR: requested sort order is unknown" << endl;
132 const string BamMultiReaderPrivate::ExtractReadGroup(const string& headerLine) const {
134 string readGroup("");
135 stringstream headerLineSs(headerLine);
138 // parse @RG header line, looking for the ID: tag
139 while( getline(headerLineSs, part, '\t') ) {
140 stringstream partSs(part);
142 getline(partSs, subtag, ':');
143 if ( subtag == "ID" ) {
144 getline(partSs, readGroup, ':');
151 const vector<string> BamMultiReaderPrivate::Filenames(void) const {
153 // init filename container
154 vector<string> filenames;
155 filenames.reserve( m_readers.size() );
157 // iterate over readers
158 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
159 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
160 for ( ; readerIter != readerEnd; ++readerIter ) {
161 const BamReader* reader = (*readerIter).first;
162 if ( reader == 0 ) continue;
164 // store filename if not empty
165 const string filename = reader->GetFilename();
166 if ( !filename.empty() )
167 filenames.push_back( reader->GetFilename() );
174 SamHeader BamMultiReaderPrivate::GetHeader(void) const {
175 string text = GetHeaderText();
176 return SamHeader(text);
179 // makes a virtual, unified header for all the bam files in the multireader
180 string BamMultiReaderPrivate::GetHeaderText(void) const {
182 // TODO: merge SamHeader objects instead of parsing string data (again)
184 // if only one reader is open
185 if ( m_readers.size() == 1 ) {
187 // just return reader's header text
188 const ReaderAlignment& ra = m_readers.front();
189 const BamReader* reader = ra.first;
190 if ( reader ) return reader->GetHeaderText();
196 string mergedHeader("");
197 map<string, bool> readGroups;
199 // foreach extraction entry (each BAM file)
200 vector<ReaderAlignment>::const_iterator readerBegin = m_readers.begin();
201 vector<ReaderAlignment>::const_iterator readerIter = readerBegin;
202 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
203 for ( ; readerIter != readerEnd; ++readerIter ) {
204 const BamReader* reader = (*readerIter).first;
205 if ( reader == 0 ) continue;
207 // get header from reader
208 string headerText = reader->GetHeaderText();
209 if ( headerText.empty() ) continue;
211 // store header text in lines
212 map<string, bool> currentFileReadGroups;
213 const vector<string> lines = SplitHeaderText(headerText);
215 // iterate over header lines
216 vector<string>::const_iterator linesIter = lines.begin();
217 vector<string>::const_iterator linesEnd = lines.end();
218 for ( ; linesIter != linesEnd; ++linesIter ) {
220 // get next line from header, skip if empty
221 const string headerLine = (*linesIter);
222 if ( headerLine.empty() ) continue;
224 // if first file, save HD & SQ entries
225 // TODO: what if first file has empty header, should just check for empty 'mergedHeader' instead ?
226 if ( readerIter == readerBegin ) {
227 if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
228 mergedHeader.append(headerLine.c_str());
229 mergedHeader.append(1, '\n');
233 // (for all files) append RG entries if they are unique
234 if ( headerLine.find("@RG") == 0 ) {
236 // extract read group name from line
237 const string readGroup = ExtractReadGroup(headerLine);
239 // make sure not to duplicate @RG entries
240 if ( readGroups.find(readGroup) == readGroups.end() ) {
241 mergedHeader.append(headerLine.c_str() );
242 mergedHeader.append(1, '\n');
243 readGroups[readGroup] = true;
244 currentFileReadGroups[readGroup] = true;
246 // warn iff we are reading one file and discover duplicated @RG tags in the header
247 // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
248 if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) {
249 cerr << "BamMultiReader WARNING: duplicate @RG tag " << readGroup
250 << " entry in header of " << reader->GetFilename() << endl;
257 // return merged header text
261 // get next alignment among all files
262 bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) {
263 return PopNextCachedAlignment(al, true);
266 // get next alignment among all files without parsing character data from alignments
267 bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) {
268 return PopNextCachedAlignment(al, false);
271 // ---------------------------------------------------------------------------------------
273 // NB: The following GetReferenceX() functions assume that we have identical
274 // references for all BAM files. We enforce this by invoking the
275 // ValidateReaders() method to verify that our reference data is the same
276 // across all files on Open - so we will not encounter a situation in which
277 // there is a mismatch and we are still live.
279 // ---------------------------------------------------------------------------------------
281 // returns the number of reference sequences
282 int BamMultiReaderPrivate::GetReferenceCount(void) const {
284 // handle empty multireader
285 if ( m_readers.empty() )
288 // return reference count from first reader
289 const ReaderAlignment& ra = m_readers.front();
290 const BamReader* reader = ra.first;
291 if ( reader ) return reader->GetReferenceCount();
297 // returns vector of reference objects
298 const RefVector BamMultiReaderPrivate::GetReferenceData(void) const {
300 // handle empty multireader
301 if ( m_readers.empty() )
304 // return reference data from first BamReader
305 const ReaderAlignment& ra = m_readers.front();
306 const BamReader* reader = ra.first;
307 if ( reader ) return reader->GetReferenceData();
313 // returns refID from reference name
314 int BamMultiReaderPrivate::GetReferenceID(const string& refName) const {
316 // handle empty multireader
317 if ( m_readers.empty() )
320 // return reference ID from first BamReader
321 const ReaderAlignment& ra = m_readers.front();
322 const BamReader* reader = ra.first;
323 if ( reader ) return reader->GetReferenceID(refName);
328 // ---------------------------------------------------------------------------------------
330 // checks if any readers still have alignments
331 bool BamMultiReaderPrivate::HasAlignmentData(void) const {
332 if ( m_alignments == 0 )
334 return !m_alignments->IsEmpty();
337 // returns true if all readers have index data available
338 // this is useful to indicate whether Jump() or SetRegion() are possible
339 bool BamMultiReaderPrivate::HasIndexes(void) const {
341 // handle empty multireader
342 if ( m_readers.empty() )
347 // iterate over readers
348 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
349 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
350 for ( ; readerIter != readerEnd; ++readerIter ) {
351 const BamReader* reader = (*readerIter).first;
352 if ( reader == 0 ) continue;
354 // see if current reader has index data
355 result &= reader->HasIndex();
361 // returns true if multireader has open readers
362 bool BamMultiReaderPrivate::HasOpenReaders(void) {
364 // iterate over readers
365 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
366 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
367 for ( ; readerIter != readerEnd; ++readerIter ) {
368 const BamReader* reader = (*readerIter).first;
369 if ( reader == 0 ) continue;
371 // return true whenever an open reader is found
372 if ( reader->IsOpen() ) return true;
379 // performs random-access jump using (refID, position) as a left-bound
380 bool BamMultiReaderPrivate::Jump(int refID, int position) {
382 // NB: While it may make sense to track readers in which we can
383 // successfully Jump, in practice a failure of Jump means "no
384 // alignments here." It makes sense to simply accept the failure,
385 // UpdateAlignments(), and continue.
387 // iterate over readers
388 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
389 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
390 for ( ; readerIter != readerEnd; ++readerIter ) {
391 BamReader* reader = (*readerIter).first;
392 if ( reader == 0 ) continue;
394 // attempt jump() on each
395 if ( !reader->Jump(refID, position) ) {
396 cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename()
397 << " to " << refID << ":" << position << endl;
401 // update alignment cache & return success
402 UpdateAlignmentCache();
406 bool BamMultiReaderPrivate::LoadNextAlignment(BamReader* reader, BamAlignment* alignment) {
407 // lazy building of alignment's char data,
408 // only populated on demand by sorting merger or client call to GetNextAlignment()
409 return reader->GetNextAlignmentCore(*alignment);
412 // locate (& load) index files for BAM readers that don't already have one loaded
413 bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) {
417 // iterate over readers
418 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
419 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
420 for ( ; readerIter != readerEnd; ++readerIter ) {
421 BamReader* reader = (*readerIter).first;
422 if ( reader == 0 ) continue;
424 // if reader has no index, try to locate one
425 if ( !reader->HasIndex() )
426 result &= reader->LocateIndex(preferredType);
433 bool BamMultiReaderPrivate::Open(const vector<string>& filenames) {
435 // create alignment cache if neccessary
436 if ( m_alignments == 0 ) {
437 m_alignments = CreateMergerForCurrentSortOrder();
438 if ( m_alignments == 0 ) return false;
441 // put all current readers back at beginning (refreshes alignment cache)
444 // iterate over filenames
445 vector<string>::const_iterator filenameIter = filenames.begin();
446 vector<string>::const_iterator filenameEnd = filenames.end();
447 for ( ; filenameIter != filenameEnd; ++filenameIter ) {
448 const string& filename = (*filenameIter);
449 if ( filename.empty() ) continue;
451 // attempt to open BamReader on filename
452 bool openedOk = false;
453 ReaderAlignment ra = OpenReader(filename, &openedOk);
455 m_readers.push_back(ra); // store reader/alignment in local list
456 m_alignments->Add(ra); // add reader/alignment to sorting cache
460 // if more than one reader open, check for reference consistency
461 if ( m_readers.size() > 1 )
468 bool BamMultiReaderPrivate::OpenFile(const std::string& filename) {
469 vector<string> filenames(1, filename);
470 return Open(filenames);
473 bool BamMultiReaderPrivate::OpenIndexes(const vector<string>& indexFilenames) {
475 // TODO: This needs to be cleaner - should not assume same order.
476 // And either way, shouldn't start at first reader. Should start at
477 // first reader without an index?
479 // make sure same number of index filenames as readers
480 if ( m_readers.size() != indexFilenames.size() || !indexFilenames.empty() )
486 // iterate over BamReaders
487 vector<string>::const_iterator indexFilenameIter = indexFilenames.begin();
488 vector<string>::const_iterator indexFilenameEnd = indexFilenames.end();
489 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
490 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
491 for ( ; readerIter != readerEnd; ++readerIter ) {
492 BamReader* reader = (*readerIter).first;
494 // open index filename on reader
496 const string& indexFilename = (*indexFilenameIter);
497 result &= reader->OpenIndex(indexFilename);
500 // increment filename iterator, skip if no more index files to open
501 if ( ++indexFilenameIter == indexFilenameEnd )
505 // TODO: validation ??
507 // return success/fail
511 ReaderAlignment BamMultiReaderPrivate::OpenReader(const string& filename, bool* ok) {
516 // create new BamReader & BamAlignment
517 BamReader* reader = new BamReader;
518 BamAlignment* alignment = new BamAlignment;
520 // if reader opens OK
521 if ( reader->Open(filename) ) {
523 // if first alignment reads OK
524 if ( LoadNextAlignment(reader, alignment) ) {
526 return make_pair(reader, alignment);
529 // could not read alignment
531 cerr << "BamMultiReader WARNING: Could not read first alignment from "
532 << filename << ", ignoring file" << endl;
536 // reader could not open
538 cerr << "BamMultiReader WARNING: Could not open "
539 << filename << ", ignoring file" << endl;
542 // if we get here, there was a problem with this BAM file (opening or reading)
543 // clean up memory allocation & return null pointer
546 return ReaderAlignment();
549 // print associated filenames to stdout
550 void BamMultiReaderPrivate::PrintFilenames(void) const {
551 const vector<string>& filenames = Filenames();
552 vector<string>::const_iterator filenameIter = filenames.begin();
553 vector<string>::const_iterator filenameEnd = filenames.end();
554 for ( ; filenameIter != filenameEnd; ++filenameIter )
555 cout << (*filenameIter) << endl;
558 bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) {
560 // bail out if no more data to process
561 if ( !HasAlignmentData() )
564 // pop next reader/alignment pair
565 ReaderAlignment nextReaderAlignment = m_alignments->TakeFirst();
566 BamReader* reader = nextReaderAlignment.first;
567 BamAlignment* alignment = nextReaderAlignment.second;
569 // store cached alignment into destination parameter (by copy)
572 // set char data if requested
573 if ( needCharData ) {
575 al.Filename = reader->GetFilename();
578 // load next alignment from reader & store in cache
579 SaveNextAlignment(reader, alignment);
585 // returns BAM file pointers to beginning of alignment data & resets alignment cache
586 bool BamMultiReaderPrivate::Rewind(void) {
588 // clear out alignment cache
589 m_alignments->Clear();
591 // attempt to rewind files
592 if ( !RewindReaders() ) {
593 cerr << "BamMultiReader ERROR: could not rewind file(s) successfully";
597 // reset cache & return success
598 UpdateAlignmentCache();
602 // returns BAM file pointers to beginning of alignment data
603 bool BamMultiReaderPrivate::RewindReaders(void) {
607 // iterate over readers
608 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
609 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
610 for ( ; readerIter != readerEnd; ++readerIter ) {
611 BamReader* reader = (*readerIter).first;
612 if ( reader == 0 ) continue;
614 // attempt rewind on BamReader
615 result &= reader->Rewind();
621 void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) {
623 // if can read alignment from reader, store in cache
624 if ( LoadNextAlignment(reader, alignment) )
625 m_alignments->Add( make_pair(reader, alignment) );
628 // sets the index caching mode on the readers
629 void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode mode) {
631 // iterate over readers
632 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
633 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
634 for ( ; readerIter != readerEnd; ++readerIter ) {
635 BamReader* reader = (*readerIter).first;
636 if ( reader == 0 ) continue;
638 // set reader's index cache mode
639 reader->SetIndexCacheMode(mode);
643 bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) {
645 // NB: While it may make sense to track readers in which we can
646 // successfully SetRegion, In practice a failure of SetRegion means "no
647 // alignments here." It makes sense to simply accept the failure,
648 // UpdateAlignments(), and continue.
650 // iterate over alignments
651 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
652 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
653 for ( ; readerIter != readerEnd; ++readerIter ) {
654 BamReader* reader = (*readerIter).first;
655 if ( reader == 0 ) continue;
657 // attempt to set BamReader's region of interest
658 if ( !reader->SetRegion(region) ) {
659 cerr << "BamMultiReader WARNING: could not jump " << reader->GetFilename() << " to "
660 << region.LeftRefID << ":" << region.LeftPosition << ".."
661 << region.RightRefID << ":" << region.RightPosition << endl;
665 // update alignment cache & return success
666 UpdateAlignmentCache();
670 void BamMultiReaderPrivate::SetSortOrder(const BamMultiReader::SortOrder& order) {
672 // skip if no change needed
673 if ( m_sortOrder == order ) return;
675 // set new sort order
678 // create new alignment cache based on sort order
679 IBamMultiMerger* newAlignmentCache = CreateMergerForCurrentSortOrder();
680 if ( newAlignmentCache == 0 ) return; // print error?
682 // copy old cache contents to new cache
683 while ( m_alignments->Size() > 0 ) {
684 ReaderAlignment value = m_alignments->TakeFirst(); // retrieves & 'pops'
685 newAlignmentCache->Add(value);
688 // remove old cache structure & point to new cache
690 m_alignments = newAlignmentCache;
693 // splits the entire header into a list of strings
694 const vector<string> BamMultiReaderPrivate::SplitHeaderText(const string& headerText) const {
696 stringstream header(headerText);
699 vector<string> lines;
700 while ( getline(header, item) )
701 lines.push_back(item);
705 // updates our alignment cache
706 void BamMultiReaderPrivate::UpdateAlignmentCache(void) {
708 // skip if invalid alignment cache
709 if ( m_alignments == 0 ) return;
712 m_alignments->Clear();
714 // iterate over readers
715 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
716 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
717 for ( ; readerIter != readerEnd; ++readerIter ) {
718 BamReader* reader = (*readerIter).first;
719 BamAlignment* alignment = (*readerIter).second;
720 if ( reader == 0 || alignment == 0 ) continue;
722 // save next alignment from each reader in cache
723 SaveNextAlignment(reader, alignment);
727 // ValidateReaders checks that all the readers point to BAM files representing
728 // alignments against the same set of reference sequences, and that the
729 // sequences are identically ordered. If these checks fail the operation of
730 // the multireader is undefined, so we force program exit.
731 void BamMultiReaderPrivate::ValidateReaders(void) const {
733 // retrieve first reader data
734 const BamReader* firstReader = m_readers.front().first;
735 if ( firstReader == 0 ) return;
736 const RefVector firstReaderRefData = firstReader->GetReferenceData();
737 const int firstReaderRefCount = firstReader->GetReferenceCount();
738 const int firstReaderRefSize = firstReaderRefData.size();
740 // iterate over all readers
741 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
742 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
743 for ( ; readerIter != readerEnd; ++readerIter ) {
745 // get current reader data
746 BamReader* reader = (*readerIter).first;
747 if ( reader == 0 ) continue;
748 const RefVector currentReaderRefData = reader->GetReferenceData();
749 const int currentReaderRefCount = reader->GetReferenceCount();
750 const int currentReaderRefSize = currentReaderRefData.size();
752 // init container iterators
753 RefVector::const_iterator firstRefIter = firstReaderRefData.begin();
754 RefVector::const_iterator firstRefEnd = firstReaderRefData.end();
755 RefVector::const_iterator currentRefIter = currentReaderRefData.begin();
757 // compare reference counts from BamReader ( & container size, in case of BR error)
758 if ( (currentReaderRefCount != firstReaderRefCount) ||
759 (firstReaderRefSize != currentReaderRefSize) )
761 cerr << "BamMultiReader ERROR: mismatched number of references in " << reader->GetFilename()
762 << " expected " << firstReaderRefCount
763 << " reference sequences but only found " << currentReaderRefCount << endl;
767 // this will be ok; we just checked above that we have identically-sized sets of references
768 // here we simply check if they are all, in fact, equal in content
769 while ( firstRefIter != firstRefEnd ) {
770 const RefData& firstRef = (*firstRefIter);
771 const RefData& currentRef = (*currentRefIter);
773 // compare reference name & length
774 if ( (firstRef.RefName != currentRef.RefName) ||
775 (firstRef.RefLength != currentRef.RefLength) )
777 cerr << "BamMultiReader ERROR: mismatched references found in " << reader->GetFilename()
778 << " expected: " << endl;
780 // print first reader's reference data
781 RefVector::const_iterator refIter = firstReaderRefData.begin();
782 RefVector::const_iterator refEnd = firstReaderRefData.end();
783 for ( ; refIter != refEnd; ++refIter ) {
784 const RefData& entry = (*refIter);
785 cerr << entry.RefName << " " << entry.RefLength << endl;
788 cerr << "but found: " << endl;
790 // print current reader's reference data
791 refIter = currentReaderRefData.begin();
792 refEnd = currentReaderRefData.end();
793 for ( ; refIter != refEnd; ++refIter ) {
794 const RefData& entry = (*refIter);
795 cerr << entry.RefName << " " << entry.RefLength << endl;