1 // ***************************************************************************
2 // BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 9 September 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Functionality for simultaneously reading multiple BAM files
8 // *************************************************************************
10 #include <api/BamAlignment.h>
11 #include <api/BamMultiReader.h>
12 #include <api/internal/BamMultiMerger_p.h>
13 #include <api/internal/BamMultiReader_p.h>
14 using namespace BamTools;
15 using namespace BamTools::Internal;
25 BamMultiReaderPrivate::BamMultiReaderPrivate(void)
27 , m_sortOrder(BamMultiReader::SortedByPosition)
31 BamMultiReaderPrivate::~BamMultiReaderPrivate(void) {
33 // close all open BAM readers
36 // clean up alignment cache
41 // close all BAM files
42 void BamMultiReaderPrivate::Close(void) {
43 CloseFiles( Filenames() );
46 // close requested BAM file
47 void BamMultiReaderPrivate::CloseFile(const string& filename) {
48 vector<string> filenames(1, filename);
49 CloseFiles(filenames);
52 // close requested BAM files
53 void BamMultiReaderPrivate::CloseFiles(const vector<string>& filenames) {
55 // iterate over filenames
56 vector<string>::const_iterator filesIter = filenames.begin();
57 vector<string>::const_iterator filesEnd = filenames.end();
58 for ( ; filesIter != filesEnd; ++filesIter ) {
59 const string& filename = (*filesIter);
60 if ( filename.empty() ) continue;
62 // iterate over readers
63 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
64 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
65 for ( ; readerIter != readerEnd; ++readerIter ) {
66 BamReader* reader = (*readerIter).first;
67 if ( reader == 0 ) continue;
69 // if reader matches requested filename
70 if ( reader->GetFilename() == filename ) {
72 // remove reader/alignment from alignment cache
73 m_alignments->Remove(reader);
75 // close & delete reader
80 // delete reader's alignment entry
81 BamAlignment* alignment = (*readerIter).second;
85 // remove reader from container
86 m_readers.erase(readerIter);
88 // on match, just go on to next filename
89 // (no need to keep looking and iterator is invalid now anyway)
95 // make sure alignment cache is cleared if all readers are now closed
96 if ( m_readers.empty() && m_alignments != 0 )
97 m_alignments->Clear();
100 // creates index files for BAM files that don't have them
101 bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) {
105 // iterate over readers
106 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
107 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
108 for ( ; readerIter != readerEnd; ++readerIter ) {
109 BamReader* reader = (*readerIter).first;
110 if ( reader == 0 ) continue;
112 // if reader doesn't have an index, create one
113 if ( !reader->HasIndex() )
114 result &= reader->CreateIndex(type);
120 IBamMultiMerger* BamMultiReaderPrivate::CreateMergerForCurrentSortOrder(void) const {
121 switch ( m_sortOrder ) {
122 case ( BamMultiReader::SortedByPosition ) : return new PositionMultiMerger;
123 case ( BamMultiReader::SortedByReadName ) : return new ReadNameMultiMerger;
124 case ( BamMultiReader::Unsorted ) : return new UnsortedMultiMerger;
126 cerr << "BamMultiReader ERROR: requested sort order is unknown" << endl;
131 const string BamMultiReaderPrivate::ExtractReadGroup(const string& headerLine) const {
133 string readGroup("");
134 stringstream headerLineSs(headerLine);
137 // parse @RG header line, looking for the ID: tag
138 while( getline(headerLineSs, part, '\t') ) {
139 stringstream partSs(part);
141 getline(partSs, subtag, ':');
142 if ( subtag == "ID" ) {
143 getline(partSs, readGroup, ':');
150 const vector<string> BamMultiReaderPrivate::Filenames(void) const {
152 // init filename container
153 vector<string> filenames;
154 filenames.reserve( m_readers.size() );
156 // iterate over readers
157 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
158 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
159 for ( ; readerIter != readerEnd; ++readerIter ) {
160 const BamReader* reader = (*readerIter).first;
161 if ( reader == 0 ) continue;
163 // store filename if not empty
164 const string filename = reader->GetFilename();
165 if ( !filename.empty() )
166 filenames.push_back( reader->GetFilename() );
173 SamHeader BamMultiReaderPrivate::GetHeader(void) const {
174 string text = GetHeaderText();
175 return SamHeader(text);
178 // makes a virtual, unified header for all the bam files in the multireader
179 string BamMultiReaderPrivate::GetHeaderText(void) const {
181 // TODO: merge SamHeader objects instead of parsing string data (again)
183 // if only one reader is open
184 if ( m_readers.size() == 1 ) {
186 // just return reader's header text
187 const ReaderAlignment& ra = m_readers.front();
188 const BamReader* reader = ra.first;
189 if ( reader ) return reader->GetHeaderText();
195 string mergedHeader("");
196 map<string, bool> readGroups;
198 // foreach extraction entry (each BAM file)
199 vector<ReaderAlignment>::const_iterator readerBegin = m_readers.begin();
200 vector<ReaderAlignment>::const_iterator readerIter = readerBegin;
201 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
202 for ( ; readerIter != readerEnd; ++readerIter ) {
203 const BamReader* reader = (*readerIter).first;
204 if ( reader == 0 ) continue;
206 // get header from reader
207 string headerText = reader->GetHeaderText();
208 if ( headerText.empty() ) continue;
210 // store header text in lines
211 map<string, bool> currentFileReadGroups;
212 const vector<string> lines = SplitHeaderText(headerText);
214 // iterate over header lines
215 vector<string>::const_iterator linesIter = lines.begin();
216 vector<string>::const_iterator linesEnd = lines.end();
217 for ( ; linesIter != linesEnd; ++linesIter ) {
219 // get next line from header, skip if empty
220 const string headerLine = (*linesIter);
221 if ( headerLine.empty() ) continue;
223 // if first file, save HD & SQ entries
224 // TODO: what if first file has empty header, should just check for empty 'mergedHeader' instead ?
225 if ( readerIter == readerBegin ) {
226 if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
227 mergedHeader.append(headerLine.c_str());
228 mergedHeader.append(1, '\n');
232 // (for all files) append RG entries if they are unique
233 if ( headerLine.find("@RG") == 0 ) {
235 // extract read group name from line
236 const string readGroup = ExtractReadGroup(headerLine);
238 // make sure not to duplicate @RG entries
239 if ( readGroups.find(readGroup) == readGroups.end() ) {
240 mergedHeader.append(headerLine.c_str() );
241 mergedHeader.append(1, '\n');
242 readGroups[readGroup] = true;
243 currentFileReadGroups[readGroup] = true;
245 // warn iff we are reading one file and discover duplicated @RG tags in the header
246 // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
247 if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) {
248 cerr << "BamMultiReader WARNING: duplicate @RG tag " << readGroup
249 << " entry in header of " << reader->GetFilename() << endl;
256 // return merged header text
260 // get next alignment among all files
261 bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) {
262 return PopNextCachedAlignment(al, true);
265 // get next alignment among all files without parsing character data from alignments
266 bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) {
267 return PopNextCachedAlignment(al, false);
270 // ---------------------------------------------------------------------------------------
272 // NB: The following GetReferenceX() functions assume that we have identical
273 // references for all BAM files. We enforce this by invoking the
274 // ValidateReaders() method to verify that our reference data is the same
275 // across all files on Open - so we will not encounter a situation in which
276 // there is a mismatch and we are still live.
278 // ---------------------------------------------------------------------------------------
280 // returns the number of reference sequences
281 int BamMultiReaderPrivate::GetReferenceCount(void) const {
283 // handle empty multireader
284 if ( m_readers.empty() )
287 // return reference count from first reader
288 const ReaderAlignment& ra = m_readers.front();
289 const BamReader* reader = ra.first;
290 if ( reader ) return reader->GetReferenceCount();
296 // returns vector of reference objects
297 const RefVector BamMultiReaderPrivate::GetReferenceData(void) const {
299 // handle empty multireader
300 if ( m_readers.empty() )
303 // return reference data from first BamReader
304 const ReaderAlignment& ra = m_readers.front();
305 const BamReader* reader = ra.first;
306 if ( reader ) return reader->GetReferenceData();
312 // returns refID from reference name
313 int BamMultiReaderPrivate::GetReferenceID(const string& refName) const {
315 // handle empty multireader
316 if ( m_readers.empty() )
319 // return reference ID from first BamReader
320 const ReaderAlignment& ra = m_readers.front();
321 const BamReader* reader = ra.first;
322 if ( reader ) return reader->GetReferenceID(refName);
327 // ---------------------------------------------------------------------------------------
329 // checks if any readers still have alignments
330 bool BamMultiReaderPrivate::HasAlignmentData(void) const {
331 if ( m_alignments == 0 )
333 return !m_alignments->IsEmpty();
336 // returns true if all readers have index data available
337 // this is useful to indicate whether Jump() or SetRegion() are possible
338 bool BamMultiReaderPrivate::HasIndexes(void) const {
340 // handle empty multireader
341 if ( m_readers.empty() )
346 // iterate over readers
347 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
348 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
349 for ( ; readerIter != readerEnd; ++readerIter ) {
350 const BamReader* reader = (*readerIter).first;
351 if ( reader == 0 ) continue;
353 // see if current reader has index data
354 result &= reader->HasIndex();
360 // returns true if multireader has open readers
361 bool BamMultiReaderPrivate::HasOpenReaders(void) {
363 // iterate over readers
364 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
365 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
366 for ( ; readerIter != readerEnd; ++readerIter ) {
367 const BamReader* reader = (*readerIter).first;
368 if ( reader == 0 ) continue;
370 // return true whenever an open reader is found
371 if ( reader->IsOpen() ) return true;
378 // performs random-access jump using (refID, position) as a left-bound
379 bool BamMultiReaderPrivate::Jump(int refID, int position) {
381 // NB: While it may make sense to track readers in which we can
382 // successfully Jump, in practice a failure of Jump means "no
383 // alignments here." It makes sense to simply accept the failure,
384 // UpdateAlignments(), and continue.
386 // iterate over readers
387 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
388 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
389 for ( ; readerIter != readerEnd; ++readerIter ) {
390 BamReader* reader = (*readerIter).first;
391 if ( reader == 0 ) continue;
393 // attempt jump() on each
394 if ( !reader->Jump(refID, position) ) {
395 cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename()
396 << " to " << refID << ":" << position << endl;
400 // update alignment cache & return success
401 UpdateAlignmentCache();
405 bool BamMultiReaderPrivate::LoadNextAlignment(BamReader* reader, BamAlignment* alignment) {
406 // lazy building of alignment's char data,
407 // only populated on demand by sorting merger or client call to GetNextAlignment()
408 return reader->GetNextAlignmentCore(*alignment);
411 // locate (& load) index files for BAM readers that don't already have one loaded
412 bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) {
416 // iterate over readers
417 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
418 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
419 for ( ; readerIter != readerEnd; ++readerIter ) {
420 BamReader* reader = (*readerIter).first;
421 if ( reader == 0 ) continue;
423 // if reader has no index, try to locate one
424 if ( !reader->HasIndex() )
425 result &= reader->LocateIndex(preferredType);
432 bool BamMultiReaderPrivate::Open(const vector<string>& filenames) {
434 // create alignment cache if neccessary
435 if ( m_alignments == 0 ) {
436 m_alignments = CreateMergerForCurrentSortOrder();
437 if ( m_alignments == 0 ) return false;
440 // put all current readers back at beginning (refreshes alignment cache)
443 // iterate over filenames
444 vector<string>::const_iterator filenameIter = filenames.begin();
445 vector<string>::const_iterator filenameEnd = filenames.end();
446 for ( ; filenameIter != filenameEnd; ++filenameIter ) {
447 const string& filename = (*filenameIter);
448 if ( filename.empty() ) continue;
450 // attempt to open BamReader on filename
451 bool openedOk = false;
452 ReaderAlignment ra = OpenReader(filename, &openedOk);
454 m_readers.push_back(ra); // store reader/alignment in local list
455 m_alignments->Add(ra); // add reader/alignment to sorting cache
459 // if more than one reader open, check for reference consistency
460 if ( m_readers.size() > 1 )
467 bool BamMultiReaderPrivate::OpenFile(const std::string& filename) {
468 vector<string> filenames(1, filename);
469 return Open(filenames);
472 bool BamMultiReaderPrivate::OpenIndexes(const vector<string>& indexFilenames) {
474 // TODO: This needs to be cleaner - should not assume same order.
475 // And either way, shouldn't start at first reader. Should start at
476 // first reader without an index?
478 // make sure same number of index filenames as readers
479 if ( m_readers.size() != indexFilenames.size() || !indexFilenames.empty() )
485 // iterate over BamReaders
486 vector<string>::const_iterator indexFilenameIter = indexFilenames.begin();
487 vector<string>::const_iterator indexFilenameEnd = indexFilenames.end();
488 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
489 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
490 for ( ; readerIter != readerEnd; ++readerIter ) {
491 BamReader* reader = (*readerIter).first;
493 // open index filename on reader
495 const string& indexFilename = (*indexFilenameIter);
496 result &= reader->OpenIndex(indexFilename);
499 // increment filename iterator, skip if no more index files to open
500 if ( ++indexFilenameIter == indexFilenameEnd )
504 // TODO: validation ??
506 // return success/fail
510 ReaderAlignment BamMultiReaderPrivate::OpenReader(const string& filename, bool* ok) {
515 // create new BamReader & BamAlignment
516 BamReader* reader = new BamReader;
517 BamAlignment* alignment = new BamAlignment;
519 // if reader opens OK
520 if ( reader->Open(filename) ) {
522 // if first alignment reads OK
523 if ( LoadNextAlignment(reader, alignment) ) {
525 return make_pair(reader, alignment);
528 // could not read alignment
530 cerr << "BamMultiReader WARNING: Could not read first alignment from "
531 << filename << ", ignoring file" << endl;
535 // reader could not open
537 cerr << "BamMultiReader WARNING: Could not open "
538 << filename << ", ignoring file" << endl;
541 // if we get here, there was a problem with this BAM file (opening or reading)
542 // clean up memory allocation & return null pointer
545 return ReaderAlignment();
548 // print associated filenames to stdout
549 void BamMultiReaderPrivate::PrintFilenames(void) const {
550 const vector<string>& filenames = Filenames();
551 vector<string>::const_iterator filenameIter = filenames.begin();
552 vector<string>::const_iterator filenameEnd = filenames.end();
553 for ( ; filenameIter != filenameEnd; ++filenameIter )
554 cout << (*filenameIter) << endl;
557 bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) {
559 // bail out if no more data to process
560 if ( !HasAlignmentData() )
563 // pop next reader/alignment pair
564 ReaderAlignment nextReaderAlignment = m_alignments->TakeFirst();
565 BamReader* reader = nextReaderAlignment.first;
566 BamAlignment* alignment = nextReaderAlignment.second;
568 // store cached alignment into destination parameter (by copy)
571 // set char data if requested
572 if ( needCharData ) {
574 al.Filename = reader->GetFilename();
577 // load next alignment from reader & store in cache
578 SaveNextAlignment(reader, alignment);
584 // returns BAM file pointers to beginning of alignment data & resets alignment cache
585 bool BamMultiReaderPrivate::Rewind(void) {
587 // clear out alignment cache
588 m_alignments->Clear();
590 // attempt to rewind files
591 if ( !RewindReaders() ) {
592 cerr << "BamMultiReader ERROR: could not rewind file(s) successfully";
596 // reset cache & return success
597 UpdateAlignmentCache();
601 // returns BAM file pointers to beginning of alignment data
602 bool BamMultiReaderPrivate::RewindReaders(void) {
606 // iterate over readers
607 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
608 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
609 for ( ; readerIter != readerEnd; ++readerIter ) {
610 BamReader* reader = (*readerIter).first;
611 if ( reader == 0 ) continue;
613 // attempt rewind on BamReader
614 result &= reader->Rewind();
620 void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) {
622 // if can read alignment from reader, store in cache
623 if ( LoadNextAlignment(reader, alignment) )
624 m_alignments->Add( make_pair(reader, alignment) );
627 // sets the index caching mode on the readers
628 void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode mode) {
630 // iterate over readers
631 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
632 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
633 for ( ; readerIter != readerEnd; ++readerIter ) {
634 BamReader* reader = (*readerIter).first;
635 if ( reader == 0 ) continue;
637 // set reader's index cache mode
638 reader->SetIndexCacheMode(mode);
642 bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) {
644 // NB: While it may make sense to track readers in which we can
645 // successfully SetRegion, In practice a failure of SetRegion means "no
646 // alignments here." It makes sense to simply accept the failure,
647 // UpdateAlignments(), and continue.
649 // iterate over alignments
650 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
651 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
652 for ( ; readerIter != readerEnd; ++readerIter ) {
653 BamReader* reader = (*readerIter).first;
654 if ( reader == 0 ) continue;
656 // attempt to set BamReader's region of interest
657 if ( !reader->SetRegion(region) ) {
658 cerr << "BamMultiReader WARNING: could not jump " << reader->GetFilename() << " to "
659 << region.LeftRefID << ":" << region.LeftPosition << ".."
660 << region.RightRefID << ":" << region.RightPosition << endl;
664 // update alignment cache & return success
665 UpdateAlignmentCache();
669 void BamMultiReaderPrivate::SetSortOrder(const BamMultiReader::SortOrder& order) {
671 // skip if no change needed
672 if ( m_sortOrder == order ) return;
674 // set new sort order
677 // create new alignment cache based on sort order
678 IBamMultiMerger* newAlignmentCache = CreateMergerForCurrentSortOrder();
679 if ( newAlignmentCache == 0 ) return; // print error?
681 // copy old cache contents to new cache
682 while ( m_alignments->Size() > 0 ) {
683 ReaderAlignment value = m_alignments->TakeFirst(); // retrieves & 'pops'
684 newAlignmentCache->Add(value);
687 // remove old cache structure & point to new cache
689 m_alignments = newAlignmentCache;
692 // splits the entire header into a list of strings
693 const vector<string> BamMultiReaderPrivate::SplitHeaderText(const string& headerText) const {
695 stringstream header(headerText);
698 vector<string> lines;
699 while ( getline(header, item) )
700 lines.push_back(item);
704 // updates our alignment cache
705 void BamMultiReaderPrivate::UpdateAlignmentCache(void) {
707 // skip if invalid alignment cache
708 if ( m_alignments == 0 ) return;
711 m_alignments->Clear();
713 // iterate over readers
714 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
715 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
716 for ( ; readerIter != readerEnd; ++readerIter ) {
717 BamReader* reader = (*readerIter).first;
718 BamAlignment* alignment = (*readerIter).second;
719 if ( reader == 0 || alignment == 0 ) continue;
721 // save next alignment from each reader in cache
722 SaveNextAlignment(reader, alignment);
726 // ValidateReaders checks that all the readers point to BAM files representing
727 // alignments against the same set of reference sequences, and that the
728 // sequences are identically ordered. If these checks fail the operation of
729 // the multireader is undefined, so we force program exit.
730 void BamMultiReaderPrivate::ValidateReaders(void) const {
732 // retrieve first reader data
733 const BamReader* firstReader = m_readers.front().first;
734 if ( firstReader == 0 ) return;
735 const RefVector firstReaderRefData = firstReader->GetReferenceData();
736 const int firstReaderRefCount = firstReader->GetReferenceCount();
737 const int firstReaderRefSize = firstReaderRefData.size();
739 // iterate over all readers
740 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
741 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
742 for ( ; readerIter != readerEnd; ++readerIter ) {
744 // get current reader data
745 BamReader* reader = (*readerIter).first;
746 if ( reader == 0 ) continue;
747 const RefVector currentReaderRefData = reader->GetReferenceData();
748 const int currentReaderRefCount = reader->GetReferenceCount();
749 const int currentReaderRefSize = currentReaderRefData.size();
751 // init container iterators
752 RefVector::const_iterator firstRefIter = firstReaderRefData.begin();
753 RefVector::const_iterator firstRefEnd = firstReaderRefData.end();
754 RefVector::const_iterator currentRefIter = currentReaderRefData.begin();
756 // compare reference counts from BamReader ( & container size, in case of BR error)
757 if ( (currentReaderRefCount != firstReaderRefCount) ||
758 (firstReaderRefSize != currentReaderRefSize) )
760 cerr << "BamMultiReader ERROR: mismatched number of references in " << reader->GetFilename()
761 << " expected " << firstReaderRefCount
762 << " reference sequences but only found " << currentReaderRefCount << endl;
766 // this will be ok; we just checked above that we have identically-sized sets of references
767 // here we simply check if they are all, in fact, equal in content
768 while ( firstRefIter != firstRefEnd ) {
769 const RefData& firstRef = (*firstRefIter);
770 const RefData& currentRef = (*currentRefIter);
772 // compare reference name & length
773 if ( (firstRef.RefName != currentRef.RefName) ||
774 (firstRef.RefLength != currentRef.RefLength) )
776 cerr << "BamMultiReader ERROR: mismatched references found in " << reader->GetFilename()
777 << " expected: " << endl;
779 // print first reader's reference data
780 RefVector::const_iterator refIter = firstReaderRefData.begin();
781 RefVector::const_iterator refEnd = firstReaderRefData.end();
782 for ( ; refIter != refEnd; ++refIter ) {
783 const RefData& entry = (*refIter);
784 cerr << entry.RefName << " " << entry.RefLength << endl;
787 cerr << "but found: " << endl;
789 // print current reader's reference data
790 refIter = currentReaderRefData.begin();
791 refEnd = currentReaderRefData.end();
792 for ( ; refIter != refEnd; ++refIter ) {
793 const RefData& entry = (*refIter);
794 cerr << entry.RefName << " " << entry.RefLength << endl;