1 // ***************************************************************************
2 // BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 5 April 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Functionality for simultaneously reading multiple BAM files
9 // *************************************************************************
11 #include <api/BamAlignment.h>
12 #include <api/BamMultiReader.h>
13 #include <api/internal/BamMultiMerger_p.h>
14 #include <api/internal/BamMultiReader_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
26 BamMultiReaderPrivate::BamMultiReaderPrivate(void)
29 , m_sortOrder(BamMultiReader::SortedByPosition)
33 BamMultiReaderPrivate::~BamMultiReaderPrivate(void) {
35 // close all open BAM readers
38 // clean up alignment cache
43 // close all BAM files
44 void BamMultiReaderPrivate::Close(void) {
45 CloseFiles( Filenames() );
48 // close requested BAM file
49 void BamMultiReaderPrivate::CloseFile(const string& filename) {
50 vector<string> filenames(1, filename);
51 CloseFiles(filenames);
54 // close requested BAM files
55 void BamMultiReaderPrivate::CloseFiles(const vector<string>& filenames) {
57 // iterate over filenames
58 vector<string>::const_iterator filesIter = filenames.begin();
59 vector<string>::const_iterator filesEnd = filenames.end();
60 for ( ; filesIter != filesEnd; ++filesIter ) {
61 const string& filename = (*filesIter);
62 if ( filename.empty() ) continue;
64 // iterate over readers
65 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
66 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
67 for ( ; readerIter != readerEnd; ++readerIter ) {
68 BamReader* reader = (*readerIter).first;
69 if ( reader == 0 ) continue;
71 // if reader matches requested filename
72 if ( reader->GetFilename() == filename ) {
74 // remove reader/alignment from alignment cache
75 m_alignments->Remove(reader);
77 // close & delete reader
82 // delete reader's alignment entry
83 BamAlignment* alignment = (*readerIter).second;
87 // remove reader from container
88 m_readers.erase(readerIter);
90 // on match, just go on to next filename
91 // (no need to keep looking and iterator is invalid now anyway)
97 // make sure alignment cache is cleared if all readers are now closed
98 if ( m_readers.empty() && m_alignments != 0 )
99 m_alignments->Clear();
102 // creates index files for BAM files that don't have them
103 bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) {
107 // iterate over readers
108 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
109 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
110 for ( ; readerIter != readerEnd; ++readerIter ) {
111 BamReader* reader = (*readerIter).first;
112 if ( reader == 0 ) continue;
114 // if reader doesn't have an index, create one
115 if ( !reader->HasIndex() )
116 result &= reader->CreateIndex(type);
122 IBamMultiMerger* BamMultiReaderPrivate::CreateMergerForCurrentSortOrder(void) const {
123 switch ( m_sortOrder ) {
124 case ( BamMultiReader::SortedByPosition ) : return new PositionMultiMerger;
125 case ( BamMultiReader::SortedByReadName ) : return new ReadNameMultiMerger;
126 case ( BamMultiReader::Unsorted ) : return new UnsortedMultiMerger;
128 cerr << "BamMultiReader ERROR: requested sort order is unknown" << endl;
133 const string BamMultiReaderPrivate::ExtractReadGroup(const string& headerLine) const {
135 string readGroup("");
136 stringstream headerLineSs(headerLine);
139 // parse @RG header line, looking for the ID: tag
140 while( getline(headerLineSs, part, '\t') ) {
141 stringstream partSs(part);
143 getline(partSs, subtag, ':');
144 if ( subtag == "ID" ) {
145 getline(partSs, readGroup, ':');
152 const vector<string> BamMultiReaderPrivate::Filenames(void) const {
154 // init filename container
155 vector<string> filenames;
156 filenames.reserve( m_readers.size() );
158 // iterate over readers
159 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
160 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
161 for ( ; readerIter != readerEnd; ++readerIter ) {
162 const BamReader* reader = (*readerIter).first;
163 if ( reader == 0 ) continue;
165 // store filename if not empty
166 const string filename = reader->GetFilename();
167 if ( !filename.empty() )
168 filenames.push_back( reader->GetFilename() );
175 SamHeader BamMultiReaderPrivate::GetHeader(void) const {
176 string text = GetHeaderText();
177 return SamHeader(text);
180 // makes a virtual, unified header for all the bam files in the multireader
181 string BamMultiReaderPrivate::GetHeaderText(void) const {
183 // TODO: merge SamHeader objects instead of parsing string data (again)
185 // if only one reader is open
186 if ( m_readers.size() == 1 ) {
188 // just return reader's header text
189 const ReaderAlignment& ra = m_readers.front();
190 const BamReader* reader = ra.first;
191 if ( reader ) return reader->GetHeaderText();
197 string mergedHeader("");
198 map<string, bool> readGroups;
200 // foreach extraction entry (each BAM file)
201 vector<ReaderAlignment>::const_iterator readerBegin = m_readers.begin();
202 vector<ReaderAlignment>::const_iterator readerIter = readerBegin;
203 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
204 for ( ; readerIter != readerEnd; ++readerIter ) {
205 const BamReader* reader = (*readerIter).first;
206 if ( reader == 0 ) continue;
208 // get header from reader
209 string headerText = reader->GetHeaderText();
210 if ( headerText.empty() ) continue;
212 // store header text in lines
213 map<string, bool> currentFileReadGroups;
214 const vector<string> lines = SplitHeaderText(headerText);
216 // iterate over header lines
217 vector<string>::const_iterator linesIter = lines.begin();
218 vector<string>::const_iterator linesEnd = lines.end();
219 for ( ; linesIter != linesEnd; ++linesIter ) {
221 // get next line from header, skip if empty
222 const string headerLine = (*linesIter);
223 if ( headerLine.empty() ) continue;
225 // if first file, save HD & SQ entries
226 // TODO: what if first file has empty header, should just check for empty 'mergedHeader' instead ?
227 if ( readerIter == readerBegin ) {
228 if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
229 mergedHeader.append(headerLine.c_str());
230 mergedHeader.append(1, '\n');
234 // (for all files) append RG entries if they are unique
235 if ( headerLine.find("@RG") == 0 ) {
237 // extract read group name from line
238 const string readGroup = ExtractReadGroup(headerLine);
240 // make sure not to duplicate @RG entries
241 if ( readGroups.find(readGroup) == readGroups.end() ) {
242 mergedHeader.append(headerLine.c_str() );
243 mergedHeader.append(1, '\n');
244 readGroups[readGroup] = true;
245 currentFileReadGroups[readGroup] = true;
247 // warn iff we are reading one file and discover duplicated @RG tags in the header
248 // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
249 if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) {
250 cerr << "BamMultiReader WARNING: duplicate @RG tag " << readGroup
251 << " entry in header of " << reader->GetFilename() << endl;
258 // return merged header text
262 // get next alignment among all files
263 bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) {
264 m_isCoreMode = false;
265 return LoadNextAlignment(al);
268 // get next alignment among all files without parsing character data from alignments
269 bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) {
271 return LoadNextAlignment(al);
274 // ---------------------------------------------------------------------------------------
276 // NB: The following GetReferenceX() functions assume that we have identical
277 // references for all BAM files. We enforce this by invoking the
278 // ValidateReaders() method to verify that our reference data is the same
279 // across all files on Open - so we will not encounter a situation in which
280 // there is a mismatch and we are still live.
282 // ---------------------------------------------------------------------------------------
284 // returns the number of reference sequences
285 int BamMultiReaderPrivate::GetReferenceCount(void) const {
287 // handle empty multireader
288 if ( m_readers.empty() )
291 // return reference count from first reader
292 const ReaderAlignment& ra = m_readers.front();
293 const BamReader* reader = ra.first;
294 if ( reader ) return reader->GetReferenceCount();
300 // returns vector of reference objects
301 const RefVector BamMultiReaderPrivate::GetReferenceData(void) const {
303 // handle empty multireader
304 if ( m_readers.empty() )
307 // return reference data from first BamReader
308 const ReaderAlignment& ra = m_readers.front();
309 const BamReader* reader = ra.first;
310 if ( reader ) return reader->GetReferenceData();
316 // returns refID from reference name
317 int BamMultiReaderPrivate::GetReferenceID(const string& refName) const {
319 // handle empty multireader
320 if ( m_readers.empty() )
323 // return reference ID from first BamReader
324 const ReaderAlignment& ra = m_readers.front();
325 const BamReader* reader = ra.first;
326 if ( reader ) return reader->GetReferenceID(refName);
331 // ---------------------------------------------------------------------------------------
333 // checks if any readers still have alignments
334 bool BamMultiReaderPrivate::HasAlignmentData(void) const {
335 if ( m_alignments == 0 )
337 return !m_alignments->IsEmpty();
340 // returns true if all readers have index data available
341 // this is useful to indicate whether Jump() or SetRegion() are possible
342 bool BamMultiReaderPrivate::HasIndexes(void) const {
344 // handle empty multireader
345 if ( m_readers.empty() )
350 // iterate over readers
351 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
352 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
353 for ( ; readerIter != readerEnd; ++readerIter ) {
354 const BamReader* reader = (*readerIter).first;
355 if ( reader == 0 ) continue;
357 // see if current reader has index data
358 result &= reader->HasIndex();
364 // returns true if multireader has open readers
365 bool BamMultiReaderPrivate::HasOpenReaders(void) {
367 // iterate over readers
368 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
369 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
370 for ( ; readerIter != readerEnd; ++readerIter ) {
371 const BamReader* reader = (*readerIter).first;
372 if ( reader == 0 ) continue;
374 // return true whenever an open reader is found
375 if ( reader->IsOpen() ) return true;
382 // performs random-access jump using (refID, position) as a left-bound
383 bool BamMultiReaderPrivate::Jump(int refID, int position) {
385 // NB: While it may make sense to track readers in which we can
386 // successfully Jump, in practice a failure of Jump means "no
387 // alignments here." It makes sense to simply accept the failure,
388 // UpdateAlignments(), and continue.
390 // iterate over readers
391 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
392 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
393 for ( ; readerIter != readerEnd; ++readerIter ) {
394 BamReader* reader = (*readerIter).first;
395 if ( reader == 0 ) continue;
397 // attempt jump() on each
398 if ( !reader->Jump(refID, position) ) {
399 cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename()
400 << " to " << refID << ":" << position << endl;
404 // update alignment cache & return success
405 UpdateAlignmentCache();
409 bool BamMultiReaderPrivate::LoadNextAlignment(BamAlignment& al) {
411 // bail out if no more data to process
412 if ( !HasAlignmentData() )
415 // "pop" next alignment and reader
416 ReaderAlignment nextReaderAlignment = m_alignments->TakeFirst();
417 BamReader* reader = nextReaderAlignment.first;
418 BamAlignment* alignment = nextReaderAlignment.second;
420 // store cached alignment into destination parameter (by copy)
423 // peek to next alignment & store in cache
424 SaveNextAlignment(reader, alignment);
430 // locate (& load) index files for BAM readers that don't already have one loaded
431 bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) {
435 // iterate over readers
436 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
437 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
438 for ( ; readerIter != readerEnd; ++readerIter ) {
439 BamReader* reader = (*readerIter).first;
440 if ( reader == 0 ) continue;
442 // if reader has no index, try to locate one
443 if ( !reader->HasIndex() )
444 result &= reader->LocateIndex(preferredType);
451 bool BamMultiReaderPrivate::Open(const vector<string>& filenames) {
453 // create alignment cache if neccessary
454 if ( m_alignments == 0 ) {
455 m_alignments = CreateMergerForCurrentSortOrder();
456 if ( m_alignments == 0 ) return false;
459 // iterate over filenames
460 vector<string>::const_iterator filenameIter = filenames.begin();
461 vector<string>::const_iterator filenameEnd = filenames.end();
462 for ( ; filenameIter != filenameEnd; ++filenameIter ) {
463 const string& filename = (*filenameIter);
464 if ( filename.empty() ) continue;
466 // attempt to open BamReader on filename
467 BamReader* reader = OpenReader(filename);
468 if ( reader == 0 ) continue;
470 // store reader with new alignment
471 m_readers.push_back( make_pair(reader, new BamAlignment) );
474 // validate & rewind any opened readers, also refreshes alignment cache
475 if ( !m_readers.empty() ) {
484 bool BamMultiReaderPrivate::OpenFile(const std::string& filename) {
485 vector<string> filenames(1, filename);
486 return Open(filenames);
489 bool BamMultiReaderPrivate::OpenIndexes(const vector<string>& indexFilenames) {
491 // TODO: This needs to be cleaner - should not assume same order.
492 // And either way, shouldn't start at first reader. Should start at
493 // first reader without an index?
495 // make sure same number of index filenames as readers
496 if ( m_readers.size() != indexFilenames.size() || !indexFilenames.empty() )
502 // iterate over BamReaders
503 vector<string>::const_iterator indexFilenameIter = indexFilenames.begin();
504 vector<string>::const_iterator indexFilenameEnd = indexFilenames.end();
505 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
506 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
507 for ( ; readerIter != readerEnd; ++readerIter ) {
508 BamReader* reader = (*readerIter).first;
510 // open index filename on reader
512 const string& indexFilename = (*indexFilenameIter);
513 result &= reader->OpenIndex(indexFilename);
516 // increment filename iterator, skip if no more index files to open
517 if ( ++indexFilenameIter == indexFilenameEnd )
521 // TODO: validation ??
523 // return success/fail
527 BamReader* BamMultiReaderPrivate::OpenReader(const std::string& filename) {
529 // create new BamReader
530 BamReader* reader = new BamReader;
532 // if reader opens OK
533 if ( reader->Open(filename) ) {
535 // attempt to read first alignment (sanity check)
536 // if ok, then return BamReader pointer
538 if ( reader->GetNextAlignmentCore(al) )
541 // could not read alignment
543 cerr << "BamMultiReader WARNING: Could not read first alignment from "
544 << filename << ", ignoring file" << endl;
548 // reader could not open
550 cerr << "BamMultiReader WARNING: Could not open "
551 << filename << ", ignoring file" << endl;
554 // if we get here, there was a problem with this BAM file (opening or reading)
555 // clean up memory allocation & return null pointer
560 // print associated filenames to stdout
561 void BamMultiReaderPrivate::PrintFilenames(void) const {
562 const vector<string>& filenames = Filenames();
563 vector<string>::const_iterator filenameIter = filenames.begin();
564 vector<string>::const_iterator filenameEnd = filenames.end();
565 for ( ; filenameIter != filenameEnd; ++filenameIter )
566 cout << (*filenameIter) << endl;
569 // returns BAM file pointers to beginning of alignment data & resets alignment cache
570 bool BamMultiReaderPrivate::Rewind(void) {
572 // clear out alignment cache
573 m_alignments->Clear();
575 // attempt to rewind files
576 if ( !RewindReaders() ) {
577 cerr << "BamMultiReader ERROR: could not rewind file(s) successfully";
581 // reset cache & return success
582 UpdateAlignmentCache();
586 // returns BAM file pointers to beginning of alignment data
587 bool BamMultiReaderPrivate::RewindReaders(void) {
591 // iterate over readers
592 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
593 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
594 for ( ; readerIter != readerEnd; ++readerIter ) {
595 BamReader* reader = (*readerIter).first;
596 if ( reader == 0 ) continue;
598 // attempt rewind on BamReader
599 result &= reader->Rewind();
605 void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) {
607 // must be in core mode && NOT sorting by read name to call GNACore()
608 if ( m_isCoreMode && m_sortOrder != BamMultiReader::SortedByReadName ) {
609 if ( reader->GetNextAlignmentCore(*alignment) )
610 m_alignments->Add( make_pair(reader, alignment) );
613 // not in core mode and/or sorting by readname, must call GNA()
615 if ( reader->GetNextAlignment(*alignment) )
616 m_alignments->Add( make_pair(reader, alignment) );
620 // sets the index caching mode on the readers
621 void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode mode) {
623 // iterate over readers
624 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
625 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
626 for ( ; readerIter != readerEnd; ++readerIter ) {
627 BamReader* reader = (*readerIter).first;
628 if ( reader == 0 ) continue;
630 // set reader's index cache mode
631 reader->SetIndexCacheMode(mode);
635 bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) {
637 // NB: While it may make sense to track readers in which we can
638 // successfully SetRegion, In practice a failure of SetRegion means "no
639 // alignments here." It makes sense to simply accept the failure,
640 // UpdateAlignments(), and continue.
642 // iterate over alignments
643 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
644 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
645 for ( ; readerIter != readerEnd; ++readerIter ) {
646 BamReader* reader = (*readerIter).first;
647 if ( reader == 0 ) continue;
649 // attempt to set BamReader's region of interest
650 if ( !reader->SetRegion(region) ) {
651 cerr << "BamMultiReader WARNING: could not jump " << reader->GetFilename() << " to "
652 << region.LeftRefID << ":" << region.LeftPosition << ".."
653 << region.RightRefID << ":" << region.RightPosition << endl;
657 // update alignment cache & return success
658 UpdateAlignmentCache();
662 void BamMultiReaderPrivate::SetSortOrder(const BamMultiReader::SortOrder& order) {
664 // skip if no change needed
665 if ( m_sortOrder == order ) return;
667 // set new sort order
670 // create new alignment cache based on sort order
671 IBamMultiMerger* newAlignmentCache = CreateMergerForCurrentSortOrder();
672 if ( newAlignmentCache == 0 ) return; // print error?
674 // copy old cache contents to new cache
675 while ( m_alignments->Size() > 0 ) {
676 ReaderAlignment value = m_alignments->TakeFirst(); // retrieves & 'pops'
677 newAlignmentCache->Add(value);
680 // remove old cache structure & point to new cache
682 m_alignments = newAlignmentCache;
685 // splits the entire header into a list of strings
686 const vector<string> BamMultiReaderPrivate::SplitHeaderText(const string& headerText) const {
688 stringstream header(headerText);
691 vector<string> lines;
692 while ( getline(header, item) )
693 lines.push_back(item);
697 // updates our alignment cache
698 void BamMultiReaderPrivate::UpdateAlignmentCache(void) {
700 // skip if invalid alignment cache
701 if ( m_alignments == 0 ) return;
704 m_alignments->Clear();
706 // seed cache with fully-populated alignments
707 // further updates will fill with full/core-only as requested
708 m_isCoreMode = false;
710 // iterate over readers
711 vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
712 vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
713 for ( ; readerIter != readerEnd; ++readerIter ) {
714 BamReader* reader = (*readerIter).first;
715 BamAlignment* alignment = (*readerIter).second;
716 if ( reader == 0 || alignment == 0 ) continue;
718 // save next alignment from each reader in cache
719 SaveNextAlignment(reader, alignment);
723 // ValidateReaders checks that all the readers point to BAM files representing
724 // alignments against the same set of reference sequences, and that the
725 // sequences are identically ordered. If these checks fail the operation of
726 // the multireader is undefined, so we force program exit.
727 void BamMultiReaderPrivate::ValidateReaders(void) const {
729 // retrieve first reader data
730 const BamReader* firstReader = m_readers.front().first;
731 if ( firstReader == 0 ) return;
732 const RefVector firstReaderRefData = firstReader->GetReferenceData();
733 const int firstReaderRefCount = firstReader->GetReferenceCount();
734 const int firstReaderRefSize = firstReaderRefData.size();
736 // iterate over all readers
737 vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
738 vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
739 for ( ; readerIter != readerEnd; ++readerIter ) {
741 // get current reader data
742 BamReader* reader = (*readerIter).first;
743 if ( reader == 0 ) continue;
744 const RefVector currentReaderRefData = reader->GetReferenceData();
745 const int currentReaderRefCount = reader->GetReferenceCount();
746 const int currentReaderRefSize = currentReaderRefData.size();
748 // init container iterators
749 RefVector::const_iterator firstRefIter = firstReaderRefData.begin();
750 RefVector::const_iterator firstRefEnd = firstReaderRefData.end();
751 RefVector::const_iterator currentRefIter = currentReaderRefData.begin();
753 // compare reference counts from BamReader ( & container size, in case of BR error)
754 if ( (currentReaderRefCount != firstReaderRefCount) ||
755 (firstReaderRefSize != currentReaderRefSize) )
757 cerr << "BamMultiReader ERROR: mismatched number of references in " << reader->GetFilename()
758 << " expected " << firstReaderRefCount
759 << " reference sequences but only found " << currentReaderRefCount << endl;
763 // this will be ok; we just checked above that we have identically-sized sets of references
764 // here we simply check if they are all, in fact, equal in content
765 while ( firstRefIter != firstRefEnd ) {
766 const RefData& firstRef = (*firstRefIter);
767 const RefData& currentRef = (*currentRefIter);
769 // compare reference name & length
770 if ( (firstRef.RefName != currentRef.RefName) ||
771 (firstRef.RefLength != currentRef.RefLength) )
773 cerr << "BamMultiReader ERROR: mismatched references found in " << reader->GetFilename()
774 << " expected: " << endl;
776 // print first reader's reference data
777 RefVector::const_iterator refIter = firstReaderRefData.begin();
778 RefVector::const_iterator refEnd = firstReaderRefData.end();
779 for ( ; refIter != refEnd; ++refIter ) {
780 const RefData& entry = (*refIter);
781 cerr << entry.RefName << " " << entry.RefLength << endl;
784 cerr << "but found: " << endl;
786 // print current reader's reference data
787 refIter = currentReaderRefData.begin();
788 refEnd = currentReaderRefData.end();
789 for ( ; refIter != refEnd; ++refIter ) {
790 const RefData& entry = (*refIter);
791 cerr << entry.RefName << " " << entry.RefLength << endl;