src/api/BamMultiReader.cpp

   1 // ***************************************************************************
   2 // BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
   3 // Marth Lab, Department of Biology, Boston College
   4 // All rights reserved.
   5 // ---------------------------------------------------------------------------
   6 // Last modified: 19 November 2010 (DB)
   7 // ---------------------------------------------------------------------------
// Uses BGZF routines that were adapted from the bgzf.c code developed at the
// Broad Institute.
  10 // ---------------------------------------------------------------------------
  11 // Functionality for simultaneously reading multiple BAM files.
  12 //
  13 // This functionality allows applications to work on very large sets of files
  14 // without requiring intermediate merge, sort, and index steps for each file
  15 // subset.  It also improves the performance of our merge system as it
  16 // precludes the need to sort merged files.
  17 // ***************************************************************************
  18
  19 #include <api/BamMultiReader.h>
  20 #include <api/BGZF.h>
  21 using namespace BamTools;
  22
  23 #include <algorithm>
  24 #include <fstream>
  25 #include <iostream>
  26 #include <iterator>
  27 #include <sstream>
  28 #include <string>
  29 #include <vector>
  30 using namespace std;
  31
  32 // -----------------------------------------------------
  33 // BamMultiReader implementation
  34 // -----------------------------------------------------
  35
  36 // constructor
  37 BamMultiReader::BamMultiReader(void)
  38     : CurrentRefID(0)
  39     , CurrentLeft(0)
  40 { }
  41
  42 // destructor
  43 BamMultiReader::~BamMultiReader(void) {
  44     Close();
  45 }
  46
  47 // close the BAM files
  48 void BamMultiReader::Close(void) {
  49
  50     // close all BAM readers and clean up pointers
  51     vector<pair<BamReader*, BamAlignment*> >::iterator readerIter = readers.begin();
  52     vector<pair<BamReader*, BamAlignment*> >::iterator readerEnd  = readers.end();
  53     for ( ; readerIter != readerEnd; ++readerIter) {
  54
  55         BamReader* reader = (*readerIter).first;
  56         BamAlignment* alignment = (*readerIter).second;
  57
  58         // close the reader
  59         if ( reader) reader->Close();
  60
  61         // delete reader pointer
  62         delete reader;
  63         reader = 0;
  64
  65         // delete alignment pointer
  66         delete alignment;
  67         alignment = 0;
  68     }
  69
  70     // clear out the container
  71     readers.clear();
  72 }
  73
  74 // saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail
  75 bool BamMultiReader::CreateIndexes(bool useStandardIndex) {
  76     bool result = true;
  77     for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
  78         BamReader* reader = it->first;
  79         result &= reader->CreateIndex(useStandardIndex);
  80     }
  81     return result;
  82 }
  83
  84 // sets the index caching mode on the readers
  85 void BamMultiReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) {
  86     for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
  87         BamReader* reader = it->first;
  88         reader->SetIndexCacheMode(mode);
  89     }
  90 }
  91
  92 // for debugging
  93 void BamMultiReader::DumpAlignmentIndex(void) {
  94     for (AlignmentIndex::const_iterator it = alignments.begin(); it != alignments.end(); ++it) {
  95         cerr << it->first.first << ":" << it->first.second << " " << it->second.first->GetFilename() << endl;
  96     }
  97 }
  98
  99 // makes a virtual, unified header for all the bam files in the multireader
 100 const string BamMultiReader::GetHeaderText(void) const {
 101
 102     string mergedHeader = "";
 103     map<string, bool> readGroups;
 104
 105     // foreach extraction entry (each BAM file)
 106     for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) {
 107
 108         BamReader* reader = rs->first;
 109         string headerText = reader->GetHeaderText();
 110         if ( headerText.empty() ) continue;
 111
 112         map<string, bool> currentFileReadGroups;
 113         stringstream header(headerText);
 114         vector<string> lines;
 115         string item;
 116         while (getline(header, item))
 117             lines.push_back(item);
 118
 119         for (vector<string>::const_iterator it = lines.begin(); it != lines.end(); ++it) {
 120
 121             // get next line from header, skip if empty
 122             string headerLine = *it;
 123             if ( headerLine.empty() ) { continue; }
 124
 125             // if first file, save HD & SQ entries
 126             if ( rs == readers.begin() ) {
 127                 if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
 128                     mergedHeader.append(headerLine.c_str());
 129                     mergedHeader.append(1, '\n');
 130                 }
 131             }
 132
 133             // (for all files) append RG entries if they are unique
 134             if ( headerLine.find("@RG") == 0 ) {
 135                 stringstream headerLineSs(headerLine);
 136                 string part, readGroupPart, readGroup;
 137                 while(std::getline(headerLineSs, part, '\t')) {
 138                     stringstream partSs(part);
 139                     string subtag;
 140                     std::getline(partSs, subtag, ':');
 141                     if (subtag == "ID") {
 142                         std::getline(partSs, readGroup, ':');
 143                         break;
 144                     }
 145                 }
 146                 if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries
 147                     mergedHeader.append(headerLine.c_str() );
 148                     mergedHeader.append(1, '\n');
 149                     readGroups[readGroup] = true;
 150                     currentFileReadGroups[readGroup] = true;
 151                 } else {
 152                     // warn iff we are reading one file and discover duplicated @RG tags in the header
 153                     // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
 154                     if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) {
 155                         cerr << "WARNING: duplicate @RG tag " << readGroup
 156                             << " entry in header of " << reader->GetFilename() << endl;
 157                     }
 158                 }
 159             }
 160         }
 161     }
 162
 163     // return merged header text
 164     return mergedHeader;
 165 }
 166
 167 // get next alignment among all files
 168 bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
 169
 170     // bail out if we are at EOF in all files, means no more alignments to process
 171     if (!HasOpenReaders())
 172         return false;
 173
 174     // when all alignments have stepped into a new target sequence, update our
 175     // current reference sequence id
 176     UpdateReferenceID();
 177
 178     // our lowest alignment and reader will be at the front of our alignment index
 179     BamAlignment* alignment = alignments.begin()->second.second;
 180     BamReader* reader = alignments.begin()->second.first;
 181
 182     // now that we have the lowest alignment in the set, save it by copy to our argument
 183     nextAlignment = BamAlignment(*alignment);
 184
 185     // remove this alignment index entry from our alignment index
 186     alignments.erase(alignments.begin());
 187
 188     // and add another entry if we can get another alignment from the reader
 189     if (reader->GetNextAlignment(*alignment)) {
 190         alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
 191                                     make_pair(reader, alignment)));
 192     } else { // do nothing
 193         //cerr << "reached end of file " << lowestReader->GetFilename() << endl;
 194     }
 195
 196     return true;
 197
 198 }
 199
 200 // get next alignment among all files without parsing character data from alignments
 201 bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) {
 202
 203     // bail out if we are at EOF in all files, means no more alignments to process
 204     if (!HasOpenReaders())
 205         return false;
 206
 207     // when all alignments have stepped into a new target sequence, update our
 208     // current reference sequence id
 209     UpdateReferenceID();
 210
 211     // our lowest alignment and reader will be at the front of our alignment index
 212     BamAlignment* alignment = alignments.begin()->second.second;
 213     BamReader* reader = alignments.begin()->second.first;
 214
 215     // now that we have the lowest alignment in the set, save it by copy to our argument
 216     nextAlignment = BamAlignment(*alignment);
 217     //memcpy(&nextAlignment, alignment, sizeof(BamAlignment));
 218
 219     // remove this alignment index entry from our alignment index
 220     alignments.erase(alignments.begin());
 221
 222     // and add another entry if we can get another alignment from the reader
 223     if (reader->GetNextAlignmentCore(*alignment)) {
 224         alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
 225                                     make_pair(reader, alignment)));
 226     } else { // do nothing
 227         //cerr << "reached end of file " << lowestReader->GetFilename() << endl;
 228     }
 229
 230     return true;
 231
 232 }
 233
 234 // ---------------------------------------------------------------------------------------
 235 //
// NB: The following GetReferenceX() functions assume that we have identical
// references for all BAM files.  We enforce this by invoking the validation
// function ValidateReaders() (defined further down in this file) from Open(),
// which verifies that the reference data is the same across all files, so we
// will not encounter a situation in which there is a mismatch and we are still live.
 241 //
 242 // ---------------------------------------------------------------------------------------
 243
 244 // returns the number of reference sequences
 245 const int BamMultiReader::GetReferenceCount(void) const {
 246     return readers.front().first->GetReferenceCount();
 247 }
 248
 249 // returns vector of reference objects
 250 const BamTools::RefVector BamMultiReader::GetReferenceData(void) const {
 251     return readers.front().first->GetReferenceData();
 252 }
 253
 254 // returns refID from reference name
 255 const int BamMultiReader::GetReferenceID(const string& refName) const {
 256     return readers.front().first->GetReferenceID(refName);
 257 }
 258
 259 // ---------------------------------------------------------------------------------------
 260
 261 // checks if any readers still have alignments
 262 bool BamMultiReader::HasOpenReaders() {
 263     return alignments.size() > 0;
 264 }
 265
 266 // returns whether underlying BAM readers ALL have an index loaded
 267 // this is useful to indicate whether Jump() or SetRegion() are possible
 268 bool BamMultiReader::IsIndexLoaded(void) const {
 269     bool ok = true;
 270     vector<pair<BamReader*, BamAlignment*> >::const_iterator readerIter = readers.begin();
 271     vector<pair<BamReader*, BamAlignment*> >::const_iterator readerEnd  = readers.end();
 272     for ( ; readerIter != readerEnd; ++readerIter ) {
 273         const BamReader* reader = (*readerIter).first;
 274         if ( reader ) ok &= reader->IsIndexLoaded();
 275     }
 276     return ok;
 277 }
 278
 279 // jumps to specified region(refID, leftBound) in BAM files, returns success/fail
 280 bool BamMultiReader::Jump(int refID, int position) {
 281
 282     //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {
 283     CurrentRefID = refID;
 284     CurrentLeft  = position;
 285
 286     bool result = true;
 287     for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
 288         BamReader* reader = it->first;
 289         result &= reader->Jump(refID, position);
 290         if (!result) {
 291             cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl;
 292             exit(1);
 293         }
 294     }
 295     if (result) UpdateAlignments();
 296     return result;
 297 }
 298
 299 // opens BAM files
 300 bool BamMultiReader::Open(const vector<string>& filenames, bool openIndexes, bool coreMode, bool preferStandardIndex) {
 301
 302     // for filename in filenames
 303     fileNames = filenames; // save filenames in our multireader
 304     for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
 305
 306         const string filename = *it;
 307         BamReader* reader = new BamReader;
 308
 309         bool openedOK = true;
 310         openedOK = reader->Open(filename, "", openIndexes, preferStandardIndex);
 311
 312         // if file opened ok, check that it can be read
 313         if ( openedOK ) {
 314
 315             bool fileOK = true;
 316             BamAlignment* alignment = new BamAlignment;
 317             fileOK &= ( coreMode ? reader->GetNextAlignmentCore(*alignment) : reader->GetNextAlignment(*alignment) );
 318
 319             if (fileOK) {
 320                 readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup
 321                 alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
 322                                             make_pair(reader, alignment)));
 323             } else {
 324                 cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl;
 325                 // if only file available & could not be read, return failure
 326                 if ( filenames.size() == 1 ) return false;
 327             }
 328         }
 329
 330         // TODO; any further error handling when openedOK is false ??
 331         else
 332             return false;
 333     }
 334
 335     // files opened ok, at least one alignment could be read,
 336     // now need to check that all files use same reference data
 337     ValidateReaders();
 338     return true;
 339 }
 340
 341 void BamMultiReader::PrintFilenames(void) {
 342     for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
 343         BamReader* reader = it->first;
 344         cout << reader->GetFilename() << endl;
 345     }
 346 }
 347
 348 // returns BAM file pointers to beginning of alignment data
 349 bool BamMultiReader::Rewind(void) {
 350     bool result = true;
 351     for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
 352         BamReader* reader = it->first;
 353         result &= reader->Rewind();
 354     }
 355     return result;
 356 }
 357
 358 bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) {
 359     BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition);
 360     return SetRegion(region);
 361 }
 362
 363 bool BamMultiReader::SetRegion(const BamRegion& region) {
 364
 365     Region = region;
 366
 367     // NB: While it may make sense to track readers in which we can
 368     // successfully SetRegion, In practice a failure of SetRegion means "no
 369     // alignments here."  It makes sense to simply accept the failure,
 370     // UpdateAlignments(), and continue.
 371
 372     for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
 373         if (!it->first->SetRegion(region)) {
 374             cerr << "ERROR: could not jump " << it->first->GetFilename() << " to "
 375                 << region.LeftRefID << ":" << region.LeftPosition
 376                 << ".." << region.RightRefID << ":" << region.RightPosition << endl;
 377         }
 378     }
 379
 380     UpdateAlignments();
 381     return true;
 382 }
 383
 384 void BamMultiReader::UpdateAlignments(void) {
 385     // Update Alignments
 386     alignments.clear();
 387     for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
 388         BamReader* br = it->first;
 389         BamAlignment* ba = it->second;
 390         if (br->GetNextAlignment(*ba)) {
 391             alignments.insert(make_pair(make_pair(ba->RefID, ba->Position),
 392                                         make_pair(br, ba)));
 393         } else {
 394             // assume BamReader end of region / EOF
 395         }
 396     }
 397 }
 398
 399 // updates the reference id stored in the BamMultiReader
 400 // to reflect the current state of the readers
 401 void BamMultiReader::UpdateReferenceID(void) {
 402     // the alignments are sorted by position, so the first alignment will always have the lowest reference ID
 403     if (alignments.begin()->second.second->RefID != CurrentRefID) {
 404         // get the next reference id
 405         // while there aren't any readers at the next ref id
 406         // increment the ref id
 407         int nextRefID = CurrentRefID;
 408         while (alignments.begin()->second.second->RefID != nextRefID) {
 409             ++nextRefID;
 410         }
 411         //cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl;
 412         CurrentRefID = nextRefID;
 413     }
 414 }
 415
 416 // ValidateReaders checks that all the readers point to BAM files representing
 417 // alignments against the same set of reference sequences, and that the
 418 // sequences are identically ordered.  If these checks fail the operation of
 419 // the multireader is undefined, so we force program exit.
 420 void BamMultiReader::ValidateReaders(void) const {
 421     int firstRefCount = readers.front().first->GetReferenceCount();
 422     BamTools::RefVector firstRefData = readers.front().first->GetReferenceData();
 423     for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) {
 424         BamReader* reader = it->first;
 425         BamTools::RefVector currentRefData = reader->GetReferenceData();
 426         BamTools::RefVector::const_iterator f = firstRefData.begin();
 427         BamTools::RefVector::const_iterator c = currentRefData.begin();
 428         if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) {
 429             cerr << "ERROR: mismatched number of references in " << reader->GetFilename()
 430                       << " expected " << firstRefCount
 431                       << " reference sequences but only found " << reader->GetReferenceCount() << endl;
 432             exit(1);
 433         }
 434         // this will be ok; we just checked above that we have identically-sized sets of references
 435         // here we simply check if they are all, in fact, equal in content
 436         while (f != firstRefData.end()) {
 437             if (f->RefName != c->RefName || f->RefLength != c->RefLength) {
 438                 cerr << "ERROR: mismatched references found in " << reader->GetFilename()
 439                           << " expected: " << endl;
 440                 for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a)
 441                     cerr << a->RefName << " " << a->RefLength << endl;
 442                 cerr << "but found: " << endl;
 443                 for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a)
 444                     cerr << a->RefName << " " << a->RefLength << endl;
 445                 exit(1);
 446             }
 447             ++f; ++c;
 448         }
 449     }
 450 }