BamMultiReader.cpp

   1 // ***************************************************************************
   2 // BamMultiReader.cpp (c) 2010 Erik Garrison
   3 // Marth Lab, Department of Biology, Boston College
   4 // All rights reserved.
   5 // ---------------------------------------------------------------------------
   6 // Last modified: 23 Februrary 2010 (EG)
   7 // ---------------------------------------------------------------------------
   8 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
   9 // Institute.
  10 // ---------------------------------------------------------------------------
  11 // Functionality for simultaneously reading multiple BAM files.
  12 //
  13 // This functionality allows applications to work on very large sets of files
  14 // without requiring intermediate merge, sort, and index steps for each file
  15 // subset.  It also improves the performance of our merge system as it
  16 // precludes the need to sort merged files.
  17 // ***************************************************************************
  18
  19 // C++ includes
  20 #include <algorithm>
  21 #include <iterator>
  22 #include <string>
  23 #include <vector>
  24 #include <iostream>
  25 #include <sstream>
  26
  27 // BamTools includes
  28 #include "BGZF.h"
  29 #include "BamMultiReader.h"
  30 using namespace BamTools;
  31 using namespace std;
  32
  33 // -----------------------------------------------------
  34 // BamMultiReader implementation
  35 // -----------------------------------------------------
  36
  37 // constructor
  38 BamMultiReader::BamMultiReader(void)
  39     : CurrentRefID(0)
  40     , CurrentLeft(0)
  41 { }
  42
  43 // destructor
  44 BamMultiReader::~BamMultiReader(void) {
  45     Close(); // close the bam files
  46     // clean up reader objects
  47     for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
  48         delete *br;
  49     }
  50 }
  51
  52 // close the BAM files
  53 void BamMultiReader::Close(void) {
  54     int index = 0;
  55     for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
  56         BamReader* reader = *br;
  57         reader->Close();  // close the reader
  58         readerStates.at(index++) = CLOSED;  // track its state
  59     }
  60 }
  61
  62 // updates the reference id stored in the BamMultiReader
  63 // to reflect the current state of the readers
  64 void BamMultiReader::UpdateReferenceID(void) {
  65     bool updateRefID = true;
  66     int i = 0;
  67     for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
  68         BamAlignment* alignment = *it;
  69         if (readerStates.at(i++) != END && alignment->RefID == CurrentRefID) {
  70             updateRefID = false;
  71             break;
  72         }
  73     }
  74     if (updateRefID) {
  75         // get the next reference id
  76         // while there aren't any readers at the next ref id
  77         // increment the ref id
  78         int nextRefID = CurrentRefID;
  79         bool ok = false;
  80         while (!ok) {
  81             ++nextRefID;
  82             int i = 0;
  83             for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
  84                 BamAlignment* alignment = *it;
  85                 if (readerStates.at(i++) != END && alignment->RefID == nextRefID) {
  86                     ok = true;
  87                     break;
  88                 }
  89             }
  90         }
  91         cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl;
  92         CurrentRefID = nextRefID;
  93     }
  94 }
  95
  96 // checks if any readers still have alignments
  97 bool BamMultiReader::HasOpenReaders() {
  98     for (vector<BamReaderState>::iterator it = readerStates.begin(); it != readerStates.end(); ++it) {
  99         BamReaderState readerState = *it;
 100         if (readerState != END)
 101             return true;
 102     }
 103     return false;
 104 }
 105
 106 // get next alignment among all files (from specified region, if given)
 107 bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
 108
 109     // bail out if we are at EOF in all files, means no more alignments to process
 110     if (!HasOpenReaders())
 111         return false;
 112
 113     // when all alignments have stepped into a new target sequence, update our
 114     // current reference sequence id
 115     UpdateReferenceID();
 116
 117     // find the alignment with the lowest position that is also in the current
 118     // reference sequence, getnextalignment on its bam reader to step it
 119     // forward, and return it
 120     int i = 0, lowestPosition = -1, lowestAlignmentIndex = -1;
 121     BamAlignment* lowestAlignment = NULL;
 122     for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
 123         BamAlignment* ba = *it;
 124         if (readerStates.at(i) != END &&
 125                 ba->RefID == CurrentRefID &&
 126                 (lowestAlignment == NULL || ba->Position < lowestPosition)) {
 127             lowestPosition = ba->Position;
 128             lowestAlignmentIndex = i;
 129             lowestAlignment = ba;
 130         }
 131         ++i;
 132     }
 133
 134     // now that we have the lowest alignment in the set, save it by copy to our argument
 135     nextAlignment = BamAlignment(*lowestAlignment);
 136
 137     // else continue and load the next alignment
 138     bool r = (readers.at(lowestAlignmentIndex))->GetNextAlignment(*alignments.at(lowestAlignmentIndex));
 139     if (!r) {
 140         cerr << "reached end of file " << readers.at(lowestAlignmentIndex)->GetFilename() << endl;
 141         readerStates.at(lowestAlignmentIndex) = END;  // set flag for end of file
 142     }
 143
 144     return true;
 145 }
 146
 147 // jumps to specified region(refID, leftBound) in BAM files, returns success/fail
 148 bool BamMultiReader::Jump(int refID, int position) {
 149
 150     //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {
 151     CurrentRefID = refID;
 152     CurrentLeft  = position;
 153
 154     bool result = true;
 155     for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
 156         BamReader* reader = *br;
 157         result &= reader->Jump(refID, position);
 158     }
 159     if (result)
 160         UpdateAlignments();
 161     return result;
 162 }
 163
 164 // opens BAM files
 165 void BamMultiReader::Open(const vector<string> filenames) {
 166     // for filename in filenames
 167     fileNames = filenames; // save filenames in our multireader
 168     for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
 169         string filename = *it;
 170         BamReader* reader = new BamReader;
 171         reader->Open(filename, filename + ".bai");
 172         BamAlignment* alignment = new BamAlignment;
 173         reader->GetNextAlignment(*alignment);
 174         readers.push_back(reader); // tracks readers
 175         alignments.push_back(alignment); // tracks current alignments of the readers
 176         BamReaderState readerState = START;
 177         readerStates.push_back(readerState);
 178     }
 179 }
 180
 181 // Runs GetNextAlignment for all BAM readers; used during jumps
 182 void BamMultiReader::UpdateAlignments(void) {
 183     int i = 0;
 184     for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
 185         BamAlignment* ba = *it;
 186         readers.at(i++)->GetNextAlignment(*ba);
 187     }
 188 }
 189
 190 void BamMultiReader::PrintFilenames(void) {
 191     for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
 192         BamReader* reader = *br;
 193         cout << reader->GetFilename() << endl;
 194     }
 195 }
 196
 197 // returns BAM file pointers to beginning of alignment data
 198 bool BamMultiReader::Rewind(void) {
 199     bool result = true;
 200     int index = 0;
 201     for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
 202         BamReader* reader = *br;
 203         result &= reader->Rewind();
 204         readerStates.at(index++) = START;
 205     }
 206     return result;
 207 }
 208
 209 // saves index data to BAM index files (".bai") where necessary, returns success/fail
 210 bool BamMultiReader::CreateIndexes(void) {
 211     bool result = true;
 212     for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
 213         BamReader* reader = *br;
 214         result &= reader->CreateIndex();
 215     }
 216     return result;
 217 }
 218
 219 // makes a virtual, unified header for all the bam files in the multireader
 220 const string BamMultiReader::GetUnifiedHeaderText(void) const {
 221
 222     string mergedHeader = "";
 223
 224     // foreach extraction entry (each BAM file)
 225     bool isFirstTime = true;
 226     for (vector<BamReader*>::const_iterator it = readers.begin(); it != readers.end(); ++it) {
 227
 228         BamReader* reader = *it;
 229
 230         stringstream header(reader->GetHeaderText());
 231         vector<string> lines;
 232         string item;
 233         while (getline(header, item))
 234             lines.push_back(item);
 235
 236         for (vector<string>::const_iterator it = lines.begin(); it != lines.end(); ++it) {
 237
 238             // get next line from header, skip if empty
 239             string headerLine = *it;
 240             if ( headerLine.empty() ) { continue; }
 241
 242             // if first file, save HD & SQ entries
 243             if ( isFirstTime ) {
 244                 if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
 245                     mergedHeader.append(headerLine.c_str());
 246                     mergedHeader.append(1, '\n');
 247                 }
 248             }
 249
 250             // (for all files) append RG entries
 251             if ( headerLine.find("@RG") == 0 ) {
 252                 mergedHeader.append(headerLine.c_str() );
 253                 mergedHeader.append(1, '\n');
 254             }
 255
 256         }
 257
 258         // set iteration flag
 259         isFirstTime = false;
 260     }
 261
 262     // return merged header text
 263     return mergedHeader;
 264 }
 265