1 // ***************************************************************************
2 // BamMultiReader.cpp (c) 2010 Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 23 Februrary 2010 (EG)
7 // ---------------------------------------------------------------------------
8 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
10 // ---------------------------------------------------------------------------
11 // Functionality for simultaneously reading multiple BAM files.
13 // This functionality allows applications to work on very large sets of files
14 // without requiring intermediate merge, sort, and index steps for each file
15 // subset. It also improves the performance of our merge system as it
16 // precludes the need to sort merged files.
17 // ***************************************************************************
29 #include "BamMultiReader.h"
30 using namespace BamTools;
33 // -----------------------------------------------------
34 // BamMultiReader implementation
35 // -----------------------------------------------------
38 BamMultiReader::BamMultiReader(void)
44 BamMultiReader::~BamMultiReader(void) {
45 Close(); // close the bam files
46 // clean up reader objects
47 for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
52 // close the BAM files
53 void BamMultiReader::Close(void) {
55 for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
56 BamReader* reader = *br;
57 reader->Close(); // close the reader
58 readerStates.at(index++) = CLOSED; // track its state
62 // updates the reference id stored in the BamMultiReader
63 // to reflect the current state of the readers
64 void BamMultiReader::UpdateReferenceID(void) {
65 bool updateRefID = true;
67 for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
68 BamAlignment* alignment = *it;
69 if (readerStates.at(i++) != END && alignment->RefID == CurrentRefID) {
75 // get the next reference id
76 // while there aren't any readers at the next ref id
77 // increment the ref id
78 int nextRefID = CurrentRefID;
83 for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
84 BamAlignment* alignment = *it;
85 if (readerStates.at(i++) != END && alignment->RefID == nextRefID) {
91 //cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl;
92 CurrentRefID = nextRefID;
96 // checks if any readers still have alignments
97 bool BamMultiReader::HasOpenReaders() {
98 for (vector<BamReaderState>::iterator it = readerStates.begin(); it != readerStates.end(); ++it) {
99 BamReaderState readerState = *it;
100 if (readerState != END)
106 // get next alignment among all files (from specified region, if given)
107 bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
109 // bail out if we are at EOF in all files, means no more alignments to process
110 if (!HasOpenReaders())
113 // when all alignments have stepped into a new target sequence, update our
114 // current reference sequence id
117 // find the alignment with the lowest position that is also in the current
118 // reference sequence, getnextalignment on its bam reader to step it
119 // forward, and return it
120 int i = 0, lowestPosition = -1, lowestAlignmentIndex = -1;
121 BamAlignment* lowestAlignment = NULL;
122 for (vector<BamAlignment*>::const_iterator it = alignments.begin(); it != alignments.end(); ++it) {
123 BamAlignment* ba = *it;
124 if (readerStates.at(i) != END &&
125 ba->RefID == CurrentRefID &&
126 (lowestAlignment == NULL || ba->Position < lowestPosition)) {
127 lowestPosition = ba->Position;
128 lowestAlignmentIndex = i;
129 lowestAlignment = ba;
134 // now that we have the lowest alignment in the set, save it by copy to our argument
135 nextAlignment = BamAlignment(*lowestAlignment);
137 // else continue and load the next alignment
138 bool r = (readers.at(lowestAlignmentIndex))->GetNextAlignment(*alignments.at(lowestAlignmentIndex));
140 //cerr << "reached end of file " << readers.at(lowestAlignmentIndex)->GetFilename() << endl;
141 readerStates.at(lowestAlignmentIndex) = END; // set flag for end of file
147 // jumps to specified region(refID, leftBound) in BAM files, returns success/fail
148 bool BamMultiReader::Jump(int refID, int position) {
150 //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {
151 CurrentRefID = refID;
152 CurrentLeft = position;
155 for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
156 BamReader* reader = *br;
157 result &= reader->Jump(refID, position);
165 void BamMultiReader::Open(const vector<string> filenames, bool openIndexes) {
166 // for filename in filenames
167 fileNames = filenames; // save filenames in our multireader
168 for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
169 string filename = *it;
170 BamReader* reader = new BamReader;
172 reader->Open(filename, filename + ".bai");
174 reader->Open(filename); // for merging, jumping is disallowed
176 BamAlignment* alignment = new BamAlignment;
177 reader->GetNextAlignment(*alignment);
178 readers.push_back(reader); // tracks readers
179 alignments.push_back(alignment); // tracks current alignments of the readers
180 BamReaderState readerState = START;
181 readerStates.push_back(readerState);
186 // Runs GetNextAlignment for all BAM readers; used during jumps
187 void BamMultiReader::UpdateAlignments(void) {
189 for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
190 BamAlignment* ba = *it;
191 readers.at(i++)->GetNextAlignment(*ba);
195 void BamMultiReader::PrintFilenames(void) {
196 for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
197 BamReader* reader = *br;
198 cout << reader->GetFilename() << endl;
202 // returns BAM file pointers to beginning of alignment data
203 bool BamMultiReader::Rewind(void) {
206 for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
207 BamReader* reader = *br;
208 result &= reader->Rewind();
209 readerStates.at(index++) = START;
214 // saves index data to BAM index files (".bai") where necessary, returns success/fail
215 bool BamMultiReader::CreateIndexes(void) {
217 for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
218 BamReader* reader = *br;
219 result &= reader->CreateIndex();
224 // makes a virtual, unified header for all the bam files in the multireader
225 const string BamMultiReader::GetHeaderText(void) const {
227 string mergedHeader = "";
229 // foreach extraction entry (each BAM file)
230 bool isFirstTime = true;
231 for (vector<BamReader*>::const_iterator it = readers.begin(); it != readers.end(); ++it) {
233 BamReader* reader = *it;
235 stringstream header(reader->GetHeaderText());
236 vector<string> lines;
238 while (getline(header, item))
239 lines.push_back(item);
241 for (vector<string>::const_iterator it = lines.begin(); it != lines.end(); ++it) {
243 // get next line from header, skip if empty
244 string headerLine = *it;
245 if ( headerLine.empty() ) { continue; }
247 // if first file, save HD & SQ entries
249 if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
250 mergedHeader.append(headerLine.c_str());
251 mergedHeader.append(1, '\n');
255 // (for all files) append RG entries
256 if ( headerLine.find("@RG") == 0 ) {
257 mergedHeader.append(headerLine.c_str() );
258 mergedHeader.append(1, '\n');
263 // set iteration flag
267 // return merged header text
271 // ValidateReaders checks that all the readers point to BAM files representing
272 // alignments against the same set of reference sequences, and that the
273 // sequences are identically ordered. If these checks fail the operation of
274 // the multireader is undefined, so we force program exit.
275 void BamMultiReader::ValidateReaders(void) const {
276 int firstRefCount = readers.front()->GetReferenceCount();
277 BamTools::RefVector firstRefData = readers.front()->GetReferenceData();
278 for (vector<BamReader*>::const_iterator it = readers.begin(); it != readers.end(); ++it) {
279 BamTools::RefVector currentRefData = (*it)->GetReferenceData();
280 BamTools::RefVector::const_iterator f = firstRefData.begin();
281 BamTools::RefVector::const_iterator c = currentRefData.begin();
282 if ((*it)->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) {
283 cerr << "ERROR: mismatched number of references in " << (*it)->GetFilename()
284 << " expected " << firstRefCount
285 << " reference sequences but only found " << (*it)->GetReferenceCount() << endl;
288 // this will be ok; we just checked above that we have identically-sized sets of references
289 // here we simply check if they are all, in fact, equal in content
290 while (f != firstRefData.end()) {
291 if (f->RefName != c->RefName || f->RefLength != c->RefLength) {
292 cerr << "ERROR: mismatched references found in " << (*it)->GetFilename()
293 << " expected: " << endl;
294 for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a)
295 cerr << a->RefName << " " << a->RefLength << endl;
296 cerr << "but found: " << endl;
297 for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a)
298 cerr << a->RefName << " " << a->RefLength << endl;
306 // NB: The following functions assume that we have identical references for all
307 // BAM files. We enforce this by invoking the above validation function
308 // (ValidateReaders) to verify that our reference data is the same across all
309 // files on Open, so we will not encounter a situation in which there is a
310 // mismatch and we are still live.
312 // returns the number of reference sequences
313 const int BamMultiReader::GetReferenceCount(void) const {
314 return readers.front()->GetReferenceCount();
317 // returns vector of reference objects
318 const BamTools::RefVector BamMultiReader::GetReferenceData(void) const {
319 return readers.front()->GetReferenceData();
322 const int BamMultiReader::GetReferenceID(const string& refName) const {
323 return readers.front()->GetReferenceID(refName);