1 // ***************************************************************************
2 // BamMultiReader.cpp (c) 2010 Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 23 Februrary 2010 (EG)
7 // ---------------------------------------------------------------------------
8 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
10 // ---------------------------------------------------------------------------
11 // Functionality for simultaneously reading multiple BAM files.
13 // This functionality allows applications to work on very large sets of files
14 // without requiring intermediate merge, sort, and index steps for each file
15 // subset. It also improves the performance of our merge system as it
16 // precludes the need to sort merged files.
17 // ***************************************************************************
29 #include "BamMultiReader.h"
30 using namespace BamTools;
33 // -----------------------------------------------------
34 // BamMultiReader implementation
35 // -----------------------------------------------------
38 BamMultiReader::BamMultiReader(void)
44 BamMultiReader::~BamMultiReader(void) {
45 Close(); // close the bam files
46 // clean up reader objects
47 for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
52 // close the BAM files
53 void BamMultiReader::Close(void) {
55 for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
56 BamReader* reader = *br;
57 reader->Close(); // close the reader
58 readerStates.at(index++) = CLOSED; // track its state
62 // updates the reference id stored in the BamMultiReader
63 // to reflect the current state of the readers
64 void BamMultiReader::UpdateReferenceID(void) {
65 bool updateRefID = true;
67 for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
68 BamAlignment* alignment = *it;
69 if (readerStates.at(i++) != END && alignment->RefID == CurrentRefID) {
75 // get the next reference id
76 // while there aren't any readers at the next ref id
77 // increment the ref id
78 int nextRefID = CurrentRefID;
83 for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
84 BamAlignment* alignment = *it;
85 if (readerStates.at(i++) != END && alignment->RefID == nextRefID) {
91 cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl;
92 CurrentRefID = nextRefID;
96 // checks if any readers still have alignments
97 bool BamMultiReader::HasOpenReaders() {
98 for (vector<BamReaderState>::iterator it = readerStates.begin(); it != readerStates.end(); ++it) {
99 BamReaderState readerState = *it;
100 if (readerState != END)
106 // get next alignment among all files (from specified region, if given)
107 bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
109 // bail out if we are at EOF in all files, means no more alignments to process
110 if (!HasOpenReaders())
113 // when all alignments have stepped into a new target sequence, update our
114 // current reference sequence id
117 // find the alignment with the lowest position that is also in the current
118 // reference sequence, getnextalignment on its bam reader to step it
119 // forward, and return it
120 int i = 0, lowestPosition = -1, lowestAlignmentIndex = -1;
121 BamAlignment* lowestAlignment = NULL;
122 for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
123 BamAlignment* ba = *it;
124 if (readerStates.at(i) != END &&
125 ba->RefID == CurrentRefID &&
126 (lowestAlignment == NULL || ba->Position < lowestPosition)) {
127 lowestPosition = ba->Position;
128 lowestAlignmentIndex = i;
129 lowestAlignment = ba;
134 // now that we have the lowest alignment in the set, save it by copy to our argument
135 nextAlignment = BamAlignment(*lowestAlignment);
137 // else continue and load the next alignment
138 bool r = (readers.at(lowestAlignmentIndex))->GetNextAlignment(*alignments.at(lowestAlignmentIndex));
140 cerr << "reached end of file " << readers.at(lowestAlignmentIndex)->GetFilename() << endl;
141 readerStates.at(lowestAlignmentIndex) = END; // set flag for end of file
147 // jumps to specified region(refID, leftBound) in BAM files, returns success/fail
148 bool BamMultiReader::Jump(int refID, int position) {
150 //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {
151 CurrentRefID = refID;
152 CurrentLeft = position;
155 for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
156 BamReader* reader = *br;
157 result &= reader->Jump(refID, position);
165 void BamMultiReader::Open(const vector<string> filenames) {
166 // for filename in filenames
167 fileNames = filenames; // save filenames in our multireader
168 for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
169 string filename = *it;
170 BamReader* reader = new BamReader;
171 reader->Open(filename, filename + ".bai");
172 BamAlignment* alignment = new BamAlignment;
173 reader->GetNextAlignment(*alignment);
174 readers.push_back(reader); // tracks readers
175 alignments.push_back(alignment); // tracks current alignments of the readers
176 BamReaderState readerState = START;
177 readerStates.push_back(readerState);
181 // Runs GetNextAlignment for all BAM readers; used during jumps
182 void BamMultiReader::UpdateAlignments(void) {
184 for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
185 BamAlignment* ba = *it;
186 readers.at(i++)->GetNextAlignment(*ba);
190 void BamMultiReader::PrintFilenames(void) {
191 for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
192 BamReader* reader = *br;
193 cout << reader->GetFilename() << endl;
197 // returns BAM file pointers to beginning of alignment data
198 bool BamMultiReader::Rewind(void) {
201 for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
202 BamReader* reader = *br;
203 result &= reader->Rewind();
204 readerStates.at(index++) = START;
209 // saves index data to BAM index files (".bai") where necessary, returns success/fail
210 bool BamMultiReader::CreateIndexes(void) {
212 for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
213 BamReader* reader = *br;
214 result &= reader->CreateIndex();
219 // makes a virtual, unified header for all the bam files in the multireader
220 const string BamMultiReader::GetUnifiedHeaderText(void) const {
222 string mergedHeader = "";
224 // foreach extraction entry (each BAM file)
225 bool isFirstTime = true;
226 for (vector<BamReader*>::const_iterator it = readers.begin(); it != readers.end(); ++it) {
228 BamReader* reader = *it;
230 stringstream header(reader->GetHeaderText());
231 vector<string> lines;
233 while (getline(header, item))
234 lines.push_back(item);
236 for (vector<string>::const_iterator it = lines.begin(); it != lines.end(); ++it) {
238 // get next line from header, skip if empty
239 string headerLine = *it;
240 if ( headerLine.empty() ) { continue; }
242 // if first file, save HD & SQ entries
244 if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
245 mergedHeader.append(headerLine.c_str());
246 mergedHeader.append(1, '\n');
250 // (for all files) append RG entries
251 if ( headerLine.find("@RG") == 0 ) {
252 mergedHeader.append(headerLine.c_str() );
253 mergedHeader.append(1, '\n');
258 // set iteration flag
262 // return merged header text