// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 23 Februrary 2010 (EG)
+// Last modified: 20 July 2010 (DB)
// ---------------------------------------------------------------------------
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
// Institute.
return alignments.size() > 0;
}
-// get next alignment among all files (from specified region, if given)
+// get next alignment among all files
bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
// bail out if we are at EOF in all files, means no more alignments to process
UpdateReferenceID();
// our lowest alignment and reader will be at the front of our alignment index
- BamAlignment* lowestAlignment = alignments.begin()->second.second;
- BamReader* lowestReader = alignments.begin()->second.first;
+ BamAlignment* alignment = alignments.begin()->second.second;
+ BamReader* reader = alignments.begin()->second.first;
// now that we have the lowest alignment in the set, save it by copy to our argument
- nextAlignment = BamAlignment(*lowestAlignment);
+ nextAlignment = BamAlignment(*alignment);
// remove this alignment index entry from our alignment index
alignments.erase(alignments.begin());
// and add another entry if we can get another alignment from the reader
- if (lowestReader->GetNextAlignment(*lowestAlignment)) {
- alignments.insert(make_pair(make_pair(lowestAlignment->RefID, lowestAlignment->Position),
- make_pair(lowestReader, lowestAlignment)));
+ if (reader->GetNextAlignment(*alignment)) {
+ alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+ make_pair(reader, alignment)));
} else { // do nothing
//cerr << "reached end of file " << lowestReader->GetFilename() << endl;
}
return true;
+
+}
+
+// get next alignment among all files without parsing character data from alignments
+bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) {
+
+ // bail out if we are at EOF in all files, means no more alignments to process
+ if (!HasOpenReaders())
+ return false;
+
+ // when all alignments have stepped into a new target sequence, update our
+ // current reference sequence id
+ UpdateReferenceID();
+
+ // our lowest alignment and reader will be at the front of our alignment index
+ BamAlignment* alignment = alignments.begin()->second.second;
+ BamReader* reader = alignments.begin()->second.first;
+
+ // now that we have the lowest alignment in the set, save it by copy to our argument
+ nextAlignment = BamAlignment(*alignment);
+ //memcpy(&nextAlignment, alignment, sizeof(BamAlignment));
+
+ // remove this alignment index entry from our alignment index
+ alignments.erase(alignments.begin());
+
+ // and add another entry if we can get another alignment from the reader
+ if (reader->GetNextAlignmentCore(*alignment)) {
+ alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+ make_pair(reader, alignment)));
+ } else { // do nothing
+ //cerr << "reached end of file " << lowestReader->GetFilename() << endl;
+ }
+
+ return true;
+
}
// jumps to specified region(refID, leftBound) in BAM files, returns success/fail
exit(1);
}
}
- if (result) {
- // Update Alignments
- alignments.clear();
- for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
- BamReader* br = it->first;
- BamAlignment* ba = it->second;
- if (br->GetNextAlignment(*ba)) {
- alignments.insert(make_pair(make_pair(ba->RefID, ba->Position),
- make_pair(br, ba)));
- } else {
- // assume BamReader EOF
- }
+ if (result) UpdateAlignments();
+ return result;
+}
+
+bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) {
+
+ BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition);
+
+ return SetRegion(region);
+
+}
+
+bool BamMultiReader::SetRegion(const BamRegion& region) {
+
+ Region = region;
+
+ // NB: While it may make sense to track readers in which we can
+ // successfully SetRegion, In practice a failure of SetRegion means "no
+ // alignments here." It makes sense to simply accept the failure,
+ // UpdateAlignments(), and continue.
+
+ for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
+ it->first->SetRegion(region);
+ }
+
+ UpdateAlignments();
+
+ return true;
+
+}
+
+void BamMultiReader::UpdateAlignments(void) {
+ // Update Alignments
+ alignments.clear();
+ for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
+ BamReader* br = it->first;
+ BamAlignment* ba = it->second;
+ if (br->GetNextAlignment(*ba)) {
+ alignments.insert(make_pair(make_pair(ba->RefID, ba->Position),
+ make_pair(br, ba)));
+ } else {
+ // assume BamReader end of region / EOF
}
}
- return result;
}
// opens BAM files
-void BamMultiReader::Open(const vector<string> filenames, bool openIndexes) {
+bool BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) {
+
// for filename in filenames
fileNames = filenames; // save filenames in our multireader
for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
string filename = *it;
BamReader* reader = new BamReader;
+
+ bool openedOK = true;
if (openIndexes) {
- reader->Open(filename, filename + ".bai");
+ if (useDefaultIndex)
+ openedOK = reader->Open(filename, filename + ".bai");
+ else
+ openedOK = reader->Open(filename, filename + ".bti");
} else {
- reader->Open(filename); // for merging, jumping is disallowed
+ openedOK = reader->Open(filename); // for merging, jumping is disallowed
}
- BamAlignment* alignment = new BamAlignment;
- reader->GetNextAlignment(*alignment);
- readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup
- alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
- make_pair(reader, alignment)));
+
+ // if file opened ok, check that it can be read
+ if ( openedOK ) {
+
+ bool fileOK = true;
+ BamAlignment* alignment = new BamAlignment;
+ if (coreMode) {
+ fileOK &= reader->GetNextAlignmentCore(*alignment);
+ } else {
+ fileOK &= reader->GetNextAlignment(*alignment);
+ }
+
+ if (fileOK) {
+ readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup
+ alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+ make_pair(reader, alignment)));
+ } else {
+ cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl;
+ // if only file available & could not be read, return failure
+ if ( filenames.size() == 1 ) return false;
+ }
+
+ }
+
+ // TODO; any more error handling on openedOK ??
+ else
+ return false;
}
+
+ // files opened ok, at least one alignment could be read,
+ // now need to check that all files use same reference data
ValidateReaders();
+ return true;
}
void BamMultiReader::PrintFilenames(void) {
return result;
}
-// saves index data to BAM index files (".bai") where necessary, returns success/fail
-bool BamMultiReader::CreateIndexes(void) {
+// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail
+bool BamMultiReader::CreateIndexes(bool useDefaultIndex) {
bool result = true;
for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
BamReader* reader = it->first;
- result &= reader->CreateIndex();
+ result &= reader->CreateIndex(useDefaultIndex);
}
return result;
}
const string BamMultiReader::GetHeaderText(void) const {
string mergedHeader = "";
+ map<string, bool> readGroups;
// foreach extraction entry (each BAM file)
- bool isFirstTime = true;
- for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) {
+ for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) {
- BamReader* reader = it->first;
+ map<string, bool> currentFileReadGroups;
+
+ BamReader* reader = rs->first;
stringstream header(reader->GetHeaderText());
vector<string> lines;
if ( headerLine.empty() ) { continue; }
// if first file, save HD & SQ entries
- if ( isFirstTime ) {
+ if ( rs == readers.begin() ) {
if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
mergedHeader.append(headerLine.c_str());
mergedHeader.append(1, '\n');
}
}
- // (for all files) append RG entries
+ // (for all files) append RG entries if they are unique
if ( headerLine.find("@RG") == 0 ) {
- mergedHeader.append(headerLine.c_str() );
- mergedHeader.append(1, '\n');
+ stringstream headerLineSs(headerLine);
+ string part, readGroupPart, readGroup;
+ while(std::getline(headerLineSs, part, '\t')) {
+ stringstream partSs(part);
+ string subtag;
+ std::getline(partSs, subtag, ':');
+ if (subtag == "ID") {
+ std::getline(partSs, readGroup, ':');
+ break;
+ }
+ }
+ if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries
+ mergedHeader.append(headerLine.c_str() );
+ mergedHeader.append(1, '\n');
+ readGroups[readGroup] = true;
+ currentFileReadGroups[readGroup] = true;
+ } else {
+ // warn iff we are reading one file and discover duplicated @RG tags in the header
+ // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
+ if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) {
+ cerr << "WARNING: duplicate @RG tag " << readGroup
+ << " entry in header of " << reader->GetFilename() << endl;
+ }
+ }
}
-
}
-
- // set iteration flag
- isFirstTime = false;
}
// return merged header text