From 164366c9fd23717e4b7279518eecaa32baed502c Mon Sep 17 00:00:00 2001 From: Derek Date: Fri, 3 Sep 2010 00:37:39 -0400 Subject: [PATCH] Cleaned up index file handling throughout toolkit. Did this by adding a FileExists() methods to BamMultiReader for determining which index file to load. --- src/api/BamMultiReader.cpp | 399 ++++++++++++++++++-------------- src/api/BamMultiReader.h | 9 +- src/toolkit/bamtools_count.cpp | 132 ++++------- src/toolkit/bamtools_filter.cpp | 156 ++++++------- src/toolkit/bamtools_random.cpp | 113 +++++---- 5 files changed, 414 insertions(+), 395 deletions(-) diff --git a/src/api/BamMultiReader.cpp b/src/api/BamMultiReader.cpp index 005b0b0..6de97dc 100644 --- a/src/api/BamMultiReader.cpp +++ b/src/api/BamMultiReader.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 20 July 2010 (DB) +// Last modified: 3 September 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -18,11 +18,12 @@ // C++ includes #include +#include +#include #include +#include #include #include -#include -#include // BamTools includes #include "BGZF.h" @@ -30,6 +31,15 @@ using namespace BamTools; using namespace std; +namespace BamTools { + +bool FileExists(const string& filename) { + ifstream f(filename.c_str(), ifstream::in); + return !f.fail(); +} + +} // namespace BamTools + // ----------------------------------------------------- // BamMultiReader implementation // ----------------------------------------------------- @@ -42,42 +52,118 @@ BamMultiReader::BamMultiReader(void) // destructor BamMultiReader::~BamMultiReader(void) { - Close(); // close the bam files - // clean up reader objects - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - delete it->first; - delete it->second; - } + Close(); } // close the BAM files void BamMultiReader::Close(void) { + + // close all BAM readers and clean up pointers + vector >::iterator readerIter = readers.begin(); + vector >::iterator readerEnd = readers.end(); + for ( ; readerIter != readerEnd; ++readerIter) { + + BamReader* reader = (*readerIter).first; + BamAlignment* alignment = (*readerIter).second; + + // close the reader + if ( reader) reader->Close(); + + // delete reader pointer + delete reader; + reader = 0; + + // delete alignment pointer + delete alignment; + alignment = 0; + } + + // clear out the container + readers.clear(); +} + +// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail +bool BamMultiReader::CreateIndexes(bool useDefaultIndex) { + bool result = true; for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { BamReader* reader = it->first; - reader->Close(); // close the reader + result &= reader->CreateIndex(useDefaultIndex); } + return result; } -// updates the reference id stored in the BamMultiReader -// to reflect the current state of the readers -void BamMultiReader::UpdateReferenceID(void) { - // the alignments are sorted by position, so the first alignment will always have the lowest reference ID - if (alignments.begin()->second.second->RefID != CurrentRefID) { - // get the next reference id - // while there aren't any readers at the next ref id - // increment the ref id - int nextRefID = CurrentRefID; - while (alignments.begin()->second.second->RefID != nextRefID) { - ++nextRefID; - } - //cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl; - CurrentRefID = nextRefID; +// for debugging +void BamMultiReader::DumpAlignmentIndex(void) { + for (AlignmentIndex::const_iterator it = alignments.begin(); it != alignments.end(); ++it) { + cerr << it->first.first << ":" << it->first.second << " " << it->second.first->GetFilename() << endl; } } -// checks if any readers still have alignments -bool BamMultiReader::HasOpenReaders() { - return alignments.size() > 0; +// makes a virtual, unified header for all the bam files in the multireader +const string BamMultiReader::GetHeaderText(void) const { + + string mergedHeader = ""; + map readGroups; + + // foreach extraction entry (each BAM file) + for (vector >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) { + + map currentFileReadGroups; + + BamReader* reader = rs->first; + + stringstream header(reader->GetHeaderText()); + vector lines; + string item; + while (getline(header, item)) + lines.push_back(item); + + for (vector::const_iterator it = lines.begin(); it != lines.end(); ++it) { + + // get next line from header, skip if empty + string headerLine = *it; + if ( headerLine.empty() ) { continue; } + + // if first file, save HD & SQ entries + if ( rs == readers.begin() ) { + if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { + mergedHeader.append(headerLine.c_str()); + mergedHeader.append(1, '\n'); + } + } + + // (for all files) append RG entries if they are unique + if ( headerLine.find("@RG") == 0 ) { + stringstream headerLineSs(headerLine); + string part, readGroupPart, readGroup; + while(std::getline(headerLineSs, part, '\t')) { + stringstream partSs(part); + string subtag; + std::getline(partSs, subtag, ':'); + if (subtag == "ID") { + std::getline(partSs, readGroup, ':'); + break; + } + } + if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries + mergedHeader.append(headerLine.c_str() ); + mergedHeader.append(1, '\n'); + readGroups[readGroup] = true; + currentFileReadGroups[readGroup] = true; + } else { + // warn iff we are reading one file and discover duplicated @RG tags in the header + // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags + if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) { + cerr << "WARNING: duplicate @RG tag " << readGroup + << " entry in header of " << reader->GetFilename() << endl; + } + } + } + } + } + + // return merged header text + return mergedHeader; } // get next alignment among all files @@ -147,6 +233,51 @@ bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { } +// --------------------------------------------------------------------------------------- +// +// NB: The following GetReferenceX() functions assume that we have identical +// references for all BAM files. We enforce this by invoking the above +// validation function (ValidateReaders) to verify that our reference data +// is the same across all files on Open, so we will not encounter a situation +// in which there is a mismatch and we are still live. +// +// --------------------------------------------------------------------------------------- + +// returns the number of reference sequences +const int BamMultiReader::GetReferenceCount(void) const { + return readers.front().first->GetReferenceCount(); +} + +// returns vector of reference objects +const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { + return readers.front().first->GetReferenceData(); +} + +// returns refID from reference name +const int BamMultiReader::GetReferenceID(const string& refName) const { + return readers.front().first->GetReferenceID(refName); +} + +// --------------------------------------------------------------------------------------- + +// checks if any readers still have alignments +bool BamMultiReader::HasOpenReaders() { + return alignments.size() > 0; +} + +// returns whether underlying BAM readers ALL have an index loaded +// this is useful to indicate whether Jump() or SetRegion() are possible +bool BamMultiReader::IsIndexLoaded(void) const { + bool ok = true; + vector >::const_iterator readerIter = readers.begin(); + vector >::const_iterator readerEnd = readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + const BamReader* reader = (*readerIter).first; + if ( reader ) ok &= reader->IsIndexLoaded(); + } + return ok; +} + // jumps to specified region(refID, leftBound) in BAM files, returns success/fail bool BamMultiReader::Jump(int refID, int position) { @@ -167,77 +298,44 @@ bool BamMultiReader::Jump(int refID, int position) { return result; } -bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) { - - BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition); - - return SetRegion(region); - -} - -bool BamMultiReader::SetRegion(const BamRegion& region) { - - Region = region; - - // NB: While it may make sense to track readers in which we can - // successfully SetRegion, In practice a failure of SetRegion means "no - // alignments here." It makes sense to simply accept the failure, - // UpdateAlignments(), and continue. - - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - it->first->SetRegion(region); - } - - UpdateAlignments(); - - return true; - -} - -void BamMultiReader::UpdateAlignments(void) { - // Update Alignments - alignments.clear(); - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* br = it->first; - BamAlignment* ba = it->second; - if (br->GetNextAlignment(*ba)) { - alignments.insert(make_pair(make_pair(ba->RefID, ba->Position), - make_pair(br, ba))); - } else { - // assume BamReader end of region / EOF - } - } -} - // opens BAM files bool BamMultiReader::Open(const vector filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) { // for filename in filenames fileNames = filenames; // save filenames in our multireader for (vector::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { - string filename = *it; + + const string filename = *it; BamReader* reader = new BamReader; bool openedOK = true; if (openIndexes) { - if (useDefaultIndex) - openedOK = reader->Open(filename, filename + ".bai"); - else - openedOK = reader->Open(filename, filename + ".bti"); - } else { - openedOK = reader->Open(filename); // for merging, jumping is disallowed - } + + const string defaultIndexFilename = filename + ".bai"; + const string bamToolsIndexFilename = filename + ".bti"; + + // if default BAM index requested and one exists + if ( useDefaultIndex && FileExists(defaultIndexFilename) ) + openedOK = reader->Open(filename, defaultIndexFilename); + + // else see if BamTools index exists + else if ( FileExists(bamToolsIndexFilename) ) + openedOK = reader->Open(filename, bamToolsIndexFilename); + + // else none exist... just open without + else + openedOK = reader->Open(filename); + } + + // ignoring index file(s) + else openedOK = reader->Open(filename); // if file opened ok, check that it can be read if ( openedOK ) { bool fileOK = true; BamAlignment* alignment = new BamAlignment; - if (coreMode) { - fileOK &= reader->GetNextAlignmentCore(*alignment); - } else { - fileOK &= reader->GetNextAlignment(*alignment); - } + fileOK &= ( coreMode ? reader->GetNextAlignmentCore(*alignment) : reader->GetNextAlignment(*alignment) ); if (fileOK) { readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup @@ -248,10 +346,9 @@ bool BamMultiReader::Open(const vector filenames, bool openIndexes, bool // if only file available & could not be read, return failure if ( filenames.size() == 1 ) return false; } - } - // TODO; any more error handling on openedOK ?? + // TODO; any more error handling when openedOKvis false ?? else return false; } @@ -269,13 +366,6 @@ void BamMultiReader::PrintFilenames(void) { } } -// for debugging -void BamMultiReader::DumpAlignmentIndex(void) { - for (AlignmentIndex::const_iterator it = alignments.begin(); it != alignments.end(); ++it) { - cerr << it->first.first << ":" << it->first.second << " " << it->second.first->GetFilename() << endl; - } -} - // returns BAM file pointers to beginning of alignment data bool BamMultiReader::Rewind(void) { bool result = true; @@ -286,81 +376,58 @@ bool BamMultiReader::Rewind(void) { return result; } -// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail -bool BamMultiReader::CreateIndexes(bool useDefaultIndex) { - bool result = true; - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->CreateIndex(useDefaultIndex); - } - return result; +bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) { + BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition); + return SetRegion(region); } -// makes a virtual, unified header for all the bam files in the multireader -const string BamMultiReader::GetHeaderText(void) const { - - string mergedHeader = ""; - map readGroups; - - // foreach extraction entry (each BAM file) - for (vector >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) { - - map currentFileReadGroups; - - BamReader* reader = rs->first; +bool BamMultiReader::SetRegion(const BamRegion& region) { - stringstream header(reader->GetHeaderText()); - vector lines; - string item; - while (getline(header, item)) - lines.push_back(item); + Region = region; - for (vector::const_iterator it = lines.begin(); it != lines.end(); ++it) { + // NB: While it may make sense to track readers in which we can + // successfully SetRegion, In practice a failure of SetRegion means "no + // alignments here." It makes sense to simply accept the failure, + // UpdateAlignments(), and continue. - // get next line from header, skip if empty - string headerLine = *it; - if ( headerLine.empty() ) { continue; } + for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { + it->first->SetRegion(region); + } - // if first file, save HD & SQ entries - if ( rs == readers.begin() ) { - if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { - mergedHeader.append(headerLine.c_str()); - mergedHeader.append(1, '\n'); - } - } + UpdateAlignments(); + return true; +} - // (for all files) append RG entries if they are unique - if ( headerLine.find("@RG") == 0 ) { - stringstream headerLineSs(headerLine); - string part, readGroupPart, readGroup; - while(std::getline(headerLineSs, part, '\t')) { - stringstream partSs(part); - string subtag; - std::getline(partSs, subtag, ':'); - if (subtag == "ID") { - std::getline(partSs, readGroup, ':'); - break; - } - } - if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries - mergedHeader.append(headerLine.c_str() ); - mergedHeader.append(1, '\n'); - readGroups[readGroup] = true; - currentFileReadGroups[readGroup] = true; - } else { - // warn iff we are reading one file and discover duplicated @RG tags in the header - // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags - if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) { - cerr << "WARNING: duplicate @RG tag " << readGroup - << " entry in header of " << reader->GetFilename() << endl; - } - } - } +void BamMultiReader::UpdateAlignments(void) { + // Update Alignments + alignments.clear(); + for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* br = it->first; + BamAlignment* ba = it->second; + if (br->GetNextAlignment(*ba)) { + alignments.insert(make_pair(make_pair(ba->RefID, ba->Position), + make_pair(br, ba))); + } else { + // assume BamReader end of region / EOF } } +} - // return merged header text - return mergedHeader; +// updates the reference id stored in the BamMultiReader +// to reflect the current state of the readers +void BamMultiReader::UpdateReferenceID(void) { + // the alignments are sorted by position, so the first alignment will always have the lowest reference ID + if (alignments.begin()->second.second->RefID != CurrentRefID) { + // get the next reference id + // while there aren't any readers at the next ref id + // increment the ref id + int nextRefID = CurrentRefID; + while (alignments.begin()->second.second->RefID != nextRefID) { + ++nextRefID; + } + //cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl; + CurrentRefID = nextRefID; + } } // ValidateReaders checks that all the readers point to BAM files representing @@ -398,23 +465,3 @@ void BamMultiReader::ValidateReaders(void) const { } } } - -// NB: The following functions assume that we have identical references for all -// BAM files. We enforce this by invoking the above validation function -// (ValidateReaders) to verify that our reference data is the same across all -// files on Open, so we will not encounter a situation in which there is a -// mismatch and we are still live. - -// returns the number of reference sequences -const int BamMultiReader::GetReferenceCount(void) const { - return readers.front().first->GetReferenceCount(); -} - -// returns vector of reference objects -const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { - return readers.front().first->GetReferenceData(); -} - -const int BamMultiReader::GetReferenceID(const string& refName) const { - return readers.front().first->GetReferenceID(refName); -} diff --git a/src/api/BamMultiReader.h b/src/api/BamMultiReader.h index bd36d71..dce7042 100644 --- a/src/api/BamMultiReader.h +++ b/src/api/BamMultiReader.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 20 July 2010 (DB) +// Last modified: 2 September 2010 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // *************************************************************************** @@ -59,8 +59,15 @@ class BamMultiReader { // indexes. // @coreMode - setup our first alignments using GetNextAlignmentCore(); // also useful for merging + // @useDefaultIndex - look for default BAM index ".bai" first. If false, + // or if ".bai" does not exist, will look for BamTools index ".bti". If + // neither exist, will open without an index bool Open(const vector filenames, bool openIndexes = true, bool coreMode = false, bool useDefaultIndex = true); + // returns whether underlying BAM readers ALL have an index loaded + // this is useful to indicate whether Jump() or SetRegion() are possible + bool IsIndexLoaded(void) const; + // performs random-access jump to reference, position bool Jump(int refID, int position = 0); diff --git a/src/toolkit/bamtools_count.cpp b/src/toolkit/bamtools_count.cpp index 28677ce..20ed3ae 100644 --- a/src/toolkit/bamtools_count.cpp +++ b/src/toolkit/bamtools_count.cpp @@ -3,16 +3,12 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 2 June 2010 +// Last modified: 3 September 2010 // --------------------------------------------------------------------------- -// Prints alignment count for BAM file -// -// ** Expand to multiple?? -// +// Prints alignment count for BAM file(s) // *************************************************************************** #include -#include #include #include @@ -53,15 +49,14 @@ CountTool::CountTool(void) , m_settings(new CountSettings) { // set program details - Options::SetProgramInfo("bamtools count", "prints alignment counts for a BAM file", "-in [-region ]"); + Options::SetProgramInfo("bamtools count", "prints alignment counts for a BAM file", "[-in -in ...] [-region ]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts); - //Options::AddValueOption("-index", "BAM index filename", "the BAM index file", "", m_settings->HasBamIndexFilename, m_settings->BamIndexFilename, IO_Opts); + Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); - Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as .bai or .bti. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts); + Options::AddValueOption("-region", "REGION", "genomic region. Index file is required and is read automatically if it exists as .bai or .bti. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts); } CountTool::~CountTool(void) { @@ -79,6 +74,7 @@ int CountTool::Run(int argc, char* argv[]) { // parse command line arguments Options::Parse(argc, argv, 1); + // if no '-in' args supplied, default to stdin if ( !m_settings->HasInput ) m_settings->InputFiles.push_back(Options::StandardIn()); @@ -87,107 +83,73 @@ int CountTool::Run(int argc, char* argv[]) { reader.Open(m_settings->InputFiles, false, true); // alignment counter + BamAlignment al; int alignmentCount(0); - // set up error handling - ostringstream errorStream(""); - bool foundError(false); - // if no region specified, count entire file if ( !m_settings->HasRegion ) { - BamAlignment al; while ( reader.GetNextAlignmentCore(al) ) ++alignmentCount; } - // more complicated - region specified + // otherwise attempt to use region as constraint else { + // if region string parses OK BamRegion region; if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - // check if there are index files *.bai/*.bti corresponding to the input files - bool hasDefaultIndex = false; - bool hasBamtoolsIndex = false; - bool hasNoIndex = false; - int defaultIndexCount = 0; - int bamtoolsIndexCount = 0; - for (vector::const_iterator f = m_settings->InputFiles.begin(); f != m_settings->InputFiles.end(); ++f) { - - if ( Utilities::FileExists(*f + ".bai") ) { - hasDefaultIndex = true; - ++defaultIndexCount; - } - - if ( Utilities::FileExists(*f + ".bti") ) { - hasBamtoolsIndex = true; - ++bamtoolsIndexCount; - } - - if ( !hasDefaultIndex && !hasBamtoolsIndex ) { - hasNoIndex = true; - cerr << "*WARNING - could not find index file for " << *f - << ", parsing whole file(s) to get alignment counts for target region" - << " (could be slow)" << endl; - break; - } - } + // attempt to re-open reader with index files + reader.Close(); + bool openedOK = reader.Open(m_settings->InputFiles, true, true ); - // determine if index file types are heterogeneous - bool hasDifferentIndexTypes = false; - if ( defaultIndexCount > 0 && bamtoolsIndexCount > 0 ) { - hasDifferentIndexTypes = true; - cerr << "*WARNING - different index file formats found" - << ", parsing whole file(s) to get alignment counts for target region" - << " (could be slow)" << endl; + // if error + if ( !openedOK ) { + cerr << "ERROR: Could not open input BAM file(s)... Aborting." << endl; + return 1; } - // if any input file has no index, or if input files use different index formats - // can't use BamMultiReader to jump directly (**for now**) - if ( hasNoIndex || hasDifferentIndexTypes ) { - - // read through sequentially, counting all overlapping reads - BamAlignment al; + // if index data available, we can use SetRegion + if ( reader.IsIndexLoaded() ) { + + // attempt to use SetRegion(), if failed report error + if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) { + cerr << "ERROR: Region requested, but could not set BamReader region to REGION: " << m_settings->Region << " Aborting." << endl; + reader.Close(); + return 1; + } + + // everything checks out, just iterate through specified region, counting alignments + while ( reader.GetNextAlignmentCore(al) ) + ++alignmentCount; + } + + // no index data available, we have to iterate through until we + // find overlapping alignments + else { while( reader.GetNextAlignmentCore(al) ) { if ( (al.RefID >= region.LeftRefID) && ( (al.Position + al.Length) >= region.LeftPosition ) && - (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) + (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) { ++alignmentCount; } } } - - // has index file for each input file (and same format) - else { - - // this is kind of a hack...? - BamMultiReader reader; - reader.Open(m_settings->InputFiles, true, true, hasDefaultIndex ); - - if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) { - foundError = true; - errorStream << "Could not set BamReader region to REGION: " << m_settings->Region << endl; - } else { - BamAlignment al; - while ( reader.GetNextAlignmentCore(al) ) - ++alignmentCount; - } - } - - } else { - foundError = true; - errorStream << "Could not parse REGION: " << m_settings->Region << endl; - errorStream << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; + } + + // error parsing REGION string + else { + cerr << "ERROR: Could not parse REGION - " << m_settings->Region << endl; + cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; + reader.Close(); + return 1; } } - - // print errors OR results - if ( foundError ) - cerr << errorStream.str() << endl; - else - cout << alignmentCount << endl; + + // print results + cout << alignmentCount << endl; // clean & exit reader.Close(); - return (int)foundError; + return 0; } diff --git a/src/toolkit/bamtools_filter.cpp b/src/toolkit/bamtools_filter.cpp index af7af5f..5c8f338 100644 --- a/src/toolkit/bamtools_filter.cpp +++ b/src/toolkit/bamtools_filter.cpp @@ -561,12 +561,13 @@ bool FilterTool::FilterToolPrivate::ParseScript(void) { return false; } + // initialize return status + bool success = true; + // see if root object contains multiple filters const Json::Value filters = root["filters"]; if ( !filters.isNull() ) { - bool success = true; - // iterate over any filters found int filterIndex = 0; Json::Value::const_iterator filtersIter = filters.begin(); @@ -575,20 +576,42 @@ bool FilterTool::FilterToolPrivate::ParseScript(void) { Json::Value filter = (*filtersIter); // convert filter index to string - stringstream convert; - convert << filterIndex; - const string filterName = convert.str(); + string filterName; + + // if id tag supplied + const Json::Value id = filter["id"]; + if ( !id.isNull() ) { + filterName = id.asString(); + } + + // use array index + else { + stringstream convert; + convert << filterIndex; + filterName = convert.str(); + } // create & parse filter success &= ParseFilterObject(filterName, filter); - } + + // see if user defined "rule" for these filters + const Json::Value rule = root["rule"]; + if ( !rule.isNull() ) { + cout << "found rule: " << rule.asString() << endl; + } else { + cout << "no rule found!" << endl; + } + return success; } // otherwise, root is the only filter (just contains properties) // create & parse filter named "ROOT" - else return ParseFilterObject("ROOT", root); + else success = ParseFilterObject("ROOT", root); + + // return success/failure + return success; } @@ -613,106 +636,69 @@ bool FilterTool::FilterToolPrivate::Run(void) { bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); writer.Open(m_settings->OutputFilename, headerText, m_references, writeUncompressed); - // set up error handling - ostringstream errorStream(""); - bool foundError(false); + BamAlignment al; - // if no REGION specified, run filter on entire file contents + // if no region specified, filter entire file if ( !m_settings->HasRegion ) { - BamAlignment al; while ( reader.GetNextAlignment(al) ) { - // perform main filter check if ( CheckAlignment(al) ) writer.SaveAlignment(al); } } - // REGION specified + // otherwise attempt to use region as constraint else { - - // attempt to parse string into BamRegion struct + + // if region string parses OK BamRegion region; if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - // check if there are index files *.bai/*.bti corresponding to the input files - bool hasDefaultIndex = false; - bool hasBamtoolsIndex = false; - bool hasNoIndex = false; - int defaultIndexCount = 0; - int bamtoolsIndexCount = 0; - for (vector::const_iterator f = m_settings->InputFiles.begin(); f != m_settings->InputFiles.end(); ++f) { - - if ( Utilities::FileExists(*f + ".bai") ) { - hasDefaultIndex = true; - ++defaultIndexCount; - } - - if ( Utilities::FileExists(*f + ".bti") ) { - hasBamtoolsIndex = true; - ++bamtoolsIndexCount; - } - - if ( !hasDefaultIndex && !hasBamtoolsIndex ) { - hasNoIndex = true; - cerr << "*WARNING - could not find index file for " << *f - << ", parsing whole file(s) to get alignment counts for target region" - << " (could be slow)" << endl; - break; - } - } - - // determine if index file types are heterogeneous - bool hasDifferentIndexTypes = false; - if ( defaultIndexCount > 0 && bamtoolsIndexCount > 0 ) { - hasDifferentIndexTypes = true; - cerr << "*WARNING - different index file formats found" - << ", parsing whole file(s) to get alignment counts for target region" - << " (could be slow)" << endl; - } + // attempt to re-open reader with index files + reader.Close(); + bool openedOK = reader.Open(m_settings->InputFiles, true, true ); - // if any input file has no index, or if input files use different index formats - // can't use BamMultiReader to jump directly (**for now**) - if ( hasNoIndex || hasDifferentIndexTypes ) { - - // read through sequentially, but onlt perform filter on reads overlapping REGION - BamAlignment al; - while( reader.GetNextAlignment(al) ) { - if ( (al.RefID >= region.LeftRefID) && ( (al.Position + al.Length) >= region.LeftPosition ) && - (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) - { - // perform main filter check - if ( CheckAlignment(al) ) - writer.SaveAlignment(al); - } - } + // if error + if ( !openedOK ) { + cerr << "ERROR: Could not open input BAM file(s)... Aborting." << endl; + return 1; } - // has index file for each input file (and same format) - else { - - // this is kind of a hack...? - BamMultiReader reader; - reader.Open(m_settings->InputFiles, true, true, hasDefaultIndex ); + // if index data available, we can use SetRegion + if ( reader.IsIndexLoaded() ) { + // attempt to use SetRegion(), if failed report error if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) { - foundError = true; - errorStream << "Could not set BamReader region to REGION: " << m_settings->Region << endl; - } else { - - // filter only alignments from specified region - BamAlignment al; - while ( reader.GetNextAlignment(al) ) { - // perform main filter check + cerr << "ERROR: Region requested, but could not set BamReader region to REGION: " << m_settings->Region << " Aborting." << endl; + reader.Close(); + return 1; + } + + // everything checks out, just iterate through specified region, filtering alignments + while ( reader.GetNextAlignmentCore(al) ) + if ( CheckAlignment(al) ) + writer.SaveAlignment(al); + } + + // no index data available, we have to iterate through until we + // find overlapping alignments + else { + while( reader.GetNextAlignmentCore(al) ) { + if ( (al.RefID >= region.LeftRefID) && ( (al.Position + al.Length) >= region.LeftPosition ) && + (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) + { if ( CheckAlignment(al) ) writer.SaveAlignment(al); } } } - - } else { - foundError = true; - errorStream << "Could not parse REGION: " << m_settings->Region << endl; - errorStream << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; + } + + // error parsing REGION string + else { + cerr << "ERROR: Could not parse REGION - " << m_settings->Region << endl; + cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; + reader.Close(); + return 1; } } diff --git a/src/toolkit/bamtools_random.cpp b/src/toolkit/bamtools_random.cpp index 2600f42..fe8914f 100644 --- a/src/toolkit/bamtools_random.cpp +++ b/src/toolkit/bamtools_random.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 20 July 2010 (DB) +// Last modified: 3 September 2010 (DB) // --------------------------------------------------------------------------- // Grab a random subset of alignments. // *************************************************************************** @@ -23,8 +23,14 @@ using namespace BamTools; namespace BamTools { - // define constants - const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000; +// define constants +const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000; + +// utility methods for RandomTool +const int getRandomInt(const int& lowerBound, const int& upperBound) { + const int range = (upperBound - lowerBound) + 1; + return ( lowerBound + (int)(range * (double)rand()/((double)RAND_MAX + 1)) ); +} } // namespace BamTools @@ -66,7 +72,7 @@ RandomTool::RandomTool(void) , m_settings(new RandomSettings) { // set program details - Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", "[-in -in ...] [-out ] [-region ]"); + Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", "[-in -in ...] [-out ] [-forceCompression] [-n] [-region ]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); @@ -75,7 +81,7 @@ RandomTool::RandomTool(void) Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts); OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); - Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed (currently)", "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, FilterOpts, RANDOM_MAX_ALIGNMENT_COUNT); + Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed", "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, FilterOpts, RANDOM_MAX_ALIGNMENT_COUNT); Options::AddValueOption("-region", "REGION", "limit source of random alignment subset to a particular genomic region. Index file is recommended for better performance, and is read automatically if it exists as .bai or .bti. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts); } @@ -91,42 +97,59 @@ int RandomTool::Help(void) { int RandomTool::Run(int argc, char* argv[]) { - // TODO: Handle BAM input WITHOUT index files. - // parse command line arguments Options::Parse(argc, argv, 1); - // set to default input if none provided + // set to default stdin if no input files provided if ( !m_settings->HasInput ) m_settings->InputFiles.push_back(Options::StandardIn()); - // open our BAM reader + // open our reader BamMultiReader reader; - reader.Open(m_settings->InputFiles); - string headerText = reader.GetHeaderText(); - RefVector references = reader.GetReferenceData(); - - // check that reference data is available, used for generating random jumps - if ( references.empty() ) { - cerr << "No reference data available... quitting." << endl; + if ( !reader.Open(m_settings->InputFiles) ) { + cerr << "ERROR: Could not open input BAM file(s)." << endl; + return 1; + } + + // make sure index data is available + if ( !reader.IsIndexLoaded() ) { + cerr << "ERROR: Could not load index data for all input BAM file(s)." << endl; + cerr << "\'bamtools random\' requires valid index files to provide efficient performance." << endl; reader.Close(); return 1; + } - - // see if user specified a REGION - BamRegion region; - if ( m_settings->HasRegion ) { - if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) - reader.SetRegion(region); + + // get BamReader metadata + const string headerText = reader.GetHeaderText(); + const RefVector references = reader.GetReferenceData(); + if ( references.empty() ) { + cerr << "ERROR: No reference data available - required to perform random access throughtout input file(s)." << endl; + reader.Close(); + return 1; } - // open writer + // open our writer BamWriter writer; bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); - writer.Open(m_settings->OutputFilename, headerText, references, writeUncompressed); - + if ( !writer.Open(m_settings->OutputFilename, headerText, references, writeUncompressed) ) { + cerr << "ERROR: Could not open BamWriter." << endl; + reader.Close(); + return 1; + } + + // if user specified a REGION constraint, attempt to parse REGION string + BamRegion region; + if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) { + cerr << "ERROR: Could not parse REGION: " << m_settings->Region << endl; + cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; + reader.Close(); + writer.Close(); + return 1; + } + // seed our random number generator - srand (time(NULL) ); + srand( time(NULL) ); // grab random alignments BamAlignment al; @@ -136,37 +159,31 @@ int RandomTool::Run(int argc, char* argv[]) { int randomRefId = 0; int randomPosition = 0; - // use REGION constraints to generate random refId & position + // use REGION constraints to select random refId & position if ( m_settings->HasRegion ) { - int lowestRefId = region.LeftRefID; - int highestRefId = region.RightRefID; - int rangeRefId = (highestRefId - lowestRefId) + 1; - randomRefId = lowestRefId + (int)(rangeRefId * (double)(rand()/((double)RAND_MAX + 1))); + // select a random refId + randomRefId = getRandomInt(region.LeftRefID, region.RightRefID); - int lowestPosition = ( (randomRefId == region.LeftRefID) ? region.LeftPosition : 0 ); - int highestPosition = ( (randomRefId == region.RightRefID) ? region.RightPosition : references.at(randomRefId).RefLength - 1 ); - int rangePosition = (highestPosition - lowestPosition) + 1; - randomPosition = lowestPosition + (int)(rangePosition * (double)(rand()/((double)RAND_MAX + 1))); + // select a random position based on randomRefId + const int lowerBoundPosition = ( (randomRefId == region.LeftRefID) ? region.LeftPosition : 0 ); + const int upperBoundPosition = ( (randomRefId == region.RightRefID) ? region.RightPosition : (references.at(randomRefId).RefLength - 1) ); + randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition); } - // otherwise generate 'normal' random refId & position + // otherwise select from all possible random refId & position else { - // generate random refId - int lowestRefId = 0; - int highestRefId = references.size() - 1; - int rangeRefId = (highestRefId - lowestRefId) + 1; - randomRefId = lowestRefId + (int)(rangeRefId * (double)(rand()/((double)RAND_MAX + 1))); + // select random refId + randomRefId = getRandomInt(0, (int)references.size() - 1); - // generate random position - int lowestPosition = 0; - int highestPosition = references.at(randomRefId).RefLength - 1; - int rangePosition = (highestPosition - lowestPosition) + 1; - randomPosition = lowestPosition + (int)(rangePosition * (double)(rand()/((double)RAND_MAX + 1))); + // select random position based on randomRefId + const int lowerBoundPosition = 0; + const int upperBoundPosition = references.at(randomRefId).RefLength - 1; + randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition); } - // if jump & read successful, save alignment + // if jump & read successful, save first alignment that overlaps random refId & position if ( reader.Jump(randomRefId, randomPosition) ) { while ( reader.GetNextAlignmentCore(al) ) { if ( al.RefID == randomRefId && al.Position >= randomPosition ) { @@ -178,7 +195,7 @@ int RandomTool::Run(int argc, char* argv[]) { } } - // close reader & writer + // cleanup & exit reader.Close(); writer.Close(); return 0; -- 2.39.2