From: Derek Date: Wed, 2 Jun 2010 18:13:24 +0000 (-0400) Subject: Implemented CountTool, cleaned up MergeTool. X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=0a62b75d4a37e7f6bb0b45ff3eff3c0d1369e067;p=bamtools.git Implemented CountTool, cleaned up MergeTool. --- diff --git a/bamtools_count.cpp b/bamtools_count.cpp index 1560e1c..47e9965 100644 --- a/bamtools_count.cpp +++ b/bamtools_count.cpp @@ -12,6 +12,7 @@ // *************************************************************************** #include +#include #include #include @@ -29,16 +30,19 @@ using namespace BamTools; struct CountTool::CountSettings { // flags + bool HasBamIndexFilename; bool HasInputBamFilename; bool HasRegion; // filenames - std::string InputBamFilename; - std::string Region; + string BamIndexFilename; + string InputBamFilename; + string Region; // constructor CountSettings(void) - : HasInputBamFilename(false) + : HasBamIndexFilename(false) + , HasInputBamFilename(false) , HasRegion(false) , InputBamFilename(Options::StandardIn()) { } @@ -52,14 +56,15 @@ CountTool::CountTool(void) , m_settings(new CountSettings) { // set program details - Options::SetProgramInfo("bamtools count", "prints alignment counts for a BAM file", "-in "); + Options::SetProgramInfo("bamtools count", "prints alignment counts for a BAM file", "-in [-region REGION -index ]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-index", "BAM index filename", "the BAM index file", "", m_settings->HasBamIndexFilename, m_settings->BamIndexFilename, IO_Opts); OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); - Options::AddValueOption("-region", "REGION", "genomic region. See README for more details", "", m_settings->HasRegion, m_settings->Region, FilterOpts); + Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for optimal performance.", "", m_settings->HasRegion, m_settings->Region, FilterOpts); } CountTool::~CountTool(void) { @@ -78,31 +83,151 @@ int CountTool::Run(int argc, char* argv[]) { Options::Parse(argc, argv, 1); //open our BAM reader -// BamReader reader; -// reader.Open(m_settings.InputBamFilename); - - // count alignments - string startChrom; - string stopChrom; - int startPos; - int stopPos; + BamReader reader; + reader.Open(m_settings->InputBamFilename); + + // alignment counter + int alignmentCount(0); + + // set up error handling + ostringstream errorStream(""); + bool foundError(false); + // if no region specified, count entire file if ( !m_settings->HasRegion ) { - cerr << "Counting all alignments " << endl; + BamAlignment al; + while ( reader.GetNextAlignment(al) ) + ++alignmentCount; } else { + + // parse region string for desired region + string startChrom; + string stopChrom; + int startPos; + int stopPos; if ( Utilities::ParseRegionString(m_settings->Region, startChrom, startPos, stopChrom, stopPos) ) { - cerr << "Counting only alignments in region " << m_settings->Region << endl; - cerr << "StartChrom: " << startChrom << endl; - cerr << "StartPos: " << startPos << endl; - cerr << "StopChrom: " << stopChrom << endl; - cerr << "StopPos: " << stopPos << endl; + + const RefVector references = reader.GetReferenceData(); + + // ------------------------------- + // validate start ref & position + + int startRefID = reader.GetReferenceID(startChrom); + cout << "startRefID: " << startRefID << endl; + + // startRefID not found + if ( startRefID == (int)references.size() ) { + foundError = true; + errorStream << "Start chrom: " << startChrom << " not found in BAM file." << endl; + } + + // valid startRefID, check startPos + else { + const RefData& reference = references.at(startRefID); + + // startPos too large + if ( startPos > reference.RefLength ) { + foundError = true; + errorStream << "Start pos: " << startPos << " is larger than expected." << endl; + } + } + + // ------------------------------- + // validate stop ref & position + + int stopRefID = reader.GetReferenceID(stopChrom); + + // skip if error already found + if ( !foundError ) { + + // stopRefID not found + if ( stopRefID == (int)references.size() ) { + foundError = true; + errorStream << "Stop chrom: " << stopChrom << " not found in BAM file." << endl; + } + + // valid stopRefID, check stopPos + else { + const RefData& reference = references.at(stopRefID); + + // stopPos too large + if ( stopPos > reference.RefLength ) { + foundError = true; + errorStream << "Stop pos: " << stopPos << " is larger than expected." << endl; + } + + // no stopPos specified, set to reference end + else if ( stopPos == -1 ) { + stopPos = reference.RefLength; + } + } + } + + // ------------------------------- + // if refs & positions valid, retrieve data + + if ( !foundError ) { + + // has index, we can jump directly to + if ( m_settings->HasBamIndexFilename ) { + + // this is kind of a hack...? + // re-open BamReader, this time with the index file + reader.Close(); + reader.Open(m_settings->InputBamFilename, m_settings->BamIndexFilename); + + // attempt Jump(), catch error + if ( !reader.Jump(startRefID, startPos) ) { + foundError = true; + errorStream << "Could not jump to desired REGION: " << m_settings->Region << endl; + } + } + + else { + + // read through sequentially, until first overlapping read is found + BamAlignment al; + bool alignmentFound(false); + while( reader.GetNextAlignment(al) ) { + if ( (al.RefID == startRefID ) && ( (al.Position + al.Length) >= startPos) ) { + alignmentFound = true; + break; + } + } + + // if overlapping alignment found (not EOF), increment counter + if ( alignmentFound) ++ alignmentCount; + } + + // ----------------------------- + // count alignments until + + if ( !foundError ) { + // while valid alignment AND + // ( either current ref is before stopRefID OR + // current ref stopRefID but we're before stopPos ) + BamAlignment al; + while ( reader.GetNextAlignment(al) && ((al.RefID < stopRefID ) || (al.RefID == stopRefID && al.Position <= stopPos)) ) + ++alignmentCount; + } + } + } + + // could not parse REGION string, set error + else { + foundError = true; + errorStream << "Could not parse desired REGION: " << m_settings->Region << endl; + errorStream << "Please see README for details on accepted REGION formats" << endl; } } - cerr << " from " << m_settings->InputBamFilename << endl; - cerr << "FEATURE NOT YET IMPLEMENTED!" << endl; - + // print errors OR results + if ( foundError ) + cerr << errorStream.str() << endl; + else + cout << alignmentCount << endl; + // clean & exit -// reader.Close(); - return 0; + reader.Close(); + return (int)foundError; } \ No newline at end of file diff --git a/bamtools_merge.cpp b/bamtools_merge.cpp index a6e79c8..402d377 100644 --- a/bamtools_merge.cpp +++ b/bamtools_merge.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 26 May 2010 +// Last modified: 2 June 2010 // --------------------------------------------------------------------------- // Merges multiple BAM files into one. // @@ -32,20 +32,20 @@ struct MergeTool::MergeSettings { // flags bool HasInputBamFilename; bool HasOutputBamFilename; - bool HasRegion; +// bool HasRegion; // filenames vector InputFiles; // other parameters string OutputFilename; - string Region; +// string Region; // constructor MergeSettings(void) : HasInputBamFilename(false) , HasOutputBamFilename(false) - , HasRegion(false) +// , HasRegion(false) , OutputFilename(Options::StandardOut()) { } }; @@ -58,15 +58,15 @@ MergeTool::MergeTool(void) , m_settings(new MergeSettings) { // set program details - Options::SetProgramInfo("bamtools merge", "merges multiple BAM files into one", "[-in ...] [-region REGION] [-out ]"); + Options::SetProgramInfo("bamtools merge", "merges multiple BAM files into one", "[-in -in ...] [-out ]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts); + Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts); - OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); - Options::AddValueOption("-region", "REGION", "genomic region. See README for more details", "", m_settings->HasRegion, m_settings->Region, FilterOpts); +// OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); +// Options::AddValueOption("-region", "REGION", "genomic region. See README for more details", "", m_settings->HasRegion, m_settings->Region, FilterOpts); } MergeTool::~MergeTool(void) { @@ -87,35 +87,26 @@ int MergeTool::Run(int argc, char* argv[]) { // set to default input if none provided if ( !m_settings->HasInputBamFilename ) m_settings->InputFiles.push_back(Options::StandardIn()); -// // opens the BAM files without checking for indexes -// BamMultiReader reader; -// reader.Open(m_settings->InputFiles, false); -// -// // retrieve header & reference dictionary info -// std::string mergedHeader = reader.GetHeaderText(); -// RefVector references = reader.GetReferenceData(); -// -// // open BamWriter -// BamWriter writer; -// writer.Open(m_settings->OutputFilename, mergedHeader, references); -// -// // if desired region provided -// if ( m_settings->HasRegion ) { -// // parse region string -// // only get alignments from this region -// } -// -// // else get all alignments -// else { -// // store alignments to output file -// BamAlignment bAlignment; -// while (reader.GetNextAlignment(bAlignment)) { -// writer.SaveAlignment(bAlignment); -// } -// } -// -// // clean & exit -// reader.Close(); -// writer.Close(); + // opens the BAM files without checking for indexes + BamMultiReader reader; + reader.Open(m_settings->InputFiles, false); + + // retrieve header & reference dictionary info + std::string mergedHeader = reader.GetHeaderText(); + RefVector references = reader.GetReferenceData(); + + // open BamWriter + BamWriter writer; + writer.Open(m_settings->OutputFilename, mergedHeader, references); + + // store alignments to output file + BamAlignment bAlignment; + while (reader.GetNextAlignment(bAlignment)) { + writer.SaveAlignment(bAlignment); + } + + // clean & exit + reader.Close(); + writer.Close(); return 0; } diff --git a/bamtools_utilities.cpp b/bamtools_utilities.cpp index 8eea9a6..b5440df 100644 --- a/bamtools_utilities.cpp +++ b/bamtools_utilities.cpp @@ -35,8 +35,8 @@ bool Utilities::ParseRegionString(const string& regionString, string& startChrom // use BamReader methods to check if its valid for current BAM file if ( foundFirstColon == string::npos ) { startChrom = regionString; - startPos = -1; // ** not sure about these defaults (should stopChrom == startChrom if same?) - stopChrom = ""; + startPos = 0; + stopChrom = regionString; stopPos = -1; return true; } @@ -55,7 +55,7 @@ bool Utilities::ParseRegionString(const string& regionString, string& startChrom // store contents before colon as startChrom, after as startPos if ( foundRangeDots == string::npos ) { startPos = atoi( regionString.substr(foundFirstColon+1).c_str() ); - stopChrom = ""; + stopChrom = startChrom; stopPos = -1; return true; } @@ -72,7 +72,7 @@ bool Utilities::ParseRegionString(const string& regionString, string& startChrom // no second colon found // so we have a "standard" chrom:start..stop input format (on single chrom) if ( foundSecondColon == string::npos ) { - stopChrom = ""; + stopChrom = startChrom; stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() ); return true; } @@ -80,7 +80,7 @@ bool Utilities::ParseRegionString(const string& regionString, string& startChrom // second colon found // so we have a range requested across 2 chrom's else { - stopChrom = regionString.substr(foundRangeDots+2, regionString.length()-foundSecondColon-1); + stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2)); stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() ); return true; } diff --git a/bamtools_utilities.h b/bamtools_utilities.h index 79462c7..8a5d36c 100644 --- a/bamtools_utilities.h +++ b/bamtools_utilities.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 27 May 2010 +// Last modified: 2 June 2010 // --------------------------------------------------------------------------- // Provides general utilities used by BamTools sub-tools. // *************************************************************************** @@ -11,89 +11,23 @@ #ifndef BAMTOOLS_UTILITIES_H #define BAMTOOLS_UTILITIES_H -#include -#include #include namespace BamTools { -// Parses a REGION string, stores in (startChrom, startPos, stopChrom, stopPos) variables -// Returns successful parse (true/false) -static inline -bool ParseRegionString(const std::string& regionString, std::string& startChrom, int& startPos, std::string& stopChrom, int& stopPos) { - - // shouldn't call this function with empty string but worth checking - // checked first for clarity purposes later on, since we can assume at least some content in the string - if ( regionString.empty() ) { - std::cerr << "Empty REGION. Usual format (e.g. chr2:1000..2000). See README for more detailed uses." << std::endl; - return false; - } +class BamReader; - // non-empty string, look for a colom - size_t foundFirstColon = regionString.find(':'); - - // no colon found - // going to use entire contents of requested chromosome - // just store entire region string as startChrom name - // use BamReader methods to check if its valid for current BAM file - if ( foundFirstColon == std::string::npos ) { - startChrom = regionString; - startPos = -1; // ** not sure about these defaults (should stopChrom == startChrom if same?) - stopChrom = ""; - stopPos = -1; - return true; - } - - // colon found, so we at least have some sort of startPos requested - else { - - // store start chrom from beginning to first colon - startChrom = regionString.substr(0,foundFirstColon); - - // look for ".." after the colon - size_t foundRangeDots = regionString.find("..", foundFirstColon+1); - - // no dots found - // so we have a startPos but no range - // store contents before colon as startChrom, after as startPos - if ( foundRangeDots == std::string::npos ) { - startPos = atoi( regionString.substr(foundFirstColon+1).c_str() ); - stopChrom = ""; - stopPos = -1; - return true; - } - - // ".." found, so we have some sort of range selected - else { - - // store startPos between first colon and range dots ".." - startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() ); - - // look for second colon - size_t foundSecondColon = regionString.find(':', foundRangeDots+1); - - // no second colon found - // so we have a "standard" chrom:start..stop input format (on single chrom) - if ( foundSecondColon == std::string::npos ) { - stopChrom = ""; - stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() ); - return true; - } - - // second colon found - // so we have a range requested across 2 chrom's - else { - stopChrom = regionString.substr(foundRangeDots+2, regionString.length()-foundSecondColon-1); - stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() ); - return true; - } - } - } +class Utilities { - // shouldn't get here - any code path that does? - // if not, what does true/false really signify? - return false; -} + public: + // Parses a REGION string, stores in (startChrom, startPos, stopChrom, stopPos) variables + // Returns successful parse (true/false) + static bool ParseRegionString(const std::string& regionString, + std::string& startChrom, + int& startPos, + std::string& stopChrom, + int& stopPos); +}; } // namespace BamTools