From: Derek Date: Thu, 3 Jun 2010 17:34:13 +0000 (-0400) Subject: Cleaned up CountTool, by extending Utilities::ParseRegionString() to also do validati... X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=44ac262d02ef007262d7504e963234329245dc5f;p=bamtools.git Cleaned up CountTool, by extending Utilities::ParseRegionString() to also do validation. Should be useful for other tools using REGION strings --- diff --git a/bamtools_count.cpp b/bamtools_count.cpp index c84c6ab..67d1e9b 100644 --- a/bamtools_count.cpp +++ b/bamtools_count.cpp @@ -103,123 +103,56 @@ int CountTool::Run(int argc, char* argv[]) { // more complicated - region specified else { - // parse region string for desired region - string startChrom; - string stopChrom; - int startPos; - int stopPos; - if ( Utilities::ParseRegionString(m_settings->Region, startChrom, startPos, stopChrom, stopPos) ) { + Region region; + if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - const RefVector references = reader.GetReferenceData(); - - // ------------------------------- - // validate start ref & position - - int startRefID = reader.GetReferenceID(startChrom); - - // startRefID not found - if ( startRefID == (int)references.size() ) { - foundError = true; - errorStream << "Start chrom: " << startChrom << " not found in BAM file." << endl; - } - - // valid startRefID, check startPos - else { - const RefData& reference = references.at(startRefID); - - // startPos too large - if ( startPos > reference.RefLength ) { + // has index, we can jump directly to + if ( m_settings->HasBamIndexFilename ) { + + // this is kind of a hack...? + // re-open BamReader, this time with the index file + reader.Close(); + reader.Open(m_settings->InputBamFilename, m_settings->BamIndexFilename); + + // attempt Jump(), catch error + if ( !reader.Jump(region.StartChromID, region.StartPosition) ) { foundError = true; - errorStream << "Start pos: " << startPos << " is larger than expected." << endl; + errorStream << "Could not jump to desired REGION: " << m_settings->Region << endl; } } - // ------------------------------- - // validate stop ref & position - - int stopRefID = reader.GetReferenceID(stopChrom); - - // skip if error already found - if ( !foundError ) { + else { - // stopRefID not found - if ( stopRefID == (int)references.size() ) { - foundError = true; - errorStream << "Stop chrom: " << stopChrom << " not found in BAM file." << endl; - } - - // valid stopRefID, check stopPos - else { - const RefData& reference = references.at(stopRefID); - - // stopPos too large - if ( stopPos > reference.RefLength ) { - foundError = true; - errorStream << "Stop pos: " << stopPos << " is larger than expected." << endl; - } - - // no stopPos specified, set to reference end - else if ( stopPos == -1 ) { - stopPos = reference.RefLength; - } + // read through sequentially, until first overlapping read is found + BamAlignment al; + bool alignmentFound(false); + while( reader.GetNextAlignment(al) ) { + if ( (al.RefID == region.StartChromID ) && ( (al.Position + al.Length) >= region.StartPosition) ) { + alignmentFound = true; + break; + } } + + // if overlapping alignment found (not EOF), increment counter + if ( alignmentFound) ++ alignmentCount; } - - // ------------------------------- - // if refs & positions valid, retrieve data + + // ----------------------------- + // count alignments until stop hit if ( !foundError ) { - - // has index, we can jump directly to - if ( m_settings->HasBamIndexFilename ) { - - // this is kind of a hack...? - // re-open BamReader, this time with the index file - reader.Close(); - reader.Open(m_settings->InputBamFilename, m_settings->BamIndexFilename); - - // attempt Jump(), catch error - if ( !reader.Jump(startRefID, startPos) ) { - foundError = true; - errorStream << "Could not jump to desired REGION: " << m_settings->Region << endl; - } - } - - else { - - // read through sequentially, until first overlapping read is found - BamAlignment al; - bool alignmentFound(false); - while( reader.GetNextAlignment(al) ) { - if ( (al.RefID == startRefID ) && ( (al.Position + al.Length) >= startPos) ) { - alignmentFound = true; - break; - } - } - - // if overlapping alignment found (not EOF), increment counter - if ( alignmentFound) ++ alignmentCount; - } - - // ----------------------------- - // count alignments until stop hit - - if ( !foundError ) { - // while valid alignment AND - // ( either current ref is before stopRefID OR - // current ref stopRefID but we're before stopPos ) - BamAlignment al; - while ( reader.GetNextAlignment(al) && ((al.RefID < stopRefID ) || (al.RefID == stopRefID && al.Position <= stopPos)) ) - ++alignmentCount; - } + // while valid alignment AND + // ( either current ref is before stopRefID OR + // current ref stopRefID but we're before stopPos ) + BamAlignment al; + while ( reader.GetNextAlignment(al) && ((al.RefID < region.StopChromID ) || (al.RefID == region.StopChromID && al.Position <= region.StopPosition)) ) + ++alignmentCount; } - } - - // could not parse REGION string, set error - else { + + } else { foundError = true; - errorStream << "Could not parse desired REGION: " << m_settings->Region << endl; - errorStream << "Please see README for details on accepted REGION formats" << endl; + errorStream << "Could not parse REGION: " << m_settings->Region << endl; + errorStream << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; } } diff --git a/bamtools_utilities.cpp b/bamtools_utilities.cpp index b5440df..e3ab868 100644 --- a/bamtools_utilities.cpp +++ b/bamtools_utilities.cpp @@ -9,26 +9,31 @@ // *************************************************************************** #include -#include #include "bamtools_utilities.h" +#include "BamReader.h" using namespace std; using namespace BamTools; -// Parses a REGION string, stores in (startChrom, startPos, stopChrom, stopPos) variables -// Returns successful parse (true/false) -bool Utilities::ParseRegionString(const string& regionString, string& startChrom, int& startPos, string& stopChrom, int& stopPos) { - - // shouldn't call this function with empty string but worth checking - // checked first for clarity purposes later on, since we can assume at least some content in the string - if ( regionString.empty() ) { - cerr << "Empty REGION. Usual format (e.g. chr2:1000..2000). See README for more detailed uses." << endl; - return false; - } +// Parses a region string, does validation (valid ID's, positions), stores in Region struct +bool Utilities::ParseRegionString(const std::string& regionString, const BamReader& reader, Region& region) { + // ------------------------------- + // parse region string + + // check first for empty string + if ( regionString.empty() ) + return false; + // non-empty string, look for a colom size_t foundFirstColon = regionString.find(':'); + // store chrom strings, and numeric positions + string startChrom; + string stopChrom; + int startPos; + int stopPos; + // no colon found // going to use entire contents of requested chromosome // just store entire region string as startChrom name @@ -38,7 +43,6 @@ bool Utilities::ParseRegionString(const string& regionString, string& startChrom startPos = 0; stopChrom = regionString; stopPos = -1; - return true; } // colon found, so we at least have some sort of startPos requested @@ -57,7 +61,6 @@ bool Utilities::ParseRegionString(const string& regionString, string& startChrom startPos = atoi( regionString.substr(foundFirstColon+1).c_str() ); stopChrom = startChrom; stopPos = -1; - return true; } // ".." found, so we have some sort of range selected @@ -74,7 +77,6 @@ bool Utilities::ParseRegionString(const string& regionString, string& startChrom if ( foundSecondColon == string::npos ) { stopChrom = startChrom; stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() ); - return true; } // second colon found @@ -82,12 +84,41 @@ bool Utilities::ParseRegionString(const string& regionString, string& startChrom else { stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2)); stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() ); - return true; } } } - - // shouldn't get here - any code path that does? - // if not, what does true/false really signify? - return false; -} \ No newline at end of file + + // ------------------------------- + // validate reference IDs & genomic positions + + const RefVector references = reader.GetReferenceData(); + + // if startRefID not found, return false + int startRefID = reader.GetReferenceID(startChrom); + if ( startRefID == (int)references.size() ) return false; + + // if startPos is larger than reference, return false + const RefData& startReference = references.at(startRefID); + if ( startPos > startReference.RefLength ) return false; + + // if stopRefID not found, return false + int stopRefID = reader.GetReferenceID(stopChrom); + if ( stopRefID == (int)references.size() ) return false; + + // if stopPosition larger than reference, return false + const RefData& stopReference = references.at(stopRefID); + if ( stopPos > stopReference.RefLength ) return false; + + // if no stopPosition specified, set to reference end + if ( stopPos == -1 )stopPos = stopReference.RefLength; + + // ------------------------------- + // set up Region struct & return + + region.StartChromID = startRefID; + region.StopChromID = stopRefID; + region.StartPosition = startPos; + region.StopPosition = stopPos; + return true; +} + diff --git a/bamtools_utilities.h b/bamtools_utilities.h index 8a5d36c..1e280f1 100644 --- a/bamtools_utilities.h +++ b/bamtools_utilities.h @@ -16,17 +16,19 @@ namespace BamTools { class BamReader; - + +struct Region { + int StartChromID; + int StopChromID; + int StartPosition; + int StopPosition; +}; + class Utilities { - public: - // Parses a REGION string, stores in (startChrom, startPos, stopChrom, stopPos) variables - // Returns successful parse (true/false) - static bool ParseRegionString(const std::string& regionString, - std::string& startChrom, - int& startPos, - std::string& stopChrom, - int& stopPos); + public: + // Parses a region string, uses reader to do validation (valid ID's, positions), stores in Region struct + static bool ParseRegionString(const std::string& regionString, const BamReader& reader, Region& region); }; } // namespace BamTools