// ***************************************************************************
#include <cstdlib>
-#include <iostream>
#include "bamtools_utilities.h"
+#include "BamReader.h"
+#include "BamMultiReader.h"
using namespace std;
using namespace BamTools;
-// Parses a REGION string, stores in (startChrom, startPos, stopChrom, stopPos) variables
-// Returns successful parse (true/false)
-bool Utilities::ParseRegionString(const string& regionString, string& startChrom, int& startPos, string& stopChrom, int& stopPos) {
-
- // shouldn't call this function with empty string but worth checking
- // checked first for clarity purposes later on, since we can assume at least some content in the string
- if ( regionString.empty() ) {
- cerr << "Empty REGION. Usual format (e.g. chr2:1000..2000). See README for more detailed uses." << endl;
- return false;
- }
+// Parses a region string, does validation (valid ID's, positions), stores in Region struct
+// Returns success (true/false)
+bool Utilities::ParseRegionString(const std::string& regionString, const BamReader& reader, Region& region) {
+
+ // -------------------------------
+ // parse region string
+ // check first for empty string
+ if ( regionString.empty() )
+ return false;
+
// non-empty string, look for a colom
size_t foundFirstColon = regionString.find(':');
+ // store chrom strings, and numeric positions
+ string startChrom;
+ string stopChrom;
+ int startPos;
+ int stopPos;
+
// no colon found
// going to use entire contents of requested chromosome
// just store entire region string as startChrom name
// use BamReader methods to check if its valid for current BAM file
if ( foundFirstColon == string::npos ) {
startChrom = regionString;
- startPos = -1; // ** not sure about these defaults (should stopChrom == startChrom if same?)
- stopChrom = "";
+ startPos = 0;
+ stopChrom = regionString;
stopPos = -1;
- return true;
}
// colon found, so we at least have some sort of startPos requested
// store contents before colon as startChrom, after as startPos
if ( foundRangeDots == string::npos ) {
startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
- stopChrom = "";
+ stopChrom = startChrom;
stopPos = -1;
- return true;
}
// ".." found, so we have some sort of range selected
// no second colon found
// so we have a "standard" chrom:start..stop input format (on single chrom)
if ( foundSecondColon == string::npos ) {
- stopChrom = "";
+ stopChrom = startChrom;
stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
- return true;
}
// second colon found
// so we have a range requested across 2 chrom's
else {
- stopChrom = regionString.substr(foundRangeDots+2, regionString.length()-foundSecondColon-1);
+ stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
- return true;
}
}
}
+
+ // -------------------------------
+ // validate reference IDs & genomic positions
+
+ const RefVector references = reader.GetReferenceData();
+
+ // if startRefID not found, return false
+ int startRefID = reader.GetReferenceID(startChrom);
+ if ( startRefID == (int)references.size() ) return false;
+
+ // if startPos is larger than reference, return false
+ const RefData& startReference = references.at(startRefID);
+ if ( startPos > startReference.RefLength ) return false;
+
+ // if stopRefID not found, return false
+ int stopRefID = reader.GetReferenceID(stopChrom);
+ if ( stopRefID == (int)references.size() ) return false;
+
+ // if stopPosition larger than reference, return false
+ const RefData& stopReference = references.at(stopRefID);
+ if ( stopPos > stopReference.RefLength ) return false;
+
+ // if no stopPosition specified, set to reference end
+ if ( stopPos == -1 ) stopPos = stopReference.RefLength;
+
+ // -------------------------------
+ // set up Region struct & return
+
+ region.StartChromID = startRefID;
+ region.StopChromID = stopRefID;
+ region.StartPosition = startPos;
+ region.StopPosition = stopPos;
+ return true;
+}
+
+// Same as ParseRegionString() above, but accepts a BamMultiReader
+bool Utilities::ParseRegionString(const std::string& regionString, const BamMultiReader& reader, Region& region) {
+
+ // -------------------------------
+ // parse region string
- // shouldn't get here - any code path that does?
- // if not, what does true/false really signify?
- return false;
-}
\ No newline at end of file
+ // check first for empty string
+ if ( regionString.empty() )
+ return false;
+
+ // non-empty string, look for a colom
+ size_t foundFirstColon = regionString.find(':');
+
+ // store chrom strings, and numeric positions
+ string startChrom;
+ string stopChrom;
+ int startPos;
+ int stopPos;
+
+ // no colon found
+ // going to use entire contents of requested chromosome
+ // just store entire region string as startChrom name
+ // use BamReader methods to check if its valid for current BAM file
+ if ( foundFirstColon == string::npos ) {
+ startChrom = regionString;
+ startPos = 0;
+ stopChrom = regionString;
+ stopPos = -1;
+ }
+
+ // colon found, so we at least have some sort of startPos requested
+ else {
+
+ // store start chrom from beginning to first colon
+ startChrom = regionString.substr(0,foundFirstColon);
+
+ // look for ".." after the colon
+ size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
+
+ // no dots found
+ // so we have a startPos but no range
+ // store contents before colon as startChrom, after as startPos
+ if ( foundRangeDots == string::npos ) {
+ startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
+ stopChrom = startChrom;
+ stopPos = -1;
+ }
+
+ // ".." found, so we have some sort of range selected
+ else {
+
+ // store startPos between first colon and range dots ".."
+ startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
+
+ // look for second colon
+ size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
+
+ // no second colon found
+ // so we have a "standard" chrom:start..stop input format (on single chrom)
+ if ( foundSecondColon == string::npos ) {
+ stopChrom = startChrom;
+ stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
+ }
+
+ // second colon found
+ // so we have a range requested across 2 chrom's
+ else {
+ stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
+ stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
+ }
+ }
+ }
+
+ // -------------------------------
+ // validate reference IDs & genomic positions
+
+ const RefVector references = reader.GetReferenceData();
+
+ // if startRefID not found, return false
+ int startRefID = reader.GetReferenceID(startChrom);
+ if ( startRefID == (int)references.size() ) return false;
+
+ // if startPos is larger than reference, return false
+ const RefData& startReference = references.at(startRefID);
+ if ( startPos > startReference.RefLength ) return false;
+
+ // if stopRefID not found, return false
+ int stopRefID = reader.GetReferenceID(stopChrom);
+ if ( stopRefID == (int)references.size() ) return false;
+
+ // if stopPosition larger than reference, return false
+ const RefData& stopReference = references.at(stopRefID);
+ if ( stopPos > stopReference.RefLength ) return false;
+
+ // if no stopPosition specified, set to reference end
+ if ( stopPos == -1 ) stopPos = stopReference.RefLength;
+
+ // -------------------------------
+ // set up Region struct & return
+
+ region.StartChromID = startRefID;
+ region.StopChromID = stopRefID;
+ region.StartPosition = startPos;
+ region.StopPosition = stopPos;
+ return true;
+}