// ***************************************************************************
#include <iostream>
+#include <sstream>
#include <string>
#include <vector>
struct CountTool::CountSettings {
// flags
+ bool HasBamIndexFilename;
bool HasInputBamFilename;
bool HasRegion;
// filenames
- std::string InputBamFilename;
- std::string Region;
+ string BamIndexFilename;
+ string InputBamFilename;
+ string Region;
// constructor
CountSettings(void)
- : HasInputBamFilename(false)
+ : HasBamIndexFilename(false)
+ , HasInputBamFilename(false)
, HasRegion(false)
, InputBamFilename(Options::StandardIn())
{ }
, m_settings(new CountSettings)
{
// set program details
- Options::SetProgramInfo("bamtools count", "prints alignment counts for a BAM file", "-in <filename> ");
+ Options::SetProgramInfo("bamtools count", "prints alignment counts for a BAM file", "-in <filename> [-region REGION -index <filename>]");
// set up options
OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn());
+ Options::AddValueOption("-index", "BAM index filename", "the BAM index file", "", m_settings->HasBamIndexFilename, m_settings->BamIndexFilename, IO_Opts);
OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
- Options::AddValueOption("-region", "REGION", "genomic region. See README for more details", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
+ Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for optimal performance.", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
}
CountTool::~CountTool(void) {
Options::Parse(argc, argv, 1);
//open our BAM reader
-// BamReader reader;
-// reader.Open(m_settings.InputBamFilename);
-
- // count alignments
- string startChrom;
- string stopChrom;
- int startPos;
- int stopPos;
+ BamReader reader;
+ reader.Open(m_settings->InputBamFilename);
+
+ // alignment counter
+ int alignmentCount(0);
+
+ // set up error handling
+ ostringstream errorStream("");
+ bool foundError(false);
+ // if no region specified, count entire file
if ( !m_settings->HasRegion ) {
- cerr << "Counting all alignments " << endl;
+ BamAlignment al;
+ while ( reader.GetNextAlignment(al) )
+ ++alignmentCount;
} else {
+
+ // parse region string for desired region
+ string startChrom;
+ string stopChrom;
+ int startPos;
+ int stopPos;
if ( Utilities::ParseRegionString(m_settings->Region, startChrom, startPos, stopChrom, stopPos) ) {
- cerr << "Counting only alignments in region " << m_settings->Region << endl;
- cerr << "StartChrom: " << startChrom << endl;
- cerr << "StartPos: " << startPos << endl;
- cerr << "StopChrom: " << stopChrom << endl;
- cerr << "StopPos: " << stopPos << endl;
+
+ const RefVector references = reader.GetReferenceData();
+
+ // -------------------------------
+ // validate start ref & position
+
+ int startRefID = reader.GetReferenceID(startChrom);
+ cout << "startRefID: " << startRefID << endl;
+
+ // startRefID not found
+ if ( startRefID == (int)references.size() ) {
+ foundError = true;
+ errorStream << "Start chrom: " << startChrom << " not found in BAM file." << endl;
+ }
+
+ // valid startRefID, check startPos
+ else {
+ const RefData& reference = references.at(startRefID);
+
+ // startPos too large
+ if ( startPos > reference.RefLength ) {
+ foundError = true;
+ errorStream << "Start pos: " << startPos << " is larger than expected." << endl;
+ }
+ }
+
+ // -------------------------------
+ // validate stop ref & position
+
+ int stopRefID = reader.GetReferenceID(stopChrom);
+
+ // skip if error already found
+ if ( !foundError ) {
+
+ // stopRefID not found
+ if ( stopRefID == (int)references.size() ) {
+ foundError = true;
+ errorStream << "Stop chrom: " << stopChrom << " not found in BAM file." << endl;
+ }
+
+ // valid stopRefID, check stopPos
+ else {
+ const RefData& reference = references.at(stopRefID);
+
+ // stopPos too large
+ if ( stopPos > reference.RefLength ) {
+ foundError = true;
+ errorStream << "Stop pos: " << stopPos << " is larger than expected." << endl;
+ }
+
+ // no stopPos specified, set to reference end
+ else if ( stopPos == -1 ) {
+ stopPos = reference.RefLength;
+ }
+ }
+ }
+
+ // -------------------------------
+ // if refs & positions valid, retrieve data
+
+ if ( !foundError ) {
+
+ // has index, we can jump directly to
+ if ( m_settings->HasBamIndexFilename ) {
+
+ // this is kind of a hack...?
+ // re-open BamReader, this time with the index file
+ reader.Close();
+ reader.Open(m_settings->InputBamFilename, m_settings->BamIndexFilename);
+
+ // attempt Jump(), catch error
+ if ( !reader.Jump(startRefID, startPos) ) {
+ foundError = true;
+ errorStream << "Could not jump to desired REGION: " << m_settings->Region << endl;
+ }
+ }
+
+ else {
+
+ // read through sequentially, until first overlapping read is found
+ BamAlignment al;
+ bool alignmentFound(false);
+ while( reader.GetNextAlignment(al) ) {
+ if ( (al.RefID == startRefID ) && ( (al.Position + al.Length) >= startPos) ) {
+ alignmentFound = true;
+ break;
+ }
+ }
+
+ // if overlapping alignment found (not EOF), increment counter
+ if ( alignmentFound) ++ alignmentCount;
+ }
+
+ // -----------------------------
+ // count alignments until
+
+ if ( !foundError ) {
+ // while valid alignment AND
+ // ( either current ref is before stopRefID OR
+ // current ref stopRefID but we're before stopPos )
+ BamAlignment al;
+ while ( reader.GetNextAlignment(al) && ((al.RefID < stopRefID ) || (al.RefID == stopRefID && al.Position <= stopPos)) )
+ ++alignmentCount;
+ }
+ }
+ }
+
+ // could not parse REGION string, set error
+ else {
+ foundError = true;
+ errorStream << "Could not parse desired REGION: " << m_settings->Region << endl;
+ errorStream << "Please see README for details on accepted REGION formats" << endl;
}
}
- cerr << " from " << m_settings->InputBamFilename << endl;
- cerr << "FEATURE NOT YET IMPLEMENTED!" << endl;
-
+ // print errors OR results
+ if ( foundError )
+ cerr << errorStream.str() << endl;
+ else
+ cout << alignmentCount << endl;
+
// clean & exit
-// reader.Close();
- return 0;
+ reader.Close();
+ return (int)foundError;
}
\ No newline at end of file
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 26 May 2010
+// Last modified: 2 June 2010
// ---------------------------------------------------------------------------
// Merges multiple BAM files into one.
//
// flags
bool HasInputBamFilename;
bool HasOutputBamFilename;
- bool HasRegion;
+// bool HasRegion;
// filenames
vector<string> InputFiles;
// other parameters
string OutputFilename;
- string Region;
+// string Region;
// constructor
MergeSettings(void)
: HasInputBamFilename(false)
, HasOutputBamFilename(false)
- , HasRegion(false)
+// , HasRegion(false)
, OutputFilename(Options::StandardOut())
{ }
};
, m_settings(new MergeSettings)
{
// set program details
- Options::SetProgramInfo("bamtools merge", "merges multiple BAM files into one", "[-in <filename> ...] [-region REGION] [-out <filename>]");
+ Options::SetProgramInfo("bamtools merge", "merges multiple BAM files into one", "[-in <filename> -in <filename> ...] [-out <filename>]");
// set up options
OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
- Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts);
- Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts);
+ Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts);
+ Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts);
- OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
- Options::AddValueOption("-region", "REGION", "genomic region. See README for more details", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
+// OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
+// Options::AddValueOption("-region", "REGION", "genomic region. See README for more details", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
}
MergeTool::~MergeTool(void) {
// set to default input if none provided
if ( !m_settings->HasInputBamFilename ) m_settings->InputFiles.push_back(Options::StandardIn());
-// // opens the BAM files without checking for indexes
-// BamMultiReader reader;
-// reader.Open(m_settings->InputFiles, false);
-//
-// // retrieve header & reference dictionary info
-// std::string mergedHeader = reader.GetHeaderText();
-// RefVector references = reader.GetReferenceData();
-//
-// // open BamWriter
-// BamWriter writer;
-// writer.Open(m_settings->OutputFilename, mergedHeader, references);
-//
-// // if desired region provided
-// if ( m_settings->HasRegion ) {
-// // parse region string
-// // only get alignments from this region
-// }
-//
-// // else get all alignments
-// else {
-// // store alignments to output file
-// BamAlignment bAlignment;
-// while (reader.GetNextAlignment(bAlignment)) {
-// writer.SaveAlignment(bAlignment);
-// }
-// }
-//
-// // clean & exit
-// reader.Close();
-// writer.Close();
+ // opens the BAM files without checking for indexes
+ BamMultiReader reader;
+ reader.Open(m_settings->InputFiles, false);
+
+ // retrieve header & reference dictionary info
+ std::string mergedHeader = reader.GetHeaderText();
+ RefVector references = reader.GetReferenceData();
+
+ // open BamWriter
+ BamWriter writer;
+ writer.Open(m_settings->OutputFilename, mergedHeader, references);
+
+ // store alignments to output file
+ BamAlignment bAlignment;
+ while (reader.GetNextAlignment(bAlignment)) {
+ writer.SaveAlignment(bAlignment);
+ }
+
+ // clean & exit
+ reader.Close();
+ writer.Close();
return 0;
}
// use BamReader methods to check if its valid for current BAM file
if ( foundFirstColon == string::npos ) {
startChrom = regionString;
- startPos = -1; // ** not sure about these defaults (should stopChrom == startChrom if same?)
- stopChrom = "";
+ startPos = 0;
+ stopChrom = regionString;
stopPos = -1;
return true;
}
// store contents before colon as startChrom, after as startPos
if ( foundRangeDots == string::npos ) {
startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
- stopChrom = "";
+ stopChrom = startChrom;
stopPos = -1;
return true;
}
// no second colon found
// so we have a "standard" chrom:start..stop input format (on single chrom)
if ( foundSecondColon == string::npos ) {
- stopChrom = "";
+ stopChrom = startChrom;
stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
return true;
}
// second colon found
// so we have a range requested across 2 chrom's
else {
- stopChrom = regionString.substr(foundRangeDots+2, regionString.length()-foundSecondColon-1);
+ stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
return true;
}
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 27 May 2010
+// Last modified: 2 June 2010
// ---------------------------------------------------------------------------
// Provides general utilities used by BamTools sub-tools.
// ***************************************************************************
#ifndef BAMTOOLS_UTILITIES_H
#define BAMTOOLS_UTILITIES_H
-#include <cstdlib>
-#include <iostream>
#include <string>
namespace BamTools {
-// Parses a REGION string, stores in (startChrom, startPos, stopChrom, stopPos) variables
-// Returns successful parse (true/false)
-static inline
-bool ParseRegionString(const std::string& regionString, std::string& startChrom, int& startPos, std::string& stopChrom, int& stopPos) {
-
- // shouldn't call this function with empty string but worth checking
- // checked first for clarity purposes later on, since we can assume at least some content in the string
- if ( regionString.empty() ) {
- std::cerr << "Empty REGION. Usual format (e.g. chr2:1000..2000). See README for more detailed uses." << std::endl;
- return false;
- }
+class BamReader;
- // non-empty string, look for a colom
- size_t foundFirstColon = regionString.find(':');
-
- // no colon found
- // going to use entire contents of requested chromosome
- // just store entire region string as startChrom name
- // use BamReader methods to check if its valid for current BAM file
- if ( foundFirstColon == std::string::npos ) {
- startChrom = regionString;
- startPos = -1; // ** not sure about these defaults (should stopChrom == startChrom if same?)
- stopChrom = "";
- stopPos = -1;
- return true;
- }
-
- // colon found, so we at least have some sort of startPos requested
- else {
-
- // store start chrom from beginning to first colon
- startChrom = regionString.substr(0,foundFirstColon);
-
- // look for ".." after the colon
- size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
-
- // no dots found
- // so we have a startPos but no range
- // store contents before colon as startChrom, after as startPos
- if ( foundRangeDots == std::string::npos ) {
- startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
- stopChrom = "";
- stopPos = -1;
- return true;
- }
-
- // ".." found, so we have some sort of range selected
- else {
-
- // store startPos between first colon and range dots ".."
- startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
-
- // look for second colon
- size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
-
- // no second colon found
- // so we have a "standard" chrom:start..stop input format (on single chrom)
- if ( foundSecondColon == std::string::npos ) {
- stopChrom = "";
- stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
- return true;
- }
-
- // second colon found
- // so we have a range requested across 2 chrom's
- else {
- stopChrom = regionString.substr(foundRangeDots+2, regionString.length()-foundSecondColon-1);
- stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
- return true;
- }
- }
- }
+class Utilities {
- // shouldn't get here - any code path that does?
- // if not, what does true/false really signify?
- return false;
-}
+ public:
+ // Parses a REGION string, stores in (startChrom, startPos, stopChrom, stopPos) variables
+ // Returns successful parse (true/false)
+ static bool ParseRegionString(const std::string& regionString,
+ std::string& startChrom,
+ int& startPos,
+ std::string& stopChrom,
+ int& stopPos);
+};
} // namespace BamTools