// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 7 June 2011
+// Last modified: 11 June 2011
// ---------------------------------------------------------------------------
-// Resolves paired-end reads (marking the IsProperPair flag as needed) in a
-// BAM file.
+// Resolves paired-end reads (marking the IsProperPair flag as needed).
// ***************************************************************************
#include "bamtools_resolve.h"
-
+#include "bamtools_version.h"
#include <api/BamReader.h>
#include <api/BamWriter.h>
-#include <api/SamConstants.h>
-#include <api/SamHeader.h>
#include <utils/bamtools_options.h>
#include <utils/bamtools_utilities.h>
using namespace BamTools;
#include <cassert>
#include <cctype>
#include <cstdlib>
+#include <fstream>
#include <iostream>
#include <map>
+#include <sstream>
#include <string>
#include <utility>
#include <vector>
using namespace std;
-enum { debug = 1 };
+// --------------------------------------------------------------------------
+// general ResolveTool constants
+// --------------------------------------------------------------------------
-// constants
static const int NUM_MODELS = 8;
static const string READ_GROUP_TAG = "RG";
static const double DEFAULT_CONFIDENCE_INTERVAL = 0.9973;
-static const double DEFAULT_MODEL_COUNT_THRESHOLD = 0.1;
-
-// -----------------------------------------------
+static const double DEFAULT_UNUSEDMODEL_THRESHOLD = 0.1;
+
+// --------------------------------------------------------------------------
+// stats file constants
+// --------------------------------------------------------------------------
+
+// basic char/string constants
+static const char COMMENT_CHAR = '#';
+static const char OPEN_BRACE_CHAR = '[';
+static const char CLOSE_BRACE_CHAR = ']';
+static const char EQUAL_CHAR = '=';
+static const char TAB_CHAR = '\t';
+
+static const string WHITESPACE_CHARS = " \t\n";
+static const string TRUE_KEYWORD = "true";
+static const string FALSE_KEYWORD = "false";
+
+// field counts
+static const size_t NUM_OPTIONS_FIELDS = 2;
+static const size_t NUM_READGROUPS_FIELDS = 7;
+
+// header strings
+static const string INPUT_TOKEN = "[Input]";
+static const string OPTIONS_TOKEN = "[Options]";
+static const string READGROUPS_TOKEN = "[ReadGroups]";
+
+// option keywords
+static const string OPTION_CONFIDENCEINTERVAL = "ConfidenceInterval";
+static const string OPTION_UNUSEDMODELTHRESHOLD = "UnusedModelThreshold";
+static const string OPTION_FORCEMARKREADGROUPS = "ForceMarkReadGroups";
+
+// --------------------------------------------------------------------------
// ModelType implementation
struct ModelType {
// data members
- unsigned char ID;
+ uint16_t ID;
vector<uint32_t> FragmentLengths;
// ctor
- ModelType(const unsigned char id)
+ ModelType(const uint16_t id)
: ID(id)
{
// preallocate space for 10K fragments per model type
size_t size(void) const { return FragmentLengths.size(); }
// constants
- static const unsigned char DUMMY_ID;
+ static const uint16_t DUMMY_ID;
};
-const unsigned char ModelType::DUMMY_ID = 100;
+const uint16_t ModelType::DUMMY_ID = 100;
bool operator>(const ModelType& lhs, const ModelType& rhs) {
return lhs.size() > rhs.size();
}
-unsigned char CalculateModelType(const BamAlignment& al) {
+uint16_t CalculateModelType(const BamAlignment& al) {
// localize alignment's mate positions & orientations for convenience
const int32_t m1_begin = ( al.IsFirstMate() ? al.Position : al.MatePosition );
return ModelType::DUMMY_ID;
}
-// ---------------------------------------------
+// --------------------------------------------------------------------------
// ReadGroupResolver implementation
struct ReadGroupResolver {
int32_t MinFragmentLength;
int32_t MedianFragmentLength;
int32_t MaxFragmentLength;
+ uint16_t TopModelId;
+ uint16_t NextTopModelId;
+ bool IsAmbiguous;
+ bool HasData;
vector<ModelType> Models;
- static double ConfidenceInterval;
-
// ctor
ReadGroupResolver(void);
bool IsValidOrientation(const BamAlignment& al) const;
// select 2 best models based on observed data
- void DetermineTopModels(void);
+ void DetermineTopModels(const string& readGroupName);
+ // static settings
+ static double ConfidenceInterval;
+ static double UnusedModelThreshold;
static void SetConfidenceInterval(const double& ci);
+ static void SetUnusedModelThreshold(const double& umt);
};
-double ReadGroupResolver::ConfidenceInterval = DEFAULT_CONFIDENCE_INTERVAL;
+double ReadGroupResolver::ConfidenceInterval = DEFAULT_CONFIDENCE_INTERVAL;
+double ReadGroupResolver::UnusedModelThreshold = DEFAULT_UNUSEDMODEL_THRESHOLD;
ReadGroupResolver::ReadGroupResolver(void)
: MinFragmentLength(0)
, MedianFragmentLength(0)
, MaxFragmentLength(0)
+ , TopModelId(ModelType::DUMMY_ID)
+ , NextTopModelId(ModelType::DUMMY_ID)
+ , IsAmbiguous(false)
+ , HasData(false)
{
// pre-allocate space for 8 models
Models.reserve(NUM_MODELS);
- for ( int i = 0; i < NUM_MODELS; ++i )
- Models.push_back( ModelType((unsigned char)(i+1)) );
+ for ( uint16_t i = 0; i < NUM_MODELS; ++i )
+ Models.push_back( ModelType(i+1) );
}
bool ReadGroupResolver::IsValidInsertSize(const BamAlignment& al) const {
}
bool ReadGroupResolver::IsValidOrientation(const BamAlignment& al) const {
- const unsigned char currentModel = CalculateModelType(al);
- return ( currentModel == Models[0].ID || currentModel == Models[1].ID );
+ const uint16_t currentModel = CalculateModelType(al);
+ return ( currentModel == TopModelId || currentModel == NextTopModelId );
}
-void ReadGroupResolver::DetermineTopModels(void) {
+void ReadGroupResolver::DetermineTopModels(const string& readGroupName) {
// sort models (most common -> least common)
sort( Models.begin(), Models.end(), std::greater<ModelType>() );
+ // store top 2 models for later
+ TopModelId = Models[0].ID;
+ NextTopModelId = Models[1].ID;
+
// make sure that the 2 most common models are some threshold more common
// than the remaining models
const unsigned int activeModelCountSum = Models[0].size() + Models[1].size();
+ if ( activeModelCountSum == 0 ) return; // skip if no data in this read group
const unsigned int unusedModelCountSum = Models[2].size() + Models[3].size() +
Models[4].size() + Models[5].size() +
- Models[6].size() + Models[7].size();
- if ( activeModelCountSum == 0 ) return;
+ Models[6].size() + Models[7].size();
const double unusedPercentage = (double)unusedModelCountSum / (double)activeModelCountSum;
- if ( unusedPercentage > DEFAULT_MODEL_COUNT_THRESHOLD ) {
- cerr << "ERROR: When determining whether to apply mate-pair or paired-end constraints, "
- << "an irregularity in the alignment model counts was discovered." << endl
- << endl;
- cerr << " Normal mate-pair data sets have the highest counts for alignment models: 4 & 5." << endl;
- cerr << " Normal paired-end data sets have the highest counts for alignment models: 2 & 6." << endl;
- cerr << " Normal solid-end data sets have the highest counts for alignment models: 1 & 8." << endl
- << endl;
- cerr << " We expect that the ratio of the 6 lowest counts to the 2 highest counts to be no larger than "
- << DEFAULT_MODEL_COUNT_THRESHOLD << ", but in this data set the ratio was " << unusedPercentage << endl
- << endl;
- for ( unsigned char i = 0; i < NUM_MODELS; ++i )
- cerr << "- alignment model " << Models[i].ID << " : " << Models[i].size() << " hits" << endl;
- exit(1);
+ if ( unusedPercentage > UnusedModelThreshold ) {
+ cerr << "WARNING: " << readGroupName << " does not have clearly defined 'top models'" << endl
+ << " The fraction of alignments in bottom 6 models (" << unusedPercentage
+ << ") exceeds threshold: " << UnusedModelThreshold << endl;
+ IsAmbiguous = true;
}
// emit a warning if the best alignment models are non-standard
- const bool isModel1Top = (Models[0].ID == 1) || (Models[1].ID == 1);
- const bool isModel2Top = (Models[0].ID == 2) || (Models[1].ID == 2);
- const bool isModel4Top = (Models[0].ID == 4) || (Models[1].ID == 4);
- const bool isModel5Top = (Models[0].ID == 5) || (Models[1].ID == 5);
- const bool isModel6Top = (Models[0].ID == 6) || (Models[1].ID == 6);
- const bool isModel8Top = (Models[0].ID == 8) || (Models[1].ID == 8);
+ const bool isModel1Top = (TopModelId == 1) || (NextTopModelId == 1);
+ const bool isModel2Top = (TopModelId == 2) || (NextTopModelId == 2);
+ const bool isModel4Top = (TopModelId == 4) || (NextTopModelId == 4);
+ const bool isModel5Top = (TopModelId == 5) || (NextTopModelId == 5);
+ const bool isModel6Top = (TopModelId == 6) || (NextTopModelId == 6);
+ const bool isModel8Top = (TopModelId == 8) || (NextTopModelId == 8);
bool isMatePair = ( isModel4Top && isModel5Top ? true : false );
bool isPairedEnd = ( isModel2Top && isModel6Top ? true : false );
bool isSolidPair = ( isModel1Top && isModel8Top ? true : false );
- if ( debug ) {
- if ( isMatePair ) cerr << "- resolving mate-pair alignments" << endl;
- else if ( isPairedEnd ) cerr << "- resolving paired-end alignments" << endl;
- else if ( isSolidPair ) cerr << "- resolving solid-pair alignments" << endl;
- else {
- cerr << "- WARNING: Found a non-standard alignment model configuration. "
- << "Using alignment models " << Models[0].ID << " & " << Models[1].ID << endl;
- }
+ if ( !isMatePair && !isPairedEnd && !isSolidPair ) {
+ cerr << "WARNING: Found a non-standard alignment model configuration. " << endl
+ << " Using alignment models " << TopModelId << " & " << NextTopModelId
+ << endl;
}
// store only the fragments from the best alignment models, then sort
sort ( fragments.begin(), fragments.end() );
// clear out Model fragment data, not needed anymore
- // keep sorted though, with IDs, we'll be coming back to that
- for ( int i = 0; i < NUM_MODELS; ++i )
- Models[i].clear();
+ Models.clear();
// determine min,median, & max fragment lengths for each read group
const double halfNonConfidenceInterval = (1.0 - ReadGroupResolver::ConfidenceInterval)/2.0;
const unsigned int numFragmentLengths = fragments.size();
if ( numFragmentLengths == 0 ) return;
- const unsigned int minIndex = (unsigned int)(numFragmentLengths * halfNonConfidenceInterval);
- MinFragmentLength = fragments[minIndex];
-
+ const unsigned int minIndex = (unsigned int)(numFragmentLengths * halfNonConfidenceInterval);
const unsigned int medianIndex = (unsigned int)(numFragmentLengths * 0.5);
- MedianFragmentLength = fragments[medianIndex];
+ const unsigned int maxIndex = (unsigned int)(numFragmentLengths * (1.0-halfNonConfidenceInterval));
- const unsigned int maxIndex = (unsigned int)(numFragmentLengths * (1.0-halfNonConfidenceInterval));
- MaxFragmentLength = fragments[maxIndex];
+ MinFragmentLength = fragments[minIndex];
+ MedianFragmentLength = fragments[medianIndex];
+ MaxFragmentLength = fragments[maxIndex];
+ HasData = true;
}
void ReadGroupResolver::SetConfidenceInterval(const double& ci) {
ConfidenceInterval = ci;
}
-// ---------------------------------------------
+void ReadGroupResolver::SetUnusedModelThreshold(const double& umt) {
+ UnusedModelThreshold = umt;
+}
+
+// --------------------------------------------------------------------------
// ResolveSettings implementation
struct ResolveTool::ResolveSettings {
// flags
- bool HasInputFile;
- bool HasOutputFile;
+ bool HasInputBamFile;
+ bool HasOutputBamFile;
+ bool IsForceCompression;
bool HasStatsFile;
+ bool IsMakeStats;
+ bool IsMarkPairs;
+ bool IsTwoPass;
bool HasConfidenceInterval;
- bool IsForceCompression;
+ bool HasUnusedModelThreshold;
// filenames
- string InputFilename;
- string OutputFilename;
+ string InputBamFilename;
+ string OutputBamFilename;
string StatsFilename;
// 'resolve options'
double ConfidenceInterval;
+ double UnusedModelThreshold;
+ bool HasForceMarkReadGroups;
// constructor
ResolveSettings(void)
- : HasInputFile(false)
- , HasOutputFile(false)
- , HasStatsFile(false)
+ : HasInputBamFile(false)
+ , HasOutputBamFile(false)
, IsForceCompression(false)
- , InputFilename(Options::StandardIn())
- , OutputFilename(Options::StandardOut())
+ , HasStatsFile(false)
+ , IsMakeStats(false)
+ , IsMarkPairs(false)
+ , IsTwoPass(false)
+ , HasConfidenceInterval(false)
+ , HasUnusedModelThreshold(false)
+ , InputBamFilename(Options::StandardIn())
+ , OutputBamFilename(Options::StandardOut())
, StatsFilename("")
, ConfidenceInterval(DEFAULT_CONFIDENCE_INTERVAL)
+ , UnusedModelThreshold(DEFAULT_UNUSEDMODEL_THRESHOLD)
+ , HasForceMarkReadGroups(false)
{ }
};
-// ---------------------------------------------
+// --------------------------------------------------------------------------
+// StatsFileReader implementation
+
+struct ResolveTool::StatsFileReader {
+
+ // ctor & dtor
+ public:
+ StatsFileReader(void) { }
+ ~StatsFileReader(void) { Close(); }
+
+ // main reader interface
+ public:
+ void Close(void);
+ bool Open(const std::string& filename);
+ bool Read(ResolveTool::ResolveSettings* settings,
+ map<string, ReadGroupResolver>& readGroups);
+
+ // internal methods
+ private:
+ bool IsComment(const string& line) const;
+ bool IsWhitespace(const string& line) const;
+ bool ParseInputLine(const string& line);
+ bool ParseOptionLine(const string& line, ResolveTool::ResolveSettings* settings);
+ bool ParseReadGroupLine(const string& line, map<string, ReadGroupResolver>& readGroups);
+ string SkipCommentsAndWhitespace(void);
+
+ // data members
+ private:
+ ifstream m_stream;
+
+ enum State { None = 0
+ , InInput
+ , InOptions
+ , InReadGroups };
+};
+
+void ResolveTool::StatsFileReader::Close(void) {
+ if ( m_stream.is_open() )
+ m_stream.close();
+}
+
+bool ResolveTool::StatsFileReader::IsComment(const string& line) const {
+ assert( !line.empty() );
+ return ( line.at(0) == COMMENT_CHAR );
+}
+
+bool ResolveTool::StatsFileReader::IsWhitespace(const string& line) const {
+ if ( line.empty() )
+ return true;
+ return ( isspace(line.at(0)) );
+}
+
+bool ResolveTool::StatsFileReader::Open(const string& filename) {
+
+ // make sure stream is fresh
+ Close();
+
+ // attempt to open filename, return status
+ m_stream.open(filename.c_str(), ifstream::in);
+ return m_stream.good();
+}
+
+bool ResolveTool::StatsFileReader::ParseInputLine(const string& /*line*/) {
+ // input lines are ignored (for now at least), tool will use input from command line
+ return true;
+}
+
+bool ResolveTool::StatsFileReader::ParseOptionLine(const string& line,
+ ResolveTool::ResolveSettings* settings)
+{
+ // split line into option, value
+ vector<string> fields = Utilities::Split(line, EQUAL_CHAR);
+ if ( fields.size() != NUM_OPTIONS_FIELDS )
+ return false;
+ const string& option = fields.at(0);
+ stringstream value(fields.at(1));
+
+ // -----------------------------------
+ // handle option based on keyword
+
+ // ConfidenceInterval
+ if ( option == OPTION_CONFIDENCEINTERVAL ) {
+ value >> settings->ConfidenceInterval;
+ settings->HasConfidenceInterval = true;
+ return true;
+ }
+
+ // UnusedModelThreshold
+ if ( option == OPTION_UNUSEDMODELTHRESHOLD ) {
+ value >> settings->UnusedModelThreshold;
+ settings->HasUnusedModelThreshold = true;
+ return true;
+ }
+
+ // ForceMarkReadGroups
+ if ( option == OPTION_FORCEMARKREADGROUPS ) {
+ value >> settings->HasForceMarkReadGroups;
+ return true;
+ }
+
+ // otherwise unknown option
+ cerr << "bamtools resolve ERROR - unrecognized option: " << option << " in stats file" << endl;
+ return false;
+}
+
+bool ResolveTool::StatsFileReader::ParseReadGroupLine(const string& line,
+ map<string, ReadGroupResolver>& readGroups)
+{
+ // split read group data in to fields
+ vector<string> fields = Utilities::Split(line, WHITESPACE_CHARS);
+ if ( fields.size() != NUM_READGROUPS_FIELDS ) return false;
+
+ // retrieve RG name
+ const string& name = fields.at(0);
+
+ // populate RG's 'resolver' data
+ ReadGroupResolver resolver;
+
+ stringstream dataStream;
+ dataStream.str(fields.at(1));
+ dataStream >> resolver.MedianFragmentLength;
+ dataStream.clear();
+
+ dataStream.str(fields.at(2));
+ dataStream >> resolver.MinFragmentLength;
+ dataStream.clear();
+
+ dataStream.str(fields.at(3));
+ dataStream >> resolver.MaxFragmentLength;
+ dataStream.clear();
+
+ dataStream.str(fields.at(4));
+ dataStream >> resolver.TopModelId;
+ dataStream.clear();
+
+ dataStream.str(fields.at(5));
+ dataStream >> resolver.NextTopModelId;
+ dataStream.clear();
+
+ resolver.IsAmbiguous = ( fields.at(6) == TRUE_KEYWORD );
+
+ // store RG entry and return success
+ readGroups.insert( make_pair<string, ReadGroupResolver>(name, resolver) );
+ return true;
+}
+
+bool ResolveTool::StatsFileReader::Read(ResolveTool::ResolveSettings* settings,
+ map<string, ReadGroupResolver>& readGroups)
+{
+ // up-front sanity checks
+ if ( !m_stream.is_open() || settings == 0 )
+ return false;
+
+ // clear out read group data
+ readGroups.clear();
+
+ // initialize state
+ State currentState = StatsFileReader::None;
+
+ // read stats file
+ string line = SkipCommentsAndWhitespace();
+ while ( !line.empty() ) {
+
+ bool foundError = false;
+
+ // switch state on keyword found
+ if ( Utilities::StartsWith(line, INPUT_TOKEN) )
+ currentState = StatsFileReader::InInput;
+ else if ( Utilities::StartsWith(line, OPTIONS_TOKEN) )
+ currentState = StatsFileReader::InOptions;
+ else if ( Utilities::StartsWith(line, READGROUPS_TOKEN) )
+ currentState = StatsFileReader::InReadGroups;
+
+ // otherwise parse data line, depending on state
+ else {
+ if ( currentState == StatsFileReader::InInput )
+ foundError = !ParseInputLine(line);
+ else if ( currentState == StatsFileReader::InOptions )
+ foundError = !ParseOptionLine(line, settings);
+ else if ( currentState == StatsFileReader::InReadGroups )
+ foundError = !ParseReadGroupLine(line, readGroups);
+ else
+ foundError = true;
+ }
+
+ // break out if error found
+ if ( foundError )
+ return false;
+
+ // get next line
+ line = SkipCommentsAndWhitespace();
+ }
+
+ // if here, return success
+ return true;
+}
+
+string ResolveTool::StatsFileReader::SkipCommentsAndWhitespace(void) {
+ string line;
+ do {
+ if ( m_stream.eof() )
+ return string();
+ getline(m_stream, line);
+ } while ( IsWhitespace(line) || IsComment(line) );
+ return line;
+}
+
+// --------------------------------------------------------------------------
+// StatsFileReader implementation
+
+struct ResolveTool::StatsFileWriter {
+
+ // ctor & dtor
+ public:
+ StatsFileWriter(void) { }
+ ~StatsFileWriter(void) { Close(); }
+
+ // main reader interface
+ public:
+ void Close(void);
+ bool Open(const std::string& filename);
+ bool Write(ResolveTool::ResolveSettings* settings,
+ const map<string, ReadGroupResolver>& readGroups);
+
+ // internal methods
+ private:
+ void WriteHeader(void);
+ void WriteInput(ResolveTool::ResolveSettings* settings);
+ void WriteOptions(ResolveTool::ResolveSettings* settings);
+ void WriteReadGroups(const map<string, ReadGroupResolver>& readGroups);
+
+ // data members
+ private:
+ ofstream m_stream;
+};
+
+void ResolveTool::StatsFileWriter::Close(void) {
+ if ( m_stream.is_open() )
+ m_stream.close();
+}
+
+bool ResolveTool::StatsFileWriter::Open(const string& filename) {
+
+ // make sure stream is fresh
+ Close();
+
+ // attempt to open filename, return status
+ m_stream.open(filename.c_str(), ofstream::out);
+ return m_stream.good();
+}
+
+bool ResolveTool::StatsFileWriter::Write(ResolveTool::ResolveSettings* settings,
+ const map<string, ReadGroupResolver>& readGroups)
+{
+ // return failure if file not open
+ if ( !m_stream.is_open() )
+ return false;
+
+ // write stats file elements
+ WriteHeader();
+ WriteInput(settings);
+ WriteOptions(settings);
+ WriteReadGroups(readGroups);
+
+ // return success
+ return true;
+}
+
+void ResolveTool::StatsFileWriter::WriteHeader(void) {
+
+ // stringify current bamtools version
+ stringstream versionStream("");
+ versionStream << "v"
+ << BAMTOOLS_VERSION_MAJOR << "."
+ << BAMTOOLS_VERSION_MINOR << "."
+ << BAMTOOLS_VERSION_BUILD;
+
+ // # bamtools resolve (vX.Y.Z)
+ // \n
+
+ m_stream << COMMENT_CHAR << " bamtools resolve (" << versionStream.str() << ")" << endl
+ << endl;
+}
+
+void ResolveTool::StatsFileWriter::WriteInput(ResolveTool::ResolveSettings* settings) {
+
+ // [Input]
+ // filename
+ // \n
+
+ m_stream << INPUT_TOKEN << endl
+ << settings->InputBamFilename << endl
+ << endl;
+}
+
+void ResolveTool::StatsFileWriter::WriteOptions(ResolveTool::ResolveSettings* settings) {
+
+ // [Options]
+ // ConfidenceInterval=<double>
+ // UnusedModelThreshold=<double>
+ // ForceMarkReadGroups=<true|false>
+ // \n
+
+ m_stream << OPTIONS_TOKEN << endl
+ << OPTION_CONFIDENCEINTERVAL << EQUAL_CHAR << settings->ConfidenceInterval << endl
+ << OPTION_UNUSEDMODELTHRESHOLD << EQUAL_CHAR << settings->UnusedModelThreshold << endl
+ << OPTION_FORCEMARKREADGROUPS << EQUAL_CHAR << boolalpha << settings->HasForceMarkReadGroups << endl
+ << endl;
+}
+
+void ResolveTool::StatsFileWriter::WriteReadGroups(const map<string, ReadGroupResolver>& readGroups) {
+
+ // [ReadGroups]
+ m_stream << READGROUPS_TOKEN << endl;
+
+ // iterate over read groups
+ map<string, ReadGroupResolver>::const_iterator rgIter = readGroups.begin();
+ map<string, ReadGroupResolver>::const_iterator rgEnd = readGroups.end();
+ for ( ; rgIter != rgEnd; ++rgIter ) {
+ const string& name = (*rgIter).first;
+ const ReadGroupResolver& resolver = (*rgIter).second;
+
+ // skip if read group has no data
+ if ( !resolver.HasData )
+ continue;
+
+ // write read group data
+ // <name> <medianFragmentLength> <minFragmentLength> <maxFragmentLength> <topModelId> <nextTopModelId> <isAmbiguous?>
+ m_stream << name << TAB_CHAR
+ << resolver.MedianFragmentLength << TAB_CHAR
+ << resolver.MinFragmentLength << TAB_CHAR
+ << resolver.MaxFragmentLength << TAB_CHAR
+ << resolver.TopModelId << TAB_CHAR
+ << resolver.NextTopModelId << TAB_CHAR
+ << boolalpha << resolver.IsAmbiguous
+ << endl;
+ }
+
+ // extra newline at end
+ m_stream << endl;
+}
+
+// --------------------------------------------------------------------------
// ResolveToolPrivate implementation
struct ResolveTool::ResolveToolPrivate {
// internal methods
private:
- bool CalculateStats(BamReader& reader);
+ bool CheckSettings(vector<string>& errors);
+ bool MakeStats(void);
void ParseHeader(const SamHeader& header);
- bool ParseStatsFile(void);
+ bool ReadStatsFile(void);
void ResolveAlignment(BamAlignment& al);
+ bool ResolvePairs(void);
+ bool WriteStatsFile(void);
// data members
private:
map<string, ReadGroupResolver> m_readGroups;
};
-bool ResolveTool::ResolveToolPrivate::CalculateStats(BamReader& reader) {
+bool ResolveTool::ResolveToolPrivate::CheckSettings(vector<string>& errors) {
+
+ // ensure clean slate
+ errors.clear();
+
+ // if MakeStats mode
+ if ( m_settings->IsMakeStats ) {
+
+ // ensure mutex mode
+ if ( m_settings->IsMarkPairs )
+ errors.push_back("Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
+ if ( m_settings->IsTwoPass )
+ errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
+
+ // error if output BAM options supplied
+ if ( m_settings->HasOutputBamFile )
+ errors.push_back("Cannot use -out (output BAM file) in -makeStats mode.");
+ if ( m_settings->IsForceCompression )
+ errors.push_back("Cannot use -forceCompression. No output BAM file is being generated.");
+
+ // make sure required stats file supplied
+ if ( !m_settings->HasStatsFile )
+ errors.push_back("Ouptut stats filename required for -makeStats mode. Please specify one using -stats option.");
+
+ // check for UseStats options
+ if ( m_settings->HasForceMarkReadGroups )
+ errors.push_back("Cannot use -forceMarkReadGroups. -markPairs options are DISABLED in -makeStats mode.");
+ }
+
+ // if UseStats mode
+ else if ( m_settings->IsMarkPairs ) {
+
+ // ensure mutex mode
+ if ( m_settings->IsMakeStats )
+ errors.push_back("Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
+ if ( m_settings->IsTwoPass )
+ errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
+
+ // make sure required stats file supplied
+ if ( !m_settings->HasStatsFile )
+ errors.push_back("Input stats filename required for -markPairs mode. Please specify one using -stats option.");
+
+ // check for MakeStats options
+ if ( m_settings->HasConfidenceInterval )
+ errors.push_back("Cannot use -ci. -makeStats options are DISABLED is -markPairs mode.");
+ }
+
+ // if TwoPass mode
+ else if ( m_settings->IsTwoPass ) {
+
+ // ensure mutex mode
+ if ( m_settings->IsMakeStats )
+ errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
+ if ( m_settings->IsMarkPairs )
+ errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
+
+ // make sure input is file not stdin
+ if ( !m_settings->HasInputBamFile || m_settings->InputBamFilename == Options::StandardIn() )
+ errors.push_back("Cannot run -twoPass mode with BAM data from stdin. Please specify existing file using -in option.");
+ }
+
+ // no mode selected
+ else
+ errors.push_back("No resolve mode specified. Please select ONE of the following: -makeStats, -markPairs, or -twoPass. See help for more info.");
+
+ // boundary checks on values
+ if ( m_settings->HasConfidenceInterval ) {
+ if ( m_settings->ConfidenceInterval < 0.0 || m_settings->ConfidenceInterval > 1.0 )
+ errors.push_back("Invalid confidence interval. Must be between 0 and 1");
+ }
+ if ( m_settings->HasUnusedModelThreshold ) {
+ if ( m_settings->UnusedModelThreshold < 0.0 || m_settings->UnusedModelThreshold > 1.0 )
+ errors.push_back("Invalid unused model threshold. Must be between 0 and 1");
+ }
+
+ // return success if no errors found
+ return ( errors.empty() );
+}
- // ensure that we get a fresh BamReader
- reader.Rewind();
+bool ResolveTool::ResolveToolPrivate::MakeStats(void) {
+
+ // pull resolver settings from command-line settings
+ ReadGroupResolver::SetConfidenceInterval(m_settings->ConfidenceInterval);
+ ReadGroupResolver::SetUnusedModelThreshold(m_settings->UnusedModelThreshold);
+
+ // open our BAM reader
+ BamReader reader;
+ if ( !reader.Open(m_settings->InputBamFilename) ) {
+ cerr << "bamtools resolve ERROR: could not open input BAM file: "
+ << m_settings->InputBamFilename << endl;
+ return false;
+ }
+
+ // retrieve header & reference dictionary info
+ const SamHeader& header = reader.GetHeader();
+
+ // parse BAM header for read groups
+ ParseHeader(header);
// read through BAM file
BamAlignment al;
continue;
// determine model type, skip if model unknown
- const unsigned char currentModelType = CalculateModelType(al);
+ const uint16_t currentModelType = CalculateModelType(al);
assert( currentModelType != ModelType::DUMMY_ID );
// flesh out the char data, so we can retrieve its read group ID
al.BuildCharData();
- // get read group from alignment
+ // get read group from alignment (OK if empty)
readGroup.clear();
al.GetTag(READ_GROUP_TAG, readGroup);
if ( rgIter == m_readGroups.end() ) {
cerr << "bamtools resolve ERROR - unable to calculate stats, unknown read group encountered: "
<< readGroup << endl;
+ reader.Close();
return false;
}
ReadGroupResolver& resolver = (*rgIter).second;
// iterate back through read groups
map<string, ReadGroupResolver>::iterator rgEnd = m_readGroups.end();
for ( rgIter = m_readGroups.begin(); rgIter != rgEnd; ++rgIter ) {
+ const string& name = (*rgIter).first;
ReadGroupResolver& resolver = (*rgIter).second;
// calculate acceptable orientation & insert sizes for this read group
- resolver.DetermineTopModels();
-
- if ( debug ) {
- cerr << "----------------------------------------" << endl
- << "ReadGroup: " << (*rgIter).first << endl
- << "----------------------------------------" << endl
- << "Min FL: " << resolver.MinFragmentLength << endl
- << "Med FL: " << resolver.MedianFragmentLength << endl
- << "Max FL: " << resolver.MaxFragmentLength << endl
- << endl;
- }
+ resolver.DetermineTopModels(name);
}
- // return reader to beginning & return success
- reader.Rewind();
+ // close reader & return success
+ reader.Close();
return true;
}
}
}
-bool ResolveTool::ResolveToolPrivate::ParseStatsFile(void) {
- cerr << "ResolveTool::ParseStatsFile() ERROR - not yet implemented!" << endl;
- return false;
+bool ResolveTool::ResolveToolPrivate::ReadStatsFile(void) {
+
+ // skip if no filename provided
+ if ( m_settings->StatsFilename.empty() )
+ return false;
+
+ // attempt to open stats file
+ ResolveTool::StatsFileReader statsReader;
+ if ( !statsReader.Open(m_settings->StatsFilename) ) {
+ cerr << "bamtools resolve ERROR - could not open stats file: "
+ << m_settings->StatsFilename << " for reading" << endl;
+ return false;
+ }
+
+ // attempt to read stats data
+ if ( !statsReader.Read(m_settings, m_readGroups) ) {
+ cerr << "bamtools resolve ERROR - could not parse stats file: "
+ << m_settings->StatsFilename << " for data" << endl;
+ return false;
+ }
+
+ // return success
+ return true;
}
void ResolveTool::ResolveToolPrivate::ResolveAlignment(BamAlignment& al) {
// quit check if alignment is not from paired-end read
if ( !al.IsPaired() ) return;
- // quit check if either alignment or mate are unmapped
+ // quit check if either alignment or its mate are unmapped
if ( !al.IsMapped() || !al.IsMateMapped() ) return;
// quit check if map quality is 0
if ( al.MapQuality == 0 ) return;
// get read group from alignment
- // empty string if not found, this is OK - we handle a default empty read group case
- string readGroup("");
- al.GetTag(READ_GROUP_TAG, readGroup);
+ // empty string if not found, this is OK - we handle empty read group case
+ string readGroupName("");
+ al.GetTag(READ_GROUP_TAG, readGroupName);
// look up read group's 'resolver'
- map<string, ReadGroupResolver>::iterator rgIter = m_readGroups.find(readGroup);
+ map<string, ReadGroupResolver>::iterator rgIter = m_readGroups.find(readGroupName);
if ( rgIter == m_readGroups.end() ) {
cerr << "bamtools resolve ERROR - read group found that was not in header: "
- << readGroup << endl;
+ << readGroupName << endl;
exit(1);
}
const ReadGroupResolver& resolver = (*rgIter).second;
- // quit check if pairs are not in proper orientation (tech-dependent, can differ for each RG)
+ // quit check if pairs are not in proper orientation (can differ for each RG)
if ( !resolver.IsValidOrientation(al) ) return;
- // quit check if pairs are not within "reasonable" distance (differs for each RG)
+ // quit check if pairs are not within "reasonable" distance (can differ for each RG)
if ( !resolver.IsValidInsertSize(al) ) return;
// if we get here, alignment is OK - set 'proper pair' flag
al.SetIsProperPair(true);
}
-bool ResolveTool::ResolveToolPrivate::Run(void) {
-
- ReadGroupResolver::SetConfidenceInterval(m_settings->ConfidenceInterval);
-
- // initialize read group map with default (empty name) read group
- m_readGroups.insert( make_pair<string, ReadGroupResolver>("", ReadGroupResolver()) );
+bool ResolveTool::ResolveToolPrivate::ResolvePairs(void) {
// open our BAM reader
BamReader reader;
- if ( !reader.Open(m_settings->InputFilename) ) {
+ if ( !reader.Open(m_settings->InputBamFilename) ) {
cerr << "bamtools resolve ERROR: could not open input BAM file: "
- << m_settings->InputFilename << endl;
+ << m_settings->InputBamFilename << endl;
return false;
}
const SamHeader& header = reader.GetHeader();
const RefVector& references = reader.GetReferenceData();
- // parse BAM header for read groups
- ParseHeader(header);
-
- // if existing stats file provided, parse for fragment lengths
- if ( m_settings->HasStatsFile ) {
- if ( !ParseStatsFile() ) {
- cerr << "bamtools resolve ERROR - could not parse stats file" << endl;
- reader.Close();
- return false;
- }
- }
- // otherwise calculate stats from BAM alignments
- else {
- if ( !CalculateStats(reader) ) {
- cerr << "bamtools resolve ERROR - could not calculate stats from BAM file" << endl;
- reader.Close();
- return false;
- }
- }
-
// determine compression mode for BamWriter
- bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
+ bool writeUncompressed = ( m_settings->OutputBamFilename == Options::StandardOut() &&
!m_settings->IsForceCompression );
BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
// open BamWriter
BamWriter writer;
writer.SetCompressionMode(compressionMode);
- if ( !writer.Open(m_settings->OutputFilename, header, references) ) {
+ if ( !writer.Open(m_settings->OutputBamFilename, header, references) ) {
cerr << "bamtools resolve ERROR: could not open "
- << m_settings->OutputFilename << " for writing." << endl;
+ << m_settings->OutputBamFilename << " for writing." << endl;
reader.Close();
return false;
}
return true;
}
-// ---------------------------------------------
+bool ResolveTool::ResolveToolPrivate::Run(void) {
+
+ // verify that command line settings are acceptable
+ vector<string> errors;
+ if ( !CheckSettings(errors) ) {
+ cerr << "bamtools resolve ERROR - invalid settings: " << endl;
+ vector<string>::const_iterator errorIter = errors.begin();
+ vector<string>::const_iterator errorEnd = errors.end();
+ for ( ; errorIter != errorEnd; ++errorIter )
+ cerr << (*errorIter) << endl;
+ return false;
+ }
+
+ // initialize read group map with default (empty name) read group
+ m_readGroups.insert( make_pair<string, ReadGroupResolver>("", ReadGroupResolver()) );
+
+ // -makeStats mode
+ if ( m_settings->IsMakeStats ) {
+
+ // generate stats data
+ if ( !MakeStats() ) {
+ cerr << "bamtools resolve ERROR - could not generate stats" << endl;
+ return false;
+ }
+
+ // write stats to file
+ if ( !WriteStatsFile() ) {
+ cerr << "bamtools resolve ERROR - could not write stats file: "
+ << m_settings->StatsFilename << endl;
+ return false;
+ }
+ }
+
+ // -markPairs mode
+ else if ( m_settings->IsMarkPairs ) {
+
+ // read stats from file
+ if ( !ReadStatsFile() ) {
+ cerr << "bamtools resolve ERROR - could not read stats file: "
+ << m_settings->StatsFilename << endl;
+ return false;
+ }
+
+ // do paired-end resolution
+ if ( !ResolvePairs() ) {
+ cerr << "bamtools resolve ERROR - could not resolve pairs" << endl;
+ return false;
+ }
+ }
+
+ // -twoPass mode
+ else {
+
+ // generate stats data
+ if ( !MakeStats() ) {
+ cerr << "bamtools resolve ERROR - could not generate stats" << endl;
+ return false;
+ }
+
+ // if stats file requested
+ if ( m_settings->HasStatsFile ) {
+
+ // write stats to file
+ // emit warning if write fails, but paired-end resolution should be allowed to proceed
+ if ( !WriteStatsFile() )
+ cerr << "bamtools resolve WARNING - could not write stats file: "
+ << m_settings->StatsFilename << endl;
+ }
+
+ // do paired-end resolution
+ if ( !ResolvePairs() ) {
+ cerr << "bamtools resolve ERROR - could not resolve pairs" << endl;
+ return false;
+ }
+ }
+
+ // return success
+ return true;
+}
+
+bool ResolveTool::ResolveToolPrivate::WriteStatsFile(void) {
+
+ // skip if no filename provided
+ if ( m_settings->StatsFilename.empty() )
+ return false;
+
+ // attempt to open stats file
+ ResolveTool::StatsFileWriter statsWriter;
+ if ( !statsWriter.Open(m_settings->StatsFilename) ) {
+ cerr << "bamtools resolve ERROR - could not open stats file: "
+ << m_settings->StatsFilename << " for writing" << endl;
+ return false;
+ }
+
+ // attempt to write stats data
+ if ( !statsWriter.Write(m_settings, m_readGroups) ) {
+ cerr << "bamtools resolve ERROR - could not write stats file: "
+ << m_settings->StatsFilename << " for data" << endl;
+ return false;
+ }
+
+ // return success
+ return true;
+}
+
+// --------------------------------------------------------------------------
// ResolveTool implementation
ResolveTool::ResolveTool(void)
, m_settings(new ResolveSettings)
, m_impl(0)
{
+ // set description texts
+ const string programDescription = "resolves paired-end reads (marking the IsProperPair flag as needed)";
+ const string programUsage = "<mode> [options] [-in <filename>] [-out <filename> | [-forceCompression] ] [-stats <filename>]";
+ const string inputBamDescription = "the input BAM file(s)";
+ const string outputBamDescription = "the output BAM file";
+ const string statsFileDescription = "input/output stats file, depending on selected mode (see below). "
+ "This file is human-readable, storing fragment length data generated per read group, as well as "
+ "the options used to configure the -makeStats mode";
+ const string forceCompressionDescription = "if results are sent to stdout (like when piping to another tool), "
+ "default behavior is to leave output uncompressed."
+ "Use this flag to override and force compression. This feature is disabled in -makeStats mode.";
+ const string makeStatsDescription = "generates a fragment-length stats file from the input BAM. "
+ "Data is written to file specified using the -stats option. "
+ "MarkPairs Mode Settings are DISABLED.";
+ const string markPairsDescription = "generates an output BAM with alignments marked with proper-pair status. "
+ "Stats data is read from file specified using the -stats option. "
+ "MakeStats Mode Settings are DISABLED";
+ const string twoPassDescription = "combines the -makeStats & -markPairs modes into a single command. "
+ "However, due to the two-pass nature of paired-end resolution, piping BAM data via stdin is DISABLED. "
+ "You must supply an explicit input BAM file. Output BAM may be piped to stdout, however, if desired. "
+ "All MakeStats & MarkPairs Mode Settings are available. "
+ "The intermediate stats file is not necessary, but if the -stats options is used, then one will be generated. "
+ "You may find this useful for documentation purposes.";
+ const string confidenceIntervalDescription = "confidence interval. Set min/max fragment lengths such that we capture "
+ "this fraction of pairs";
+ const string unusedModelThresholdDescription = "unused model threshold. The resolve tool considers 8 possible orientation models "
+ "for pairs. The top 2 are selected for later use when actually marking alignments. This value determines the "
+ "cutoff for marking a read group as ambiguous. Meaning that if the ratio of the number of alignments from bottom 6 models "
+ "to the top 2 is greater than this threshold, then the read group is flagged as ambiguous. By default, NO alignments "
+ "from ambiguous read groups will be marked as proper pairs. You may override this behavior with the -force option "
+ "in -markPairs mode";
+ const string forceMarkDescription = "forces all read groups to be marked according to their top 2 'orientation models'. "
+ "When generating stats, the 2 (out of 8 possible) models with the most observations are chosen as the top models for each read group. "
+ "If the remaining 6 models account for more than some threshold ([default=10%], see -umt), then the read group is marked as ambiguous. "
+ "The default behavior is that for an ambiguous read group, NONE of its alignments are marked as proper-pairs. "
+ "By setting this option, a read group's ambiguity flag will be ignored, and all of its alignments will be compared to the top 2 models.";
+
// set program details
- Options::SetProgramInfo("bamtools resolve", "resolves paired-end reads (marking the IsProperPair flag as needed)", "[-in <filename>] [-out <filename> | [-forceCompression] ] ");
+ Options::SetProgramInfo("bamtools resolve", programDescription, programUsage);
- // set up options
+ // set up I/O options
OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
- Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFile, m_settings->InputFilename, IO_Opts, Options::StandardIn());
- Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputFile, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
-// Options::AddValueOption("-stats", "STATS filename", "alignment stats file", "", m_settings->HasStatsFile, m_settings->StatsFilename, IO_Opts, "");
- Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression",
+ Options::AddValueOption("-in", "BAM filename", inputBamDescription, "",
+ m_settings->HasInputBamFile, m_settings->InputBamFilename,
+ IO_Opts, Options::StandardIn());
+ Options::AddValueOption("-out", "BAM filename", outputBamDescription, "",
+ m_settings->HasOutputBamFile, m_settings->OutputBamFilename,
+ IO_Opts, Options::StandardOut());
+ Options::AddValueOption("-stats", "STATS filename", statsFileDescription, "",
+ m_settings->HasStatsFile, m_settings->StatsFilename, IO_Opts);
+ Options::AddOption("-forceCompression", forceCompressionDescription,
m_settings->IsForceCompression, IO_Opts);
- OptionGroup* ResolveOpts = Options::CreateOptionGroup("Resolve Settings");
- Options::AddValueOption("-ci", "double", "confidence interval",
- "", m_settings->HasConfidenceInterval, m_settings->ConfidenceInterval, ResolveOpts);
+ OptionGroup* ModeOpts = Options::CreateOptionGroup("Resolve Modes (must select ONE of the following)");
+ Options::AddOption("-makeStats", makeStatsDescription, m_settings->IsMakeStats, ModeOpts);
+ Options::AddOption("-markPairs", markPairsDescription, m_settings->IsMarkPairs, ModeOpts);
+ Options::AddOption("-twoPass", twoPassDescription, m_settings->IsTwoPass, ModeOpts);
+
+ OptionGroup* MakeStatsOpts = Options::CreateOptionGroup("MakeStats Mode Options (disabled in -markPairs mode)");
+ Options::AddValueOption("-ci", "double", confidenceIntervalDescription, "",
+ m_settings->HasConfidenceInterval, m_settings->ConfidenceInterval, MakeStatsOpts);
+ Options::AddValueOption("-umt", "double", unusedModelThresholdDescription, "",
+ m_settings->HasUnusedModelThreshold, m_settings->UnusedModelThreshold, MakeStatsOpts);
+
+ OptionGroup* MarkPairsOpts = Options::CreateOptionGroup("MarkPairs Mode Options (disabled in -makeStats mode)");
+ Options::AddOption("-force", forceMarkDescription, m_settings->HasForceMarkReadGroups, MarkPairsOpts);
}
ResolveTool::~ResolveTool(void) {