From: Derek Date: Fri, 23 Jul 2010 02:58:15 +0000 (-0400) Subject: Implemented basic alignment stats (count/%) for most of the alignment flags X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=511ee6cca3a0a8b8aca697bf575e8c41d07f8ab8;p=bamtools.git Implemented basic alignment stats (count/%) for most of the alignment flags --- diff --git a/bamtools_stats.cpp b/bamtools_stats.cpp index 5014f3e..5ab818f 100644 --- a/bamtools_stats.cpp +++ b/bamtools_stats.cpp @@ -1,23 +1,24 @@ // *************************************************************************** -// bamtools_stats.cpp (c) 2010 Derek Barnett, Erik Garrison +// bamtools_cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 1 June 2010 +// Last modified: 22 July 2010 // --------------------------------------------------------------------------- -// Prints general statistics for a single BAM file -// -// ** Expand to multiple?? -// +// Prints general alignment statistics for BAM file(s). // *************************************************************************** +#include +#include +#include #include +#include #include +#include #include "bamtools_stats.h" #include "bamtools_options.h" -#include "BamReader.h" - +#include "BamMultiReader.h" using namespace std; using namespace BamTools; @@ -27,36 +28,241 @@ using namespace BamTools; struct StatsTool::StatsSettings { // flags - bool HasInputBamFilename; + bool HasInput; + bool IsShowingInsertSizeSummary; // filenames - string InputBamFilename; + vector InputFiles; // constructor StatsSettings(void) - : HasInputBamFilename(false) - , InputBamFilename(Options::StandardIn()) + : HasInput(false) + , IsShowingInsertSizeSummary(false) { } }; +// --------------------------------------------- +// StatsToolPrivate implementation + +struct StatsTool::StatsToolPrivate { + + // ctor & dtor + public: + StatsToolPrivate(StatsTool::StatsSettings* _settings); + ~StatsToolPrivate(void); + + // 'public' interface + public: + bool Run(void); + + // internal methods + private: + bool CalculateMedian(vector& data, double& median); + void PrintStats(void); + void ProcessAlignment(const BamAlignment& al); + + // data members + private: + StatsTool::StatsSettings* settings; + unsigned int numReads; + unsigned int numPaired; + unsigned int numProperPair; + unsigned int numMapped; + unsigned int numBothMatesMapped; + unsigned int numForwardStrand; + unsigned int numReverseStrand; + unsigned int numFirstMate; + unsigned int numSecondMate; + unsigned int numSingletons; + unsigned int numFailedQC; + unsigned int numDuplicates; + vector insertSizes; +}; + +StatsTool::StatsToolPrivate::StatsToolPrivate(StatsTool::StatsSettings* _settings) + : settings(_settings) + , numReads(0) + , numPaired(0) + , numProperPair(0) + , numMapped(0) + , numBothMatesMapped(0) + , numForwardStrand(0) + , numReverseStrand(0) + , numFirstMate(0) + , numSecondMate(0) + , numSingletons(0) + , numFailedQC(0) + , numDuplicates(0) +{ + insertSizes.reserve(100000); +} + +StatsTool::StatsToolPrivate::~StatsToolPrivate(void) { } + +bool StatsTool::StatsToolPrivate::CalculateMedian(vector& data, double& median) { // median is double in case of even data size, need to return average of middle 2 elements + + // check that data exists + if ( data.empty() ) return false; + + size_t dataSize = data.size(); + size_t middleIndex = dataSize / 2; + + vector::iterator target = data.begin() + middleIndex; + nth_element(data.begin(), target, data.end()); + + // odd number of elements + if ( (dataSize % 2) != 0) { + median = (double)(*target); + return true; + } + + // even number of elements + else { + double rightTarget = (double)(*target); + vector::iterator leftTarget = target - 1; + nth_element(data.begin(), leftTarget, data.end()); + median = (double)((rightTarget+*leftTarget)/2.0); + return true; + } +} + +// print BAM file alignment stats +void StatsTool::StatsToolPrivate::PrintStats(void) { + + cout << endl; + cout << "**********************************************" << endl; + cout << "Stats for BAM file(s): " << endl; + cout << "**********************************************" << endl; + cout << endl; + cout << "Total reads: " << numReads << endl; + cout << "Mapped reads: " << numMapped << "\t(" << ((float)numMapped/numReads)*100 << "%)" << endl; + cout << "Forward strand: " << numForwardStrand << "\t(" << ((float)numForwardStrand/numReads)*100 << "%)" << endl; + cout << "Reverse strand: " << numReverseStrand << "\t(" << ((float)numReverseStrand/numReads)*100 << "%)" << endl; + cout << "Failed QC: " << numFailedQC << "\t(" << ((float)numFailedQC/numReads)*100 << "%)" << endl; + cout << "Duplicates: " << numDuplicates << "\t(" << ((float)numDuplicates/numReads)*100 << "%)" << endl; + cout << "Paired-end reads: " << numPaired << "\t(" << ((float)numPaired/numReads)*100 << "%)" << endl; + + if ( numPaired != 0 ) { + cout << "'Proper-pairs': " << numProperPair << "\t(" << ((float)numProperPair/numPaired)*100 << "%)" << endl; + cout << "Both pairs mapped: " << numBothMatesMapped << "\t(" << ((float)numBothMatesMapped/numPaired)*100 << "%)" << endl; + cout << "Read 1: " << numFirstMate << endl; + cout << "Read 2: " << numSecondMate << endl; + cout << "Singletons: " << numSingletons << "\t(" << ((float)numSingletons/numPaired)*100 << "%)" << endl; + } + + + if ( settings->IsShowingInsertSizeSummary ) { + + double avgInsertSize = 0.0; + if ( !insertSizes.empty() ) { + avgInsertSize = ( accumulate(insertSizes.begin(), insertSizes.end(), 0.0) / (double)insertSizes.size() ); + cout << "Average insert size (absolute value): " << avgInsertSize << endl; + } + + double medianInsertSize = 0.0; + if ( CalculateMedian(insertSizes, medianInsertSize) ) + cout << "Median insert size (absolute value): " << medianInsertSize << endl; + } + cout << endl; +} + +// use current input alignment to update BAM file alignment stats +void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) { + + // increment total alignment counter + ++numReads; + + // check the paired-independent flags + if ( al.IsDuplicate() ) ++numDuplicates; + if ( al.IsFailedQC() ) ++numFailedQC; + if ( al.IsMapped() ) ++numMapped; + + // check forward/reverse strand + if ( al.IsReverseStrand() ) + ++numReverseStrand; + else + ++numForwardStrand; + + // if alignment is paired-end + if ( al.IsPaired() ) { + + // increment PE counter + ++numPaired; + + // increment first mate/second mate counters + if ( al.IsFirstMate() ) ++numFirstMate; + if ( al.IsSecondMate() ) ++numSecondMate; + + // if alignment is mapped, check mate status + if ( al.IsMapped() ) { + // if mate mapped + if ( al.IsMateMapped() ) + ++numBothMatesMapped; + // else singleton + else + ++numSingletons; + } + + // check for explicit proper pair flag + if ( al.IsProperPair() ) ++numProperPair; + + // store insert size for first mate + if ( settings->IsShowingInsertSizeSummary && al.IsFirstMate() && (al.InsertSize != 0) ) { + int insertSize = abs(al.InsertSize); + insertSizes.push_back( insertSize ); + } + } +} + +bool StatsTool::StatsToolPrivate::Run() { + + // opens the BAM files without checking for indexes + BamMultiReader reader; + if ( !reader.Open(settings->InputFiles, false, true) ) { + cerr << "Could not open input BAM file(s)... quitting." << endl; + reader.Close(); + return false; + } + + // plow through file, keeping track of stats + BamAlignment al; + while ( reader.GetNextAlignmentCore(al) ) { + ProcessAlignment(al); + } + + // print stats + PrintStats(); + + // clean and exit + reader.Close(); + return true; +} + // --------------------------------------------- // StatsTool implementation StatsTool::StatsTool(void) : AbstractTool() , m_settings(new StatsSettings) + , m_impl(0) { // set program details - Options::SetProgramInfo("bamtools stats", "prints general stats for a BAM file", "[-in ]"); + Options::SetProgramInfo("bamtools stats", "prints general alignment statistics", "[-in -in ... ]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); + + OptionGroup* AdditionalOpts = Options::CreateOptionGroup("Additional Stats"); + Options::AddOption("-insert", "summarize insert size data", m_settings->IsShowingInsertSizeSummary, AdditionalOpts); } StatsTool::~StatsTool(void) { delete m_settings; m_settings = 0; + + delete m_impl; + m_impl = 0; } int StatsTool::Help(void) { @@ -69,7 +275,13 @@ int StatsTool::Run(int argc, char* argv[]) { // parse command line arguments Options::Parse(argc, argv, 1); - // calculate stats + // set to default input if none provided + if ( !m_settings->HasInput ) + m_settings->InputFiles.push_back(Options::StandardIn()); - return 0; + // run internal SortTool implementation, return success/fail + m_impl = new StatsToolPrivate(m_settings); + + if ( m_impl->Run() ) return 0; + else return 1; } diff --git a/bamtools_stats.h b/bamtools_stats.h index 9cbd9e3..16dc252 100644 --- a/bamtools_stats.h +++ b/bamtools_stats.h @@ -31,6 +31,9 @@ class StatsTool : public AbstractTool { private: struct StatsSettings; StatsSettings* m_settings; + + struct StatsToolPrivate; + StatsToolPrivate* m_impl; }; } // namespace BamTools