1 // ***************************************************************************
2 // bamtools_cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 21 March 2011
7 // ---------------------------------------------------------------------------
8 // Prints general alignment statistics for BAM file(s).
9 // ***************************************************************************
11 #include "bamtools_stats.h"
13 #include <api/BamMultiReader.h>
14 #include <utils/bamtools_options.h>
15 using namespace BamTools;
26 // ---------------------------------------------
27 // StatsSettings implementation
29 struct StatsTool::StatsSettings {
33 bool IsShowingInsertSizeSummary;
36 vector<string> InputFiles;
41 , IsShowingInsertSizeSummary(false)
45 // ---------------------------------------------
46 // StatsToolPrivate implementation
48 struct StatsTool::StatsToolPrivate {
52 StatsToolPrivate(StatsTool::StatsSettings* _settings);
53 ~StatsToolPrivate(void);
61 bool CalculateMedian(vector<int>& data, double& median);
62 void PrintStats(void);
63 void ProcessAlignment(const BamAlignment& al);
67 StatsTool::StatsSettings* settings;
68 unsigned int numReads;
69 unsigned int numPaired;
70 unsigned int numProperPair;
71 unsigned int numMapped;
72 unsigned int numBothMatesMapped;
73 unsigned int numForwardStrand;
74 unsigned int numReverseStrand;
75 unsigned int numFirstMate;
76 unsigned int numSecondMate;
77 unsigned int numSingletons;
78 unsigned int numFailedQC;
79 unsigned int numDuplicates;
80 vector<int> insertSizes;
83 StatsTool::StatsToolPrivate::StatsToolPrivate(StatsTool::StatsSettings* _settings)
89 , numBothMatesMapped(0)
98 insertSizes.reserve(100000);
101 StatsTool::StatsToolPrivate::~StatsToolPrivate(void) { }
103 // median is of type double because in the case of even number of data elements,
104 // we need to return the average of middle 2 elements
105 bool StatsTool::StatsToolPrivate::CalculateMedian(vector<int>& data, double& median) {
107 // skip if data empty
108 if ( data.empty() ) return false;
110 // find middle element
111 size_t middleIndex = data.size() / 2;
112 vector<int>::iterator target = data.begin() + middleIndex;
113 nth_element(data.begin(), target, data.end());
115 // odd number of elements
116 if ( (data.size() % 2) != 0) {
117 median = (double)(*target);
121 // even number of elements
123 double rightTarget = (double)(*target);
124 vector<int>::iterator leftTarget = target - 1;
125 nth_element(data.begin(), leftTarget, data.end());
126 median = (double)((rightTarget+*leftTarget)/2.0);
131 // print BAM file alignment stats
132 void StatsTool::StatsToolPrivate::PrintStats(void) {
135 cout << "**********************************************" << endl;
136 cout << "Stats for BAM file(s): " << endl;
137 cout << "**********************************************" << endl;
139 cout << "Total reads: " << numReads << endl;
140 cout << "Mapped reads: " << numMapped << "\t(" << ((float)numMapped/numReads)*100 << "%)" << endl;
141 cout << "Forward strand: " << numForwardStrand << "\t(" << ((float)numForwardStrand/numReads)*100 << "%)" << endl;
142 cout << "Reverse strand: " << numReverseStrand << "\t(" << ((float)numReverseStrand/numReads)*100 << "%)" << endl;
143 cout << "Failed QC: " << numFailedQC << "\t(" << ((float)numFailedQC/numReads)*100 << "%)" << endl;
144 cout << "Duplicates: " << numDuplicates << "\t(" << ((float)numDuplicates/numReads)*100 << "%)" << endl;
145 cout << "Paired-end reads: " << numPaired << "\t(" << ((float)numPaired/numReads)*100 << "%)" << endl;
147 if ( numPaired != 0 ) {
148 cout << "'Proper-pairs': " << numProperPair << "\t(" << ((float)numProperPair/numPaired)*100 << "%)" << endl;
149 cout << "Both pairs mapped: " << numBothMatesMapped << "\t(" << ((float)numBothMatesMapped/numPaired)*100 << "%)" << endl;
150 cout << "Read 1: " << numFirstMate << endl;
151 cout << "Read 2: " << numSecondMate << endl;
152 cout << "Singletons: " << numSingletons << "\t(" << ((float)numSingletons/numPaired)*100 << "%)" << endl;
155 if ( settings->IsShowingInsertSizeSummary ) {
157 double avgInsertSize = 0.0;
158 if ( !insertSizes.empty() ) {
159 avgInsertSize = ( accumulate(insertSizes.begin(), insertSizes.end(), 0.0) / (double)insertSizes.size() );
160 cout << "Average insert size (absolute value): " << avgInsertSize << endl;
163 double medianInsertSize = 0.0;
164 if ( CalculateMedian(insertSizes, medianInsertSize) )
165 cout << "Median insert size (absolute value): " << medianInsertSize << endl;
170 // use current input alignment to update BAM file alignment stats
171 void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) {
173 // increment total alignment counter
176 // check the paired-independent flags
177 if ( al.IsDuplicate() ) ++numDuplicates;
178 if ( al.IsFailedQC() ) ++numFailedQC;
179 if ( al.IsMapped() ) ++numMapped;
181 // check forward/reverse strand
182 if ( al.IsReverseStrand() )
187 // if alignment is paired-end
188 if ( al.IsPaired() ) {
190 // increment PE counter
193 // increment first mate/second mate counters
194 if ( al.IsFirstMate() ) ++numFirstMate;
195 if ( al.IsSecondMate() ) ++numSecondMate;
197 // if alignment is mapped, check mate status
198 if ( al.IsMapped() ) {
200 if ( al.IsMateMapped() )
201 ++numBothMatesMapped;
207 // check for explicit proper pair flag
208 if ( al.IsProperPair() ) ++numProperPair;
210 // store insert size for first mate
211 if ( settings->IsShowingInsertSizeSummary && al.IsFirstMate() && (al.InsertSize != 0) ) {
212 int insertSize = abs(al.InsertSize);
213 insertSizes.push_back( insertSize );
218 bool StatsTool::StatsToolPrivate::Run() {
220 // open the BAM files
221 BamMultiReader reader;
222 if ( !reader.Open(settings->InputFiles) ) {
223 cerr << "bamtools stats ERROR: could not open input BAM file(s)... Aborting." << endl;
228 // plow through alignments, keeping track of stats
230 while ( reader.GetNextAlignmentCore(al) )
231 ProcessAlignment(al);
234 // print stats & exit
239 // ---------------------------------------------
240 // StatsTool implementation
242 StatsTool::StatsTool(void)
244 , m_settings(new StatsSettings)
247 // set program details
248 Options::SetProgramInfo("bamtools stats", "prints general alignment statistics", "[-in <filename> -in <filename> ...] [statsOptions]");
251 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
252 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
254 OptionGroup* AdditionalOpts = Options::CreateOptionGroup("Additional Stats");
255 Options::AddOption("-insert", "summarize insert size data", m_settings->IsShowingInsertSizeSummary, AdditionalOpts);
258 StatsTool::~StatsTool(void) {
266 int StatsTool::Help(void) {
267 Options::DisplayHelp();
271 int StatsTool::Run(int argc, char* argv[]) {
273 // parse command line arguments
274 Options::Parse(argc, argv, 1);
276 // set to default input if none provided
277 if ( !m_settings->HasInput )
278 m_settings->InputFiles.push_back(Options::StandardIn());
280 // run internal StatsTool implementation, return success/fail
281 m_impl = new StatsToolPrivate(m_settings);
283 if ( m_impl->Run() ) return 0;