1 // ***************************************************************************
2 // bamtools_cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 10 December 2012
6 // ---------------------------------------------------------------------------
7 // Prints general alignment statistics for BAM file(s).
8 // ***************************************************************************
10 #include "bamtools_stats.h"
12 #include <api/BamMultiReader.h>
13 #include <utils/bamtools_options.h>
14 using namespace BamTools;
26 // ---------------------------------------------
27 // StatsSettings implementation
29 struct StatsTool::StatsSettings {
33 bool HasInputFilelist;
34 bool IsShowingInsertSizeSummary;
37 vector<string> InputFiles;
43 , HasInputFilelist(false)
44 , IsShowingInsertSizeSummary(false)
48 // ---------------------------------------------
49 // StatsToolPrivate implementation
51 struct StatsTool::StatsToolPrivate {
55 StatsToolPrivate(StatsTool::StatsSettings* _settings);
56 ~StatsToolPrivate(void) { }
64 bool CalculateMedian(vector<int>& data, double& median);
65 void PrintStats(void);
66 void ProcessAlignment(const BamAlignment& al);
70 StatsTool::StatsSettings* m_settings;
71 unsigned int m_numReads;
72 unsigned int m_numPaired;
73 unsigned int m_numProperPair;
74 unsigned int m_numMapped;
75 unsigned int m_numBothMatesMapped;
76 unsigned int m_numForwardStrand;
77 unsigned int m_numReverseStrand;
78 unsigned int m_numFirstMate;
79 unsigned int m_numSecondMate;
80 unsigned int m_numSingletons;
81 unsigned int m_numFailedQC;
82 unsigned int m_numDuplicates;
83 vector<int> m_insertSizes;
86 StatsTool::StatsToolPrivate::StatsToolPrivate(StatsTool::StatsSettings* settings)
87 : m_settings(settings)
92 , m_numBothMatesMapped(0)
93 , m_numForwardStrand(0)
94 , m_numReverseStrand(0)
101 m_insertSizes.reserve(100000);
104 // median is of type double because in the case of even number of data elements,
105 // we need to return the average of middle 2 elements
106 bool StatsTool::StatsToolPrivate::CalculateMedian(vector<int>& data, double& median) {
108 // skip if data empty
112 // find middle element
113 size_t middleIndex = data.size() / 2;
114 vector<int>::iterator target = data.begin() + middleIndex;
115 nth_element(data.begin(), target, data.end());
117 // odd number of elements
118 if ( (data.size() % 2) != 0) {
119 median = (double)(*target);
123 // even number of elements
125 double rightTarget = (double)(*target);
126 vector<int>::iterator leftTarget = target - 1;
127 nth_element(data.begin(), leftTarget, data.end());
128 median = (double)((rightTarget+*leftTarget)/2.0);
133 // print BAM file alignment stats
134 void StatsTool::StatsToolPrivate::PrintStats(void) {
137 cout << "**********************************************" << endl;
138 cout << "Stats for BAM file(s): " << endl;
139 cout << "**********************************************" << endl;
141 cout << "Total reads: " << m_numReads << endl;
142 cout << "Mapped reads: " << m_numMapped << "\t(" << ((float)m_numMapped/m_numReads)*100 << "%)" << endl;
143 cout << "Forward strand: " << m_numForwardStrand << "\t(" << ((float)m_numForwardStrand/m_numReads)*100 << "%)" << endl;
144 cout << "Reverse strand: " << m_numReverseStrand << "\t(" << ((float)m_numReverseStrand/m_numReads)*100 << "%)" << endl;
145 cout << "Failed QC: " << m_numFailedQC << "\t(" << ((float)m_numFailedQC/m_numReads)*100 << "%)" << endl;
146 cout << "Duplicates: " << m_numDuplicates << "\t(" << ((float)m_numDuplicates/m_numReads)*100 << "%)" << endl;
147 cout << "Paired-end reads: " << m_numPaired << "\t(" << ((float)m_numPaired/m_numReads)*100 << "%)" << endl;
149 if ( m_numPaired != 0 ) {
150 cout << "'Proper-pairs': " << m_numProperPair << "\t(" << ((float)m_numProperPair/m_numPaired)*100 << "%)" << endl;
151 cout << "Both pairs mapped: " << m_numBothMatesMapped << "\t(" << ((float)m_numBothMatesMapped/m_numPaired)*100 << "%)" << endl;
152 cout << "Read 1: " << m_numFirstMate << endl;
153 cout << "Read 2: " << m_numSecondMate << endl;
154 cout << "Singletons: " << m_numSingletons << "\t(" << ((float)m_numSingletons/m_numPaired)*100 << "%)" << endl;
157 if ( m_settings->IsShowingInsertSizeSummary ) {
159 double avgInsertSize = 0.0;
160 if ( !m_insertSizes.empty() ) {
161 avgInsertSize = ( accumulate(m_insertSizes.begin(), m_insertSizes.end(), 0.0) / (double)m_insertSizes.size() );
162 cout << "Average insert size (absolute value): " << avgInsertSize << endl;
165 double medianInsertSize = 0.0;
166 if ( CalculateMedian(m_insertSizes, medianInsertSize) )
167 cout << "Median insert size (absolute value): " << medianInsertSize << endl;
172 // use current input alignment to update BAM file alignment stats
173 void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) {
175 // increment total alignment counter
178 // incrememt counters for pairing-independent flags
179 if ( al.IsDuplicate() ) ++m_numDuplicates;
180 if ( al.IsFailedQC() ) ++m_numFailedQC;
181 if ( al.IsMapped() ) ++m_numMapped;
183 // increment strand counters
184 if ( al.IsReverseStrand() )
185 ++m_numReverseStrand;
187 ++m_numForwardStrand;
189 // if alignment is paired-end
190 if ( al.IsPaired() ) {
192 // increment PE counter
195 // increment first mate/second mate counters
196 if ( al.IsFirstMate() ) ++m_numFirstMate;
197 if ( al.IsSecondMate() ) ++m_numSecondMate;
199 // if alignment is mapped, check mate status
200 if ( al.IsMapped() ) {
202 if ( al.IsMateMapped() )
203 ++m_numBothMatesMapped;
209 // check for explicit proper pair flag
210 if ( al.IsProperPair() )
213 // store insert size for first mate
214 if ( m_settings->IsShowingInsertSizeSummary && al.IsFirstMate() && (al.InsertSize != 0) ) {
215 int insertSize = abs(al.InsertSize);
216 m_insertSizes.push_back( insertSize );
221 bool StatsTool::StatsToolPrivate::Run() {
223 // set to default input if none provided
224 if ( !m_settings->HasInput && !m_settings->HasInputFilelist )
225 m_settings->InputFiles.push_back(Options::StandardIn());
227 // add files in the filelist to the input file list
228 if ( m_settings->HasInputFilelist ) {
230 ifstream filelist(m_settings->InputFilelist.c_str(), ios::in);
231 if ( !filelist.is_open() ) {
232 cerr << "bamtools stats ERROR: could not open input BAM file list... Aborting." << endl;
237 while ( getline(filelist, line) )
238 m_settings->InputFiles.push_back(line);
241 // open the BAM files
242 BamMultiReader reader;
243 if ( !reader.Open(m_settings->InputFiles) ) {
244 cerr << "bamtools stats ERROR: could not open input BAM file(s)... Aborting." << endl;
249 // plow through alignments, keeping track of stats
251 while ( reader.GetNextAlignmentCore(al) )
252 ProcessAlignment(al);
255 // print stats & exit
260 // ---------------------------------------------
261 // StatsTool implementation
263 StatsTool::StatsTool(void)
265 , m_settings(new StatsSettings)
268 // set program details
269 Options::SetProgramInfo("bamtools stats", "prints general alignment statistics", "[-in <filename> -in <filename> ... | -list <filelist>] [statsOptions]");
272 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
273 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
274 Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts);
276 OptionGroup* AdditionalOpts = Options::CreateOptionGroup("Additional Stats");
277 Options::AddOption("-insert", "summarize insert size data", m_settings->IsShowingInsertSizeSummary, AdditionalOpts);
280 StatsTool::~StatsTool(void) {
289 int StatsTool::Help(void) {
290 Options::DisplayHelp();
294 int StatsTool::Run(int argc, char* argv[]) {
296 // parse command line arguments
297 Options::Parse(argc, argv, 1);
299 // initialize StatsTool with settings
300 m_impl = new StatsToolPrivate(m_settings);
302 // run StatsTool, return success/fail