1 // ***************************************************************************
2 // bamtools_resolve.cpp (c) 2011
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 7 June 2011
7 // ---------------------------------------------------------------------------
8 // Resolves paired-end reads (marking the IsProperPair flag as needed) in a
10 // ***************************************************************************
12 #include "bamtools_resolve.h"
14 #include <api/BamReader.h>
15 #include <api/BamWriter.h>
16 #include <api/SamConstants.h>
17 #include <api/SamHeader.h>
18 #include <utils/bamtools_options.h>
19 #include <utils/bamtools_utilities.h>
20 using namespace BamTools;
36 static const int NUM_MODELS = 8;
37 static const string READ_GROUP_TAG = "RG";
38 static const double DEFAULT_CONFIDENCE_INTERVAL = 0.9973;
39 static const double DEFAULT_MODEL_COUNT_THRESHOLD = 0.1;
41 // -----------------------------------------------
42 // ModelType implementation
48 vector<uint32_t> FragmentLengths;
51 ModelType(const unsigned char id)
54 // preallocate space for 10K fragments per model type
55 FragmentLengths.reserve(10000);
58 // convenience access to internal fragment lengths vector
59 vector<uint32_t>::iterator begin(void) { return FragmentLengths.begin(); }
60 vector<uint32_t>::const_iterator begin(void) const { return FragmentLengths.begin(); }
61 void clear(void) { FragmentLengths.clear(); }
62 vector<uint32_t>::iterator end(void) { return FragmentLengths.end(); }
63 vector<uint32_t>::const_iterator end(void) const { return FragmentLengths.end(); }
64 void push_back(const uint32_t& x) { FragmentLengths.push_back(x); }
65 size_t size(void) const { return FragmentLengths.size(); }
68 static const unsigned char DUMMY_ID;
71 const unsigned char ModelType::DUMMY_ID = 100;
73 bool operator>(const ModelType& lhs, const ModelType& rhs) {
74 return lhs.size() > rhs.size();
77 unsigned char CalculateModelType(const BamAlignment& al) {
79 // localize alignment's mate positions & orientations for convenience
80 const int32_t m1_begin = ( al.IsFirstMate() ? al.Position : al.MatePosition );
81 const int32_t m2_begin = ( al.IsFirstMate() ? al.MatePosition : al.Position );
82 const bool m1_isReverseStrand = ( al.IsFirstMate() ? al.IsReverseStrand() : al.IsMateReverseStrand() );
83 const bool m2_isReverseStrand = ( al.IsFirstMate() ? al.IsMateReverseStrand() : al.IsReverseStrand() );
85 // determine 'model type'
86 if ( m1_begin < m2_begin ) {
87 if ( !m1_isReverseStrand && !m2_isReverseStrand ) return 0;
88 if ( !m1_isReverseStrand && m2_isReverseStrand ) return 1;
89 if ( m1_isReverseStrand && !m2_isReverseStrand ) return 2;
90 if ( m1_isReverseStrand && m2_isReverseStrand ) return 3;
92 if ( !m2_isReverseStrand && !m1_isReverseStrand ) return 4;
93 if ( !m2_isReverseStrand && m1_isReverseStrand ) return 5;
94 if ( m2_isReverseStrand && !m1_isReverseStrand ) return 6;
95 if ( m2_isReverseStrand && m1_isReverseStrand ) return 7;
99 return ModelType::DUMMY_ID;
102 // ---------------------------------------------
103 // ReadGroupResolver implementation
105 struct ReadGroupResolver {
108 int32_t MinFragmentLength;
109 int32_t MedianFragmentLength;
110 int32_t MaxFragmentLength;
111 vector<ModelType> Models;
113 static double ConfidenceInterval;
116 ReadGroupResolver(void);
119 bool IsValidInsertSize(const BamAlignment& al) const;
120 bool IsValidOrientation(const BamAlignment& al) const;
122 // select 2 best models based on observed data
123 void DetermineTopModels(void);
125 static void SetConfidenceInterval(const double& ci);
128 double ReadGroupResolver::ConfidenceInterval = DEFAULT_CONFIDENCE_INTERVAL;
130 ReadGroupResolver::ReadGroupResolver(void)
131 : MinFragmentLength(0)
132 , MedianFragmentLength(0)
133 , MaxFragmentLength(0)
135 // pre-allocate space for 8 models
136 Models.reserve(NUM_MODELS);
137 for ( int i = 0; i < NUM_MODELS; ++i )
138 Models.push_back( ModelType((unsigned char)(i+1)) );
141 bool ReadGroupResolver::IsValidInsertSize(const BamAlignment& al) const {
142 return ( al.InsertSize >= MinFragmentLength &&
143 al.InsertSize <= MaxFragmentLength );
146 bool ReadGroupResolver::IsValidOrientation(const BamAlignment& al) const {
147 const unsigned char currentModel = CalculateModelType(al);
148 return ( currentModel == Models[0].ID || currentModel == Models[1].ID );
151 void ReadGroupResolver::DetermineTopModels(void) {
153 // sort models (most common -> least common)
154 sort( Models.begin(), Models.end(), std::greater<ModelType>() );
156 // make sure that the 2 most common models are some threshold more common
157 // than the remaining models
158 const unsigned int activeModelCountSum = Models[0].size() + Models[1].size();
159 const unsigned int unusedModelCountSum = Models[2].size() + Models[3].size() +
160 Models[4].size() + Models[5].size() +
161 Models[6].size() + Models[7].size();
162 if ( activeModelCountSum == 0 ) return;
163 const double unusedPercentage = (double)unusedModelCountSum / (double)activeModelCountSum;
164 if ( unusedPercentage > DEFAULT_MODEL_COUNT_THRESHOLD ) {
165 cerr << "ERROR: When determining whether to apply mate-pair or paired-end constraints, "
166 << "an irregularity in the alignment model counts was discovered." << endl
168 cerr << " Normal mate-pair data sets have the highest counts for alignment models: 4 & 5." << endl;
169 cerr << " Normal paired-end data sets have the highest counts for alignment models: 2 & 6." << endl;
170 cerr << " Normal solid-end data sets have the highest counts for alignment models: 1 & 8." << endl
172 cerr << " We expect that the ratio of the 6 lowest counts to the 2 highest counts to be no larger than "
173 << DEFAULT_MODEL_COUNT_THRESHOLD << ", but in this data set the ratio was " << unusedPercentage << endl
175 for ( unsigned char i = 0; i < NUM_MODELS; ++i )
176 cerr << "- alignment model " << Models[i].ID << " : " << Models[i].size() << " hits" << endl;
180 // emit a warning if the best alignment models are non-standard
181 const bool isModel1Top = (Models[0].ID == 1) || (Models[1].ID == 1);
182 const bool isModel2Top = (Models[0].ID == 2) || (Models[1].ID == 2);
183 const bool isModel4Top = (Models[0].ID == 4) || (Models[1].ID == 4);
184 const bool isModel5Top = (Models[0].ID == 5) || (Models[1].ID == 5);
185 const bool isModel6Top = (Models[0].ID == 6) || (Models[1].ID == 6);
186 const bool isModel8Top = (Models[0].ID == 8) || (Models[1].ID == 8);
188 bool isMatePair = ( isModel4Top && isModel5Top ? true : false );
189 bool isPairedEnd = ( isModel2Top && isModel6Top ? true : false );
190 bool isSolidPair = ( isModel1Top && isModel8Top ? true : false );
193 if ( isMatePair ) cerr << "- resolving mate-pair alignments" << endl;
194 else if ( isPairedEnd ) cerr << "- resolving paired-end alignments" << endl;
195 else if ( isSolidPair ) cerr << "- resolving solid-pair alignments" << endl;
197 cerr << "- WARNING: Found a non-standard alignment model configuration. "
198 << "Using alignment models " << Models[0].ID << " & " << Models[1].ID << endl;
202 // store only the fragments from the best alignment models, then sort
203 vector<uint32_t> fragments;
204 fragments.reserve( Models[0].size() + Models[1].size() );
205 fragments.insert( fragments.end(), Models[0].begin(), Models[0].end() );
206 fragments.insert( fragments.end(), Models[1].begin(), Models[1].end() );
207 sort ( fragments.begin(), fragments.end() );
209 // clear out Model fragment data, not needed anymore
210 // keep sorted though, with IDs, we'll be coming back to that
211 for ( int i = 0; i < NUM_MODELS; ++i )
214 // determine min,median, & max fragment lengths for each read group
215 const double halfNonConfidenceInterval = (1.0 - ReadGroupResolver::ConfidenceInterval)/2.0;
216 const unsigned int numFragmentLengths = fragments.size();
217 if ( numFragmentLengths == 0 ) return;
219 const unsigned int minIndex = (unsigned int)(numFragmentLengths * halfNonConfidenceInterval);
220 MinFragmentLength = fragments[minIndex];
222 const unsigned int medianIndex = (unsigned int)(numFragmentLengths * 0.5);
223 MedianFragmentLength = fragments[medianIndex];
225 const unsigned int maxIndex = (unsigned int)(numFragmentLengths * (1.0-halfNonConfidenceInterval));
226 MaxFragmentLength = fragments[maxIndex];
229 void ReadGroupResolver::SetConfidenceInterval(const double& ci) {
230 ConfidenceInterval = ci;
233 // ---------------------------------------------
234 // ResolveSettings implementation
236 struct ResolveTool::ResolveSettings {
242 bool HasConfidenceInterval;
243 bool IsForceCompression;
246 string InputFilename;
247 string OutputFilename;
248 string StatsFilename;
251 double ConfidenceInterval;
254 ResolveSettings(void)
255 : HasInputFile(false)
256 , HasOutputFile(false)
257 , HasStatsFile(false)
258 , IsForceCompression(false)
259 , InputFilename(Options::StandardIn())
260 , OutputFilename(Options::StandardOut())
262 , ConfidenceInterval(DEFAULT_CONFIDENCE_INTERVAL)
266 // ---------------------------------------------
267 // ResolveToolPrivate implementation
269 struct ResolveTool::ResolveToolPrivate {
273 ResolveToolPrivate(ResolveTool::ResolveSettings* settings)
274 : m_settings(settings)
276 ~ResolveToolPrivate(void) { }
278 // 'public' interface
284 bool CalculateStats(BamReader& reader);
285 void ParseHeader(const SamHeader& header);
286 bool ParseStatsFile(void);
287 void ResolveAlignment(BamAlignment& al);
291 ResolveTool::ResolveSettings* m_settings;
292 map<string, ReadGroupResolver> m_readGroups;
295 bool ResolveTool::ResolveToolPrivate::CalculateStats(BamReader& reader) {
297 // ensure that we get a fresh BamReader
300 // read through BAM file
302 string readGroup("");
303 map<string, ReadGroupResolver>::iterator rgIter;
304 while ( reader.GetNextAlignmentCore(al) ) {
306 // skip if alignment is not paired, mapped, nor mate is mapped
307 if ( !al.IsPaired() || !al.IsMapped() || !al.IsMateMapped() )
310 // determine model type, skip if model unknown
311 const unsigned char currentModelType = CalculateModelType(al);
312 assert( currentModelType != ModelType::DUMMY_ID );
314 // flesh out the char data, so we can retrieve its read group ID
317 // get read group from alignment
319 al.GetTag(READ_GROUP_TAG, readGroup);
321 // look up resolver for read group
322 rgIter = m_readGroups.find(readGroup);
323 if ( rgIter == m_readGroups.end() ) {
324 cerr << "bamtools resolve ERROR - unable to calculate stats, unknown read group encountered: "
325 << readGroup << endl;
328 ReadGroupResolver& resolver = (*rgIter).second;
330 // update stats for current read group, current model type
331 resolver.Models[currentModelType].push_back(al.InsertSize);
334 // iterate back through read groups
335 map<string, ReadGroupResolver>::iterator rgEnd = m_readGroups.end();
336 for ( rgIter = m_readGroups.begin(); rgIter != rgEnd; ++rgIter ) {
337 ReadGroupResolver& resolver = (*rgIter).second;
339 // calculate acceptable orientation & insert sizes for this read group
340 resolver.DetermineTopModels();
343 cerr << "----------------------------------------" << endl
344 << "ReadGroup: " << (*rgIter).first << endl
345 << "----------------------------------------" << endl
346 << "Min FL: " << resolver.MinFragmentLength << endl
347 << "Med FL: " << resolver.MedianFragmentLength << endl
348 << "Max FL: " << resolver.MaxFragmentLength << endl
353 // return reader to beginning & return success
358 void ResolveTool::ResolveToolPrivate::ParseHeader(const SamHeader& header) {
360 // iterate over header read groups, creating a 'resolver' for each
361 SamReadGroupConstIterator rgIter = header.ReadGroups.ConstBegin();
362 SamReadGroupConstIterator rgEnd = header.ReadGroups.ConstEnd();
363 for ( ; rgIter != rgEnd; ++rgIter ) {
364 const SamReadGroup& rg = (*rgIter);
365 m_readGroups.insert( make_pair<string, ReadGroupResolver>(rg.ID, ReadGroupResolver()) );
369 bool ResolveTool::ResolveToolPrivate::ParseStatsFile(void) {
370 cerr << "ResolveTool::ParseStatsFile() ERROR - not yet implemented!" << endl;
374 void ResolveTool::ResolveToolPrivate::ResolveAlignment(BamAlignment& al) {
376 // clear proper-pair flag
377 al.SetIsProperPair(false);
379 // quit check if alignment is not from paired-end read
380 if ( !al.IsPaired() ) return;
382 // quit check if either alignment or mate are unmapped
383 if ( !al.IsMapped() || !al.IsMateMapped() ) return;
385 // quit check if map quality is 0
386 if ( al.MapQuality == 0 ) return;
388 // get read group from alignment
389 // empty string if not found, this is OK - we handle a default empty read group case
390 string readGroup("");
391 al.GetTag(READ_GROUP_TAG, readGroup);
393 // look up read group's 'resolver'
394 map<string, ReadGroupResolver>::iterator rgIter = m_readGroups.find(readGroup);
395 if ( rgIter == m_readGroups.end() ) {
396 cerr << "bamtools resolve ERROR - read group found that was not in header: "
397 << readGroup << endl;
400 const ReadGroupResolver& resolver = (*rgIter).second;
402 // quit check if pairs are not in proper orientation (tech-dependent, can differ for each RG)
403 if ( !resolver.IsValidOrientation(al) ) return;
405 // quit check if pairs are not within "reasonable" distance (differs for each RG)
406 if ( !resolver.IsValidInsertSize(al) ) return;
408 // if we get here, alignment is OK - set 'proper pair' flag
409 al.SetIsProperPair(true);
412 bool ResolveTool::ResolveToolPrivate::Run(void) {
414 ReadGroupResolver::SetConfidenceInterval(m_settings->ConfidenceInterval);
416 // initialize read group map with default (empty name) read group
417 m_readGroups.insert( make_pair<string, ReadGroupResolver>("", ReadGroupResolver()) );
419 // open our BAM reader
421 if ( !reader.Open(m_settings->InputFilename) ) {
422 cerr << "bamtools resolve ERROR: could not open input BAM file: "
423 << m_settings->InputFilename << endl;
427 // retrieve header & reference dictionary info
428 const SamHeader& header = reader.GetHeader();
429 const RefVector& references = reader.GetReferenceData();
431 // parse BAM header for read groups
434 // if existing stats file provided, parse for fragment lengths
435 if ( m_settings->HasStatsFile ) {
436 if ( !ParseStatsFile() ) {
437 cerr << "bamtools resolve ERROR - could not parse stats file" << endl;
442 // otherwise calculate stats from BAM alignments
444 if ( !CalculateStats(reader) ) {
445 cerr << "bamtools resolve ERROR - could not calculate stats from BAM file" << endl;
451 // determine compression mode for BamWriter
452 bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
453 !m_settings->IsForceCompression );
454 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
455 if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
459 writer.SetCompressionMode(compressionMode);
460 if ( !writer.Open(m_settings->OutputFilename, header, references) ) {
461 cerr << "bamtools resolve ERROR: could not open "
462 << m_settings->OutputFilename << " for writing." << endl;
467 // plow through alignments, setting/clearing 'proper pair' flag
468 // and writing to new output BAM file
470 while ( reader.GetNextAlignment(al) ) {
471 ResolveAlignment(al);
472 writer.SaveAlignment(al);
475 // clean up & return success
481 // ---------------------------------------------
482 // ResolveTool implementation
484 ResolveTool::ResolveTool(void)
486 , m_settings(new ResolveSettings)
489 // set program details
490 Options::SetProgramInfo("bamtools resolve", "resolves paired-end reads (marking the IsProperPair flag as needed)", "[-in <filename>] [-out <filename> | [-forceCompression] ] ");
493 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
494 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFile, m_settings->InputFilename, IO_Opts, Options::StandardIn());
495 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputFile, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
496 // Options::AddValueOption("-stats", "STATS filename", "alignment stats file", "", m_settings->HasStatsFile, m_settings->StatsFilename, IO_Opts, "");
497 Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression",
498 m_settings->IsForceCompression, IO_Opts);
500 OptionGroup* ResolveOpts = Options::CreateOptionGroup("Resolve Settings");
501 Options::AddValueOption("-ci", "double", "confidence interval",
502 "", m_settings->HasConfidenceInterval, m_settings->ConfidenceInterval, ResolveOpts);
505 ResolveTool::~ResolveTool(void) {
514 int ResolveTool::Help(void) {
515 Options::DisplayHelp();
519 int ResolveTool::Run(int argc, char* argv[]) {
521 // parse command line arguments
522 Options::Parse(argc, argv, 1);
524 // initialize ResolveTool
525 m_impl = new ResolveToolPrivate(m_settings);
527 // run ResolveTool, return success/failure