From: derek Date: Tue, 7 Jun 2011 19:47:42 +0000 (-0400) Subject: Added new ResolveTool - for paired-end resolution. Similar to functionality provided... X-Git-Url: https://git.donarmstrong.com/?p=bamtools.git;a=commitdiff_plain;h=66ddb686203f754c010ca1538ebe69ba03f38f7f Added new ResolveTool - for paired-end resolution. Similar to functionality provided inthe deprecated (or soon to be) MosaikSort tool --- diff --git a/src/toolkit/CMakeLists.txt b/src/toolkit/CMakeLists.txt index 501e913..02dcb68 100644 --- a/src/toolkit/CMakeLists.txt +++ b/src/toolkit/CMakeLists.txt @@ -21,6 +21,7 @@ add_executable ( bamtools bamtools_index.cpp bamtools_merge.cpp bamtools_random.cpp + bamtools_resolve.cpp bamtools_revert.cpp bamtools_sort.cpp bamtools_split.cpp diff --git a/src/toolkit/bamtools.cpp b/src/toolkit/bamtools.cpp index 4875a9a..1230e04 100644 --- a/src/toolkit/bamtools.cpp +++ b/src/toolkit/bamtools.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 18 May 2011 (DB) // --------------------------------------------------------------------------- // Integrates a number of BamTools functionalities into a single executable. // *************************************************************************** @@ -16,6 +16,7 @@ #include "bamtools_index.h" #include "bamtools_merge.h" #include "bamtools_random.h" +#include "bamtools_resolve.h" #include "bamtools_revert.h" #include "bamtools_sort.h" #include "bamtools_split.h" @@ -38,6 +39,7 @@ static const string HEADER = "header"; static const string INDEX = "index"; static const string MERGE = "merge"; static const string RANDOM = "random"; +static const string RESOLVE = "resolve"; static const string REVERT = "revert"; static const string SORT = "sort"; static const string SPLIT = "split"; @@ -77,6 +79,7 @@ AbstractTool* CreateTool(const string& arg) { if ( arg == INDEX ) return new IndexTool; if ( arg == MERGE ) return new MergeTool; if ( arg == RANDOM ) return new RandomTool; + if ( arg == RESOLVE ) return new ResolveTool; if ( arg == REVERT ) return new RevertTool; if ( arg == SORT ) return new SortTool; if ( arg == SPLIT ) return new SplitTool; @@ -111,7 +114,8 @@ int Help(int argc, char* argv[]) { cerr << "\theader Prints BAM header information" << endl; cerr << "\tindex Generates index for BAM file" << endl; cerr << "\tmerge Merge multiple BAM files into single file" << endl; - cerr << "\trandom Select random alignments from existing BAM file(s)" << endl; + cerr << "\trandom Select random alignments from existing BAM file(s), intended more as a testing tool." << endl; + cerr << "\tresolve Resolves paired-end reads (marking the IsProperPair flag as needed)" << endl; cerr << "\trevert Removes duplicate marks and restores original base qualities" << endl; cerr << "\tsort Sorts the BAM file according to some criteria" << endl; cerr << "\tsplit Splits a BAM file on user-specified property, creating a new BAM output file for each value found" << endl; diff --git a/src/toolkit/bamtools_resolve.cpp b/src/toolkit/bamtools_resolve.cpp index 4c02640..1c28d66 100644 --- a/src/toolkit/bamtools_resolve.cpp +++ b/src/toolkit/bamtools_resolve.cpp @@ -3,44 +3,263 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 7 April 2011 +// Last modified: 7 June 2011 // --------------------------------------------------------------------------- -// Resolves paired-end reads (marking the IsProperPair flag as needed) +// Resolves paired-end reads (marking the IsProperPair flag as needed) in a +// BAM file. // *************************************************************************** #include "bamtools_resolve.h" #include #include +#include +#include #include #include using namespace BamTools; +#include +#include +#include +#include #include +#include #include +#include +#include using namespace std; +enum { debug = 1 }; + +// constants +static const int NUM_MODELS = 8; +static const string READ_GROUP_TAG = "RG"; +static const double DEFAULT_CONFIDENCE_INTERVAL = 0.9973; +static const double DEFAULT_MODEL_COUNT_THRESHOLD = 0.1; + +// ----------------------------------------------- +// ModelType implementation + +struct ModelType { + + // data members + unsigned char ID; + vector FragmentLengths; + + // ctor + ModelType(const unsigned char id) + : ID(id) + { + // preallocate space for 10K fragments per model type + FragmentLengths.reserve(10000); + } + + // convenience access to internal fragment lengths vector + vector::iterator begin(void) { return FragmentLengths.begin(); } + vector::const_iterator begin(void) const { return FragmentLengths.begin(); } + void clear(void) { FragmentLengths.clear(); } + vector::iterator end(void) { return FragmentLengths.end(); } + vector::const_iterator end(void) const { return FragmentLengths.end(); } + void push_back(const uint32_t& x) { FragmentLengths.push_back(x); } + size_t size(void) const { return FragmentLengths.size(); } + + // constants + static const unsigned char DUMMY_ID; +}; + +const unsigned char ModelType::DUMMY_ID = 100; + +bool operator>(const ModelType& lhs, const ModelType& rhs) { + return lhs.size() > rhs.size(); +} + +unsigned char CalculateModelType(const BamAlignment& al) { + + // localize alignment's mate positions & orientations for convenience + const int32_t m1_begin = ( al.IsFirstMate() ? al.Position : al.MatePosition ); + const int32_t m2_begin = ( al.IsFirstMate() ? al.MatePosition : al.Position ); + const bool m1_isReverseStrand = ( al.IsFirstMate() ? al.IsReverseStrand() : al.IsMateReverseStrand() ); + const bool m2_isReverseStrand = ( al.IsFirstMate() ? al.IsMateReverseStrand() : al.IsReverseStrand() ); + + // determine 'model type' + if ( m1_begin < m2_begin ) { + if ( !m1_isReverseStrand && !m2_isReverseStrand ) return 0; + if ( !m1_isReverseStrand && m2_isReverseStrand ) return 1; + if ( m1_isReverseStrand && !m2_isReverseStrand ) return 2; + if ( m1_isReverseStrand && m2_isReverseStrand ) return 3; + } else { + if ( !m2_isReverseStrand && !m1_isReverseStrand ) return 4; + if ( !m2_isReverseStrand && m1_isReverseStrand ) return 5; + if ( m2_isReverseStrand && !m1_isReverseStrand ) return 6; + if ( m2_isReverseStrand && m1_isReverseStrand ) return 7; + } + + // unknown model + return ModelType::DUMMY_ID; +} + +// --------------------------------------------- +// ReadGroupResolver implementation + +struct ReadGroupResolver { + + // data members + int32_t MinFragmentLength; + int32_t MedianFragmentLength; + int32_t MaxFragmentLength; + vector Models; + + static double ConfidenceInterval; + + // ctor + ReadGroupResolver(void); + + // resolving methods + bool IsValidInsertSize(const BamAlignment& al) const; + bool IsValidOrientation(const BamAlignment& al) const; + + // select 2 best models based on observed data + void DetermineTopModels(void); + + static void SetConfidenceInterval(const double& ci); +}; + +double ReadGroupResolver::ConfidenceInterval = DEFAULT_CONFIDENCE_INTERVAL; + +ReadGroupResolver::ReadGroupResolver(void) + : MinFragmentLength(0) + , MedianFragmentLength(0) + , MaxFragmentLength(0) +{ + // pre-allocate space for 8 models + Models.reserve(NUM_MODELS); + for ( int i = 0; i < NUM_MODELS; ++i ) + Models.push_back( ModelType((unsigned char)(i+1)) ); +} + +bool ReadGroupResolver::IsValidInsertSize(const BamAlignment& al) const { + return ( al.InsertSize >= MinFragmentLength && + al.InsertSize <= MaxFragmentLength ); +} + +bool ReadGroupResolver::IsValidOrientation(const BamAlignment& al) const { + const unsigned char currentModel = CalculateModelType(al); + return ( currentModel == Models[0].ID || currentModel == Models[1].ID ); +} + +void ReadGroupResolver::DetermineTopModels(void) { + + // sort models (most common -> least common) + sort( Models.begin(), Models.end(), std::greater() ); + + // make sure that the 2 most common models are some threshold more common + // than the remaining models + const unsigned int activeModelCountSum = Models[0].size() + Models[1].size(); + const unsigned int unusedModelCountSum = Models[2].size() + Models[3].size() + + Models[4].size() + Models[5].size() + + Models[6].size() + Models[7].size(); + if ( activeModelCountSum == 0 ) return; + const double unusedPercentage = (double)unusedModelCountSum / (double)activeModelCountSum; + if ( unusedPercentage > DEFAULT_MODEL_COUNT_THRESHOLD ) { + cerr << "ERROR: When determining whether to apply mate-pair or paired-end constraints, " + << "an irregularity in the alignment model counts was discovered." << endl + << endl; + cerr << " Normal mate-pair data sets have the highest counts for alignment models: 4 & 5." << endl; + cerr << " Normal paired-end data sets have the highest counts for alignment models: 2 & 6." << endl; + cerr << " Normal solid-end data sets have the highest counts for alignment models: 1 & 8." << endl + << endl; + cerr << " We expect that the ratio of the 6 lowest counts to the 2 highest counts to be no larger than " + << DEFAULT_MODEL_COUNT_THRESHOLD << ", but in this data set the ratio was " << unusedPercentage << endl + << endl; + for ( unsigned char i = 0; i < NUM_MODELS; ++i ) + cerr << "- alignment model " << Models[i].ID << " : " << Models[i].size() << " hits" << endl; + exit(1); + } + + // emit a warning if the best alignment models are non-standard + const bool isModel1Top = (Models[0].ID == 1) || (Models[1].ID == 1); + const bool isModel2Top = (Models[0].ID == 2) || (Models[1].ID == 2); + const bool isModel4Top = (Models[0].ID == 4) || (Models[1].ID == 4); + const bool isModel5Top = (Models[0].ID == 5) || (Models[1].ID == 5); + const bool isModel6Top = (Models[0].ID == 6) || (Models[1].ID == 6); + const bool isModel8Top = (Models[0].ID == 8) || (Models[1].ID == 8); + + bool isMatePair = ( isModel4Top && isModel5Top ? true : false ); + bool isPairedEnd = ( isModel2Top && isModel6Top ? true : false ); + bool isSolidPair = ( isModel1Top && isModel8Top ? true : false ); + + if ( debug ) { + if ( isMatePair ) cerr << "- resolving mate-pair alignments" << endl; + else if ( isPairedEnd ) cerr << "- resolving paired-end alignments" << endl; + else if ( isSolidPair ) cerr << "- resolving solid-pair alignments" << endl; + else { + cerr << "- WARNING: Found a non-standard alignment model configuration. " + << "Using alignment models " << Models[0].ID << " & " << Models[1].ID << endl; + } + } + + // store only the fragments from the best alignment models, then sort + vector fragments; + fragments.reserve( Models[0].size() + Models[1].size() ); + fragments.insert( fragments.end(), Models[0].begin(), Models[0].end() ); + fragments.insert( fragments.end(), Models[1].begin(), Models[1].end() ); + sort ( fragments.begin(), fragments.end() ); + + // clear out Model fragment data, not needed anymore + // keep sorted though, with IDs, we'll be coming back to that + for ( int i = 0; i < NUM_MODELS; ++i ) + Models[i].clear(); + + // determine min,median, & max fragment lengths for each read group + const double halfNonConfidenceInterval = (1.0 - ReadGroupResolver::ConfidenceInterval)/2.0; + const unsigned int numFragmentLengths = fragments.size(); + if ( numFragmentLengths == 0 ) return; + + const unsigned int minIndex = (unsigned int)(numFragmentLengths * halfNonConfidenceInterval); + MinFragmentLength = fragments[minIndex]; + + const unsigned int medianIndex = (unsigned int)(numFragmentLengths * 0.5); + MedianFragmentLength = fragments[medianIndex]; + + const unsigned int maxIndex = (unsigned int)(numFragmentLengths * (1.0-halfNonConfidenceInterval)); + MaxFragmentLength = fragments[maxIndex]; +} + +void ReadGroupResolver::SetConfidenceInterval(const double& ci) { + ConfidenceInterval = ci; +} + // --------------------------------------------- // ResolveSettings implementation struct ResolveTool::ResolveSettings { // flags - bool HasInput; - bool HasOutput; + bool HasInputFile; + bool HasOutputFile; + bool HasStatsFile; + bool HasConfidenceInterval; bool IsForceCompression; // filenames string InputFilename; string OutputFilename; + string StatsFilename; + + // 'resolve options' + double ConfidenceInterval; // constructor ResolveSettings(void) - : HasInput(false) - , HasOutput(false) + : HasInputFile(false) + , HasOutputFile(false) + , HasStatsFile(false) , IsForceCompression(false) , InputFilename(Options::StandardIn()) , OutputFilename(Options::StandardOut()) + , StatsFilename("") + , ConfidenceInterval(DEFAULT_CONFIDENCE_INTERVAL) { } }; @@ -54,7 +273,6 @@ struct ResolveTool::ResolveToolPrivate { ResolveToolPrivate(ResolveTool::ResolveSettings* settings) : m_settings(settings) { } - ~ResolveToolPrivate(void) { } // 'public' interface @@ -63,14 +281,200 @@ struct ResolveTool::ResolveToolPrivate { // internal methods private: + bool CalculateStats(BamReader& reader); + void ParseHeader(const SamHeader& header); + bool ParseStatsFile(void); + void ResolveAlignment(BamAlignment& al); // data members private: ResolveTool::ResolveSettings* m_settings; + map m_readGroups; }; +bool ResolveTool::ResolveToolPrivate::CalculateStats(BamReader& reader) { + + // ensure that we get a fresh BamReader + reader.Rewind(); + + // read through BAM file + BamAlignment al; + string readGroup(""); + map::iterator rgIter; + while ( reader.GetNextAlignmentCore(al) ) { + + // skip if alignment is not paired, mapped, nor mate is mapped + if ( !al.IsPaired() || !al.IsMapped() || !al.IsMateMapped() ) + continue; + + // determine model type, skip if model unknown + const unsigned char currentModelType = CalculateModelType(al); + assert( currentModelType != ModelType::DUMMY_ID ); + + // flesh out the char data, so we can retrieve its read group ID + al.BuildCharData(); + + // get read group from alignment + readGroup.clear(); + al.GetTag(READ_GROUP_TAG, readGroup); + + // look up resolver for read group + rgIter = m_readGroups.find(readGroup); + if ( rgIter == m_readGroups.end() ) { + cerr << "bamtools resolve ERROR - unable to calculate stats, unknown read group encountered: " + << readGroup << endl; + return false; + } + ReadGroupResolver& resolver = (*rgIter).second; + + // update stats for current read group, current model type + resolver.Models[currentModelType].push_back(al.InsertSize); + } + + // iterate back through read groups + map::iterator rgEnd = m_readGroups.end(); + for ( rgIter = m_readGroups.begin(); rgIter != rgEnd; ++rgIter ) { + ReadGroupResolver& resolver = (*rgIter).second; + + // calculate acceptable orientation & insert sizes for this read group + resolver.DetermineTopModels(); + + if ( debug ) { + cerr << "----------------------------------------" << endl + << "ReadGroup: " << (*rgIter).first << endl + << "----------------------------------------" << endl + << "Min FL: " << resolver.MinFragmentLength << endl + << "Med FL: " << resolver.MedianFragmentLength << endl + << "Max FL: " << resolver.MaxFragmentLength << endl + << endl; + } + } + + // return reader to beginning & return success + reader.Rewind(); + return true; +} + +void ResolveTool::ResolveToolPrivate::ParseHeader(const SamHeader& header) { + + // iterate over header read groups, creating a 'resolver' for each + SamReadGroupConstIterator rgIter = header.ReadGroups.ConstBegin(); + SamReadGroupConstIterator rgEnd = header.ReadGroups.ConstEnd(); + for ( ; rgIter != rgEnd; ++rgIter ) { + const SamReadGroup& rg = (*rgIter); + m_readGroups.insert( make_pair(rg.ID, ReadGroupResolver()) ); + } +} + +bool ResolveTool::ResolveToolPrivate::ParseStatsFile(void) { + cerr << "ResolveTool::ParseStatsFile() ERROR - not yet implemented!" << endl; + return false; +} + +void ResolveTool::ResolveToolPrivate::ResolveAlignment(BamAlignment& al) { + + // clear proper-pair flag + al.SetIsProperPair(false); + + // quit check if alignment is not from paired-end read + if ( !al.IsPaired() ) return; + + // quit check if either alignment or mate are unmapped + if ( !al.IsMapped() || !al.IsMateMapped() ) return; + + // quit check if map quality is 0 + if ( al.MapQuality == 0 ) return; + + // get read group from alignment + // empty string if not found, this is OK - we handle a default empty read group case + string readGroup(""); + al.GetTag(READ_GROUP_TAG, readGroup); + + // look up read group's 'resolver' + map::iterator rgIter = m_readGroups.find(readGroup); + if ( rgIter == m_readGroups.end() ) { + cerr << "bamtools resolve ERROR - read group found that was not in header: " + << readGroup << endl; + exit(1); + } + const ReadGroupResolver& resolver = (*rgIter).second; + + // quit check if pairs are not in proper orientation (tech-dependent, can differ for each RG) + if ( !resolver.IsValidOrientation(al) ) return; + + // quit check if pairs are not within "reasonable" distance (differs for each RG) + if ( !resolver.IsValidInsertSize(al) ) return; + + // if we get here, alignment is OK - set 'proper pair' flag + al.SetIsProperPair(true); +} + bool ResolveTool::ResolveToolPrivate::Run(void) { - cerr << "Resoling BAM file..." << endl; + + ReadGroupResolver::SetConfidenceInterval(m_settings->ConfidenceInterval); + + // initialize read group map with default (empty name) read group + m_readGroups.insert( make_pair("", ReadGroupResolver()) ); + + // open our BAM reader + BamReader reader; + if ( !reader.Open(m_settings->InputFilename) ) { + cerr << "bamtools resolve ERROR: could not open input BAM file: " + << m_settings->InputFilename << endl; + return false; + } + + // retrieve header & reference dictionary info + const SamHeader& header = reader.GetHeader(); + const RefVector& references = reader.GetReferenceData(); + + // parse BAM header for read groups + ParseHeader(header); + + // if existing stats file provided, parse for fragment lengths + if ( m_settings->HasStatsFile ) { + if ( !ParseStatsFile() ) { + cerr << "bamtools resolve ERROR - could not parse stats file" << endl; + reader.Close(); + return false; + } + } + // otherwise calculate stats from BAM alignments + else { + if ( !CalculateStats(reader) ) { + cerr << "bamtools resolve ERROR - could not calculate stats from BAM file" << endl; + reader.Close(); + return false; + } + } + + // determine compression mode for BamWriter + bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && + !m_settings->IsForceCompression ); + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed; + + // open BamWriter + BamWriter writer; + writer.SetCompressionMode(compressionMode); + if ( !writer.Open(m_settings->OutputFilename, header, references) ) { + cerr << "bamtools resolve ERROR: could not open " + << m_settings->OutputFilename << " for writing." << endl; + reader.Close(); + return false; + } + + // plow through alignments, setting/clearing 'proper pair' flag + // and writing to new output BAM file + BamAlignment al; + while ( reader.GetNextAlignment(al) ) { + ResolveAlignment(al); + writer.SaveAlignment(al); + } + + // clean up & return success + reader.Close(); + writer.Close(); return true; } @@ -83,13 +487,19 @@ ResolveTool::ResolveTool(void) , m_impl(0) { // set program details - Options::SetProgramInfo("bamtools resolve", "resolves paired-end reads (marking the IsProperPair flag as needed)", "[-in -in ...] [-out | [-forceCompression]] [resolveOptions]"); + Options::SetProgramInfo("bamtools resolve", "resolves paired-end reads (marking the IsProperPair flag as needed)", "[-in ] [-out | [-forceCompression] ] "); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFilename, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); - Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFile, m_settings->InputFilename, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputFile, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); +// Options::AddValueOption("-stats", "STATS filename", "alignment stats file", "", m_settings->HasStatsFile, m_settings->StatsFilename, IO_Opts, ""); + Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", + m_settings->IsForceCompression, IO_Opts); + + OptionGroup* ResolveOpts = Options::CreateOptionGroup("Resolve Settings"); + Options::AddValueOption("-ci", "double", "confidence interval", + "", m_settings->HasConfidenceInterval, m_settings->ConfidenceInterval, ResolveOpts); } ResolveTool::~ResolveTool(void) { @@ -115,6 +525,8 @@ int ResolveTool::Run(int argc, char* argv[]) { m_impl = new ResolveToolPrivate(m_settings); // run ResolveTool, return success/failure - if ( m_impl->Run() ) return 0; - else return 1; + if ( m_impl->Run() ) + return 0; + else + return 1; } diff --git a/src/toolkit/bamtools_resolve.h b/src/toolkit/bamtools_resolve.h index 96bd162..20308d3 100644 --- a/src/toolkit/bamtools_resolve.h +++ b/src/toolkit/bamtools_resolve.h @@ -3,9 +3,10 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 7 April 2011 +// Last modified: 19 April 2011 // --------------------------------------------------------------------------- -// Resolves paired-end reads (marking the IsProperPair flag as needed) +// Resolves paired-end reads (marking the IsProperPair flag as needed) in a +// BAM file. // *************************************************************************** #ifndef BAMTOOLS_RESOLVE_H