From cbd3b7633c78f91be456991e3ec751ed09ddf87e Mon Sep 17 00:00:00 2001 From: derek Date: Tue, 28 Jun 2011 12:31:25 -0400 Subject: [PATCH] Added unique-alignment checks for ResolveTool * Unique-ness determined by comparing MapQuality to 0 * Only pairs with both mates unique are used for the 'makeStats' median fragment size calculation. --- src/toolkit/bamtools_resolve.cpp | 232 ++++++++++++++++++++++++++++--- src/toolkit/bamtools_resolve.h | 4 +- 2 files changed, 212 insertions(+), 24 deletions(-) diff --git a/src/toolkit/bamtools_resolve.cpp b/src/toolkit/bamtools_resolve.cpp index 97eefcf..a873d5f 100644 --- a/src/toolkit/bamtools_resolve.cpp +++ b/src/toolkit/bamtools_resolve.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 14 June 2011 +// Last modified: 28 June 2011 // --------------------------------------------------------------------------- // Resolves paired-end reads (marking the IsProperPair flag as needed). // *************************************************************************** @@ -19,6 +19,7 @@ using namespace BamTools; #include #include #include +#include #include #include #include @@ -71,6 +72,13 @@ static const string OPTION_FORCEMARKREADGROUPS = "ForceMarkReadGroups"; static const string RG_FIELD_DESCRIPTION = "# "; +// -------------------------------------------------------------------------- +// unique readname file constants +// -------------------------------------------------------------------------- + +static const string READNAME_FILE_SUFFIX = ".uniq_names.txt"; +static const string DEFAULT_READNAME_FILE = "bt_resolve_TEMP" + READNAME_FILE_SUFFIX; + // -------------------------------------------------------------------------- // ModelType implementation @@ -146,6 +154,7 @@ struct ReadGroupResolver { bool IsAmbiguous; bool HasData; vector Models; + map ReadNames; // ctor ReadGroupResolver(void); @@ -292,6 +301,7 @@ struct ResolveTool::ResolveSettings { string InputBamFilename; string OutputBamFilename; string StatsFilename; + string ReadNamesFilename; // ** N.B. - Only used internally, not set from cmdline ** // 'resolve options' double ConfidenceInterval; @@ -312,12 +322,121 @@ struct ResolveTool::ResolveSettings { , InputBamFilename(Options::StandardIn()) , OutputBamFilename(Options::StandardOut()) , StatsFilename("") + , ReadNamesFilename(DEFAULT_READNAME_FILE) , ConfidenceInterval(DEFAULT_CONFIDENCE_INTERVAL) , UnusedModelThreshold(DEFAULT_UNUSEDMODEL_THRESHOLD) , HasForceMarkReadGroups(false) { } }; +// -------------------------------------------------------------------------- +// ReadNamesFileReader implementation + +struct ResolveTool::ReadNamesFileReader { + + // ctor & dtor + ReadNamesFileReader(void) { } + ~ReadNamesFileReader(void) { Close(); } + + // main reader interface + public: + void Close(void); + bool Open(const string& filename); + bool Read(map& readGroups); + + // data members + private: + ifstream m_stream; +}; + +void ResolveTool::ReadNamesFileReader::Close(void) { + if ( m_stream.is_open() ) + m_stream.close(); +} + +bool ResolveTool::ReadNamesFileReader::Open(const string& filename) { + + // make sure stream is fresh + Close(); + + // attempt to open filename, return status + m_stream.open(filename.c_str(), ifstream::in); + return m_stream.good(); +} + +bool ResolveTool::ReadNamesFileReader::Read(map& readGroups) { + + // up-front sanity check + if ( !m_stream.is_open() ) return false; + + // parse read names file + string line; + vector fields; + map::iterator rgIter; + map::iterator rgEnd = readGroups.end(); + while ( getline(m_stream, line) ) { + + // skip if empty line + if ( line.empty() ) continue; + + // split line on '\t' + fields = Utilities::Split(line, TAB_CHAR); + if ( fields.size() != 2 ) continue; + + // look up resolver for read group + rgIter = readGroups.find( fields[0] ); + if ( rgIter == rgEnd ) return false; + ReadGroupResolver& resolver = (*rgIter).second; + + // store read name with resolver + resolver.ReadNames.insert( make_pair(fields[1], true) ) ; + } + + // if here, return success + return true; +} + +// -------------------------------------------------------------------------- +// ReadNamesFileWriter implementation + +struct ResolveTool::ReadNamesFileWriter { + + // ctor & dtor + ReadNamesFileWriter(void) { } + ~ReadNamesFileWriter(void) { Close(); } + + // main reader interface + public: + void Close(void); + bool Open(const string& filename); + void Write(const string& readGroupName, const string& readName); + + // data members + private: + ofstream m_stream; +}; + +void ResolveTool::ReadNamesFileWriter::Close(void) { + if ( m_stream.is_open() ) + m_stream.close(); +} + +bool ResolveTool::ReadNamesFileWriter::Open(const string& filename) { + + // make sure stream is fresh + Close(); + + // attempt to open filename, return status + m_stream.open(filename.c_str(), ofstream::out); + return m_stream.good(); +} + +void ResolveTool::ReadNamesFileWriter::Write(const string& readGroupName, + const string& readName) +{ + m_stream << readGroupName << TAB_CHAR << readName << endl; +} + // -------------------------------------------------------------------------- // StatsFileReader implementation @@ -331,7 +450,7 @@ struct ResolveTool::StatsFileReader { // main reader interface public: void Close(void); - bool Open(const std::string& filename); + bool Open(const string& filename); bool Read(ResolveTool::ResolveSettings* settings, map& readGroups); @@ -538,7 +657,7 @@ struct ResolveTool::StatsFileWriter { // main reader interface public: void Close(void); - bool Open(const std::string& filename); + bool Open(const string& filename); bool Write(ResolveTool::ResolveSettings* settings, const map& readGroups); @@ -779,22 +898,32 @@ bool ResolveTool::ResolveToolPrivate::MakeStats(void) { ReadGroupResolver::SetUnusedModelThreshold(m_settings->UnusedModelThreshold); // open our BAM reader - BamReader reader; - if ( !reader.Open(m_settings->InputBamFilename) ) { + BamReader bamReader; + if ( !bamReader.Open(m_settings->InputBamFilename) ) { cerr << "bamtools resolve ERROR: could not open input BAM file: " << m_settings->InputBamFilename << endl; return false; } // retrieve header & parse for read groups - const SamHeader& header = reader.GetHeader(); + const SamHeader& header = bamReader.GetHeader(); ParseHeader(header); + // open ReadNamesFileWriter + ResolveTool::ReadNamesFileWriter readNamesWriter; + if ( !readNamesWriter.Open(m_settings->ReadNamesFilename) ) { + cerr << "bamtools resolve ERROR: could not open (temp) output read names file: " + << m_settings->ReadNamesFilename << endl; + bamReader.Close(); + return false; + } + // read through BAM file BamAlignment al; string readGroup(""); map::iterator rgIter; - while ( reader.GetNextAlignmentCore(al) ) { + map::iterator readNameIter; + while ( bamReader.GetNextAlignmentCore(al) ) { // skip if alignment is not paired, mapped, nor mate is mapped if ( !al.IsPaired() || !al.IsMapped() || !al.IsMateMapped() ) @@ -803,17 +932,6 @@ bool ResolveTool::ResolveToolPrivate::MakeStats(void) { // skip if alignment & mate not on same reference sequence if ( al.RefID != al.MateRefID ) continue; - // skip if map quality is 0 - if ( al.MapQuality == 0 ) continue; - - // skip if insert size is less than (we'll count its mate) - // or equal to zero (single-end or missing data) - if ( al.InsertSize <= 0 ) continue; - - // determine model type, skip if model unknown - const uint16_t currentModelType = CalculateModelType(al); - assert( currentModelType != ModelType::DUMMY_ID ); - // flesh out the char data, so we can retrieve its read group ID al.BuildCharData(); @@ -826,15 +944,45 @@ bool ResolveTool::ResolveToolPrivate::MakeStats(void) { if ( rgIter == m_readGroups.end() ) { cerr << "bamtools resolve ERROR - unable to calculate stats, unknown read group encountered: " << readGroup << endl; - reader.Close(); + bamReader.Close(); return false; } ReadGroupResolver& resolver = (*rgIter).second; - // update stats for current read group, current model type - resolver.Models[currentModelType].push_back(al.InsertSize); + // determine unique-ness of current alignment + const bool isCurrentMateUnique = ( al.MapQuality != 0 ); + + // look up read name + readNameIter = resolver.ReadNames.find(al.Name); + + // if read name found (current alignment's mate already parsed) + if ( readNameIter != resolver.ReadNames.end() ) { + + // if both unique mates are unique, store read name & insert size for later + const bool isStoredMateUnique = (*readNameIter).second; + if ( isCurrentMateUnique && isStoredMateUnique ) { + + // save read name in temp file as candidates for later pair marking + readNamesWriter.Write(readGroup, al.Name); + + // determine model type & store fragment length for stats calculation + const uint16_t currentModelType = CalculateModelType(al); + assert( currentModelType != ModelType::DUMMY_ID ); + resolver.Models[currentModelType].push_back( abs(al.InsertSize) ); + } + + // unique or not, remove read name from map + resolver.ReadNames.erase(readNameIter); + } + + // if read name not found, store new entry + else resolver.ReadNames.insert( make_pair(al.Name, isCurrentMateUnique) ); } + // close files + readNamesWriter.Close(); + bamReader.Close(); + // iterate back through read groups map::iterator rgEnd = m_readGroups.end(); for ( rgIter = m_readGroups.begin(); rgIter != rgEnd; ++rgIter ) { @@ -843,10 +991,13 @@ bool ResolveTool::ResolveToolPrivate::MakeStats(void) { // calculate acceptable orientation & insert sizes for this read group resolver.DetermineTopModels(name); + + // clear out left over read names + // (these have mates that did not pass filters or were already removed as non-unique) + resolver.ReadNames.clear(); } - // close reader & return success - reader.Close(); + // if we get here, return success return true; } @@ -923,12 +1074,41 @@ void ResolveTool::ResolveToolPrivate::ResolveAlignment(BamAlignment& al) { // quit check if pairs are not within "reasonable" distance (can differ for each RG) if ( !resolver.IsValidInsertSize(al) ) return; + // quit check if alignment is not a "candidate proper pair" + map::const_iterator readNameIter; + readNameIter = resolver.ReadNames.find(al.Name); + if ( readNameIter == resolver.ReadNames.end() ) + return; + // if we get here, alignment is OK - set 'proper pair' flag al.SetIsProperPair(true); } bool ResolveTool::ResolveToolPrivate::ResolvePairs(void) { + // open file containing read names of candidate proper pairs + ResolveTool::ReadNamesFileReader readNamesReader; + if ( !readNamesReader.Open(m_settings->ReadNamesFilename) ) { + cerr << "bamtools resolve ERROR: could not open (temp) inputput read names file: " + << m_settings->ReadNamesFilename << endl; + return false; + } + + // parse read names (matching with corresponding read groups) + if ( !readNamesReader.Read(m_readGroups) ) { + cerr << "bamtools resolve ERROR: could not read candidate read names from file: " + << m_settings->ReadNamesFilename << endl; + readNamesReader.Close(); + return false; + } + + // close read name file reader & delete temp file + readNamesReader.Close(); + if ( remove(m_settings->ReadNamesFilename.c_str()) != 0 ) { + cerr << "bamtools resolve WARNING: could not delete temp file: " + << m_settings->ReadNamesFilename << endl; + } + // open our BAM reader BamReader reader; if ( !reader.Open(m_settings->InputBamFilename) ) { @@ -987,6 +1167,12 @@ bool ResolveTool::ResolveToolPrivate::Run(void) { // initialize read group map with default (empty name) read group m_readGroups.insert( make_pair("", ReadGroupResolver()) ); + // init readname filename + // uses (adjusted) stats filename if provided (req'd for makeStats, markPairs modes; optional for twoPass) + // else keep default filename + if ( m_settings->HasStatsFile ) + m_settings->ReadNamesFilename = m_settings->StatsFilename + READNAME_FILE_SUFFIX; + // -makeStats mode if ( m_settings->IsMakeStats ) { diff --git a/src/toolkit/bamtools_resolve.h b/src/toolkit/bamtools_resolve.h index 109a89f..681a557 100644 --- a/src/toolkit/bamtools_resolve.h +++ b/src/toolkit/bamtools_resolve.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 11 June 2011 +// Last modified: 23 June 2011 // --------------------------------------------------------------------------- // Resolves paired-end reads (marking the IsProperPair flag as needed). // *************************************************************************** @@ -32,6 +32,8 @@ class ResolveTool : public AbstractTool { struct ResolveToolPrivate; ResolveToolPrivate* m_impl; + struct ReadNamesFileReader; + struct ReadNamesFileWriter; struct StatsFileReader; struct StatsFileWriter; }; -- 2.39.2