From: Derek Date: Thu, 16 Sep 2010 05:37:28 +0000 (-0400) Subject: Added new PileupEngine to the toolkit. This is used by CoverageTool as well as Conver... X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=5f5d20647f918069e0b6dbc055ea184c37bd1196;p=bamtools.git Added new PileupEngine to the toolkit. This is used by CoverageTool as well as ConvertTool for pileup format. Pileup conversion output before was buggy and overall incorrect. Now should match SAMtools output to the best of my knowledge --- diff --git a/src/toolkit/bamtools_convert.cpp b/src/toolkit/bamtools_convert.cpp index edfc83c..1929da6 100644 --- a/src/toolkit/bamtools_convert.cpp +++ b/src/toolkit/bamtools_convert.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 22 July 2010 +// Last modified: 16 September 2010 // --------------------------------------------------------------------------- // Converts between BAM and a number of other formats // *************************************************************************** @@ -13,21 +13,22 @@ #include #include #include - #include "bamtools_convert.h" +#include "bamtools_fasta.h" #include "bamtools_options.h" -#include "bamtools_pileup.h" +#include "bamtools_pileup_engine.h" #include "bamtools_utilities.h" #include "BGZF.h" -#include "BamReader.h" #include "BamMultiReader.h" - using namespace std; using namespace BamTools; namespace BamTools { - - // format names + + // --------------------------------------------- + // ConvertTool constants + + // supported conversion format command-line names static const string FORMAT_BED = "bed"; static const string FORMAT_BEDGRAPH = "bedgraph"; static const string FORMAT_FASTA = "fasta"; @@ -40,36 +41,36 @@ namespace BamTools { // other constants static const unsigned int FASTA_LINE_MAX = 50; -} // namespace BamTools - -struct ConvertTool::ConvertToolPrivate { - - // ctor & dtor - public: - ConvertToolPrivate(ConvertTool::ConvertSettings* settings); - ~ConvertToolPrivate(void); + // --------------------------------------------- + // ConvertPileupFormatVisitor declaration - // interface - public: - bool Run(void); + class ConvertPileupFormatVisitor : public PileupVisitor { + + // ctor & dtor + public: + ConvertPileupFormatVisitor(const RefVector& references, + const string& fastaFilename, + const bool isPrintingMapQualities, + ostream* out); + ~ConvertPileupFormatVisitor(void); + + // PileupVisitor interface implementation + public: +// void Visit(const int& refId, const int& position, const vector& alignments); + + void Visit(const PileupPosition& pileupData); + + // data members + private: + Fasta m_fasta; + bool m_hasFasta; + bool m_isPrintingMapQualities; + ostream* m_out; + RefVector m_references; + }; - // internal methods - private: - void PrintBed(const BamAlignment& a); - void PrintBedGraph(const BamAlignment& a); - void PrintFasta(const BamAlignment& a); - void PrintFastq(const BamAlignment& a); - void PrintJson(const BamAlignment& a); - void PrintSam(const BamAlignment& a); - void PrintWiggle(const BamAlignment& a); - - // data members - private: - ConvertTool::ConvertSettings* m_settings; - RefVector m_references; - ostream m_out; -}; - +} // namespace BamTools + // --------------------------------------------- // ConvertSettings implementation @@ -105,66 +106,43 @@ struct ConvertTool::ConvertSettings { , IsOmittingSamHeader(false) , IsPrintingPileupMapQualities(false) , OutputFilename(Options::StandardOut()) + , FastaFilename("") { } -}; +}; // --------------------------------------------- -// ConvertTool implementation - -ConvertTool::ConvertTool(void) - : AbstractTool() - , m_settings(new ConvertSettings) - , m_impl(0) -{ - // set program details - Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats", "-format [-in -in ...] [-out ] [other options]"); - - // set up options - OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); - Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts); - - OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); - Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as .bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts); - - OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options"); - Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "", m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts, ""); - Options::AddOption("-mapqual", "print the mapping qualities", m_settings->IsPrintingPileupMapQualities, PileupOpts); - - OptionGroup* SamOpts = Options::CreateOptionGroup("SAM Options"); - Options::AddOption("-noheader", "omit the SAM header from output", m_settings->IsOmittingSamHeader, SamOpts); -} - -ConvertTool::~ConvertTool(void) { - delete m_settings; - m_settings = 0; - - delete m_impl; - m_impl = 0; -} - -int ConvertTool::Help(void) { - Options::DisplayHelp(); - return 0; -} - -int ConvertTool::Run(int argc, char* argv[]) { +// ConvertToolPrivate implementation - // parse command line arguments - Options::Parse(argc, argv, 1); - - // run internal ConvertTool implementation, return success/fail - m_impl = new ConvertToolPrivate(m_settings); +struct ConvertTool::ConvertToolPrivate { + + // ctor & dtor + public: + ConvertToolPrivate(ConvertTool::ConvertSettings* settings); + ~ConvertToolPrivate(void); - if ( m_impl->Run() ) - return 0; - else - return 1; -} - -// --------------------------------------------- -// ConvertToolPrivate implementation + // interface + public: + bool Run(void); + + // internal methods + private: + void PrintBed(const BamAlignment& a); + void PrintBedGraph(const BamAlignment& a); + void PrintFasta(const BamAlignment& a); + void PrintFastq(const BamAlignment& a); + void PrintJson(const BamAlignment& a); + void PrintSam(const BamAlignment& a); + void PrintWiggle(const BamAlignment& a); + + // special case - uses the PileupEngine + bool RunPileupConversion(BamMultiReader* reader); + + // data members + private: + ConvertTool::ConvertSettings* m_settings; + RefVector m_references; + ostream m_out; +}; ConvertTool::ConvertToolPrivate::ConvertToolPrivate(ConvertTool::ConvertSettings* settings) : m_settings(settings) @@ -175,8 +153,6 @@ ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { } bool ConvertTool::ConvertToolPrivate::Run(void) { - bool convertedOk = true; - // ------------------------------------ // initialize conversion input/output @@ -193,8 +169,13 @@ bool ConvertTool::ConvertToolPrivate::Run(void) { BamRegion region; if ( m_settings->HasRegion ) { if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - if ( !reader.SetRegion(region) ) - cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl; + if ( !reader.SetRegion(region) ) { + cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl; + return false; + } + } else { + cerr << "Could not parse REGION: " << m_settings->Region << endl; + return false; } } @@ -213,33 +194,22 @@ bool ConvertTool::ConvertToolPrivate::Run(void) { m_out.rdbuf(outFile.rdbuf()); } - // ------------------------ + // ------------------------------------- + // do conversion based on format + + bool convertedOk = true; + // pileup is special case - if ( m_settings->Format == FORMAT_PILEUP ) { - - // initialize pileup input/output - Pileup pileup(&reader, &m_out); - - // --------------------------- - // configure pileup settings - - if ( m_settings->HasRegion ) - pileup.SetRegion(region); - - if ( m_settings->HasFastaFilename ) - pileup.SetFastaFilename(m_settings->FastaFilename); - - pileup.SetIsPrintingMapQualities( m_settings->IsPrintingPileupMapQualities ); - - // run pileup - convertedOk = pileup.Run(); - } + // conversion not done per alignment, like the other formats + if ( m_settings->Format == FORMAT_PILEUP ) + convertedOk = RunPileupConversion(&reader); - // ------------------------------------- - // else determine 'simpler' format type + // all other formats else { bool formatError = false; + + // set function pointer to proper conversion method void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0; if ( m_settings->Format == FORMAT_BED ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed; else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph; @@ -255,24 +225,26 @@ bool ConvertTool::ConvertToolPrivate::Run(void) { convertedOk = false; } - // if SAM format & not omitting header, print SAM header - if ( (m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader ) { - string headerText = reader.GetHeaderText(); - m_out << headerText; - } - - // ------------------------ - // do conversion + // if format selected ok if ( !formatError ) { + + // if SAM format & not omitting header, print SAM header first + if ( (m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader ) + m_out << reader.GetHeaderText(); + + // iterate through file, doing conversion BamAlignment a; - while ( reader.GetNextAlignment(a) ) { - (this->*pFunction)(a); - } + while ( reader.GetNextAlignment(a) ) + (this->*pFunction)(a); + + // set flag for successful conversion + convertedOk = true; } } // ------------------------ // clean up & exit + reader.Close(); if ( m_settings->HasOutput ) outFile.close(); return convertedOk; @@ -282,7 +254,7 @@ bool ConvertTool::ConvertToolPrivate::Run(void) { // Conversion/output methods // ---------------------------------------------------------- -void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a) { +void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a) { // tab-delimited, 0-based half-open // (e.g. a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) ) @@ -485,7 +457,6 @@ void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) { } m_out << "}" << endl; - } // print BamAlignment in SAM format @@ -617,3 +588,236 @@ void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) { void ConvertTool::ConvertToolPrivate::PrintWiggle(const BamAlignment& a) { ; } + +bool ConvertTool::ConvertToolPrivate::RunPileupConversion(BamMultiReader* reader) { + + // check for valid BamMultiReader + if ( reader == 0 ) return false; + + // set up our pileup format 'visitor' + ConvertPileupFormatVisitor* v = new ConvertPileupFormatVisitor(m_references, + m_settings->FastaFilename, + m_settings->IsPrintingPileupMapQualities, + &m_out); + + // set up PileupEngine + PileupEngine pileup; + pileup.AddVisitor(v); + + // iterate through data + BamAlignment al; + while ( reader->GetNextAlignment(al) ) + pileup.AddAlignment(al); + pileup.Flush(); + + // clean up + delete v; + v = 0; + + // return success + return true; +} + +// --------------------------------------------- +// ConvertTool implementation + +ConvertTool::ConvertTool(void) + : AbstractTool() + , m_settings(new ConvertSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats", "-format [-in -in ...] [-out ] [other options]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); + Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts); + + OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); + Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as .bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts); + + OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options"); + Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "", m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts, ""); + Options::AddOption("-mapqual", "print the mapping qualities", m_settings->IsPrintingPileupMapQualities, PileupOpts); + + OptionGroup* SamOpts = Options::CreateOptionGroup("SAM Options"); + Options::AddOption("-noheader", "omit the SAM header from output", m_settings->IsOmittingSamHeader, SamOpts); +} + +ConvertTool::~ConvertTool(void) { + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int ConvertTool::Help(void) { + Options::DisplayHelp(); + return 0; +} + +int ConvertTool::Run(int argc, char* argv[]) { + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // run internal ConvertTool implementation, return success/fail + m_impl = new ConvertToolPrivate(m_settings); + + if ( m_impl->Run() ) + return 0; + else + return 1; +} + +// --------------------------------------------- +// ConvertPileupFormatVisitor implementation + +ConvertPileupFormatVisitor::ConvertPileupFormatVisitor(const RefVector& references, + const string& fastaFilename, + const bool isPrintingMapQualities, + ostream* out) + : PileupVisitor() + , m_hasFasta(false) + , m_isPrintingMapQualities(isPrintingMapQualities) + , m_out(out) + , m_references(references) +{ + // set up Fasta reader if file is provided + if ( !fastaFilename.empty() ) { + + // check for FASTA index + string indexFilename = ""; + if ( Utilities::FileExists(fastaFilename + ".fai") ) + indexFilename = fastaFilename + ".fai"; + + // open FASTA file + if ( m_fasta.Open(fastaFilename, indexFilename) ) + m_hasFasta = true; + } +} + +ConvertPileupFormatVisitor::~ConvertPileupFormatVisitor(void) { + // be sure to close Fasta reader + if ( m_hasFasta ) { + m_fasta.Close(); + m_hasFasta = false; + } +} + +void ConvertPileupFormatVisitor::Visit(const PileupPosition& pileupData ) { + + // skip if no alignments at this position + if ( pileupData.PileupAlignments.empty() ) return; + + // retrieve reference name + const string& referenceName = m_references[pileupData.RefId].RefName; + const int& position = pileupData.Position; + + // retrieve reference base from FASTA file, if one provided; otherwise default to 'N' + char referenceBase('N'); + if ( m_hasFasta && (pileupData.Position < m_references[pileupData.RefId].RefLength) ) { + if ( !m_fasta.GetBase(pileupData.RefId, pileupData.Position, referenceBase ) ) { + cerr << "Pileup error : Could not read reference base from FASTA file" << endl; + return; + } + } + + // get count of alleles at this position + const int numberAlleles = pileupData.PileupAlignments.size(); + + // ----------------------------------------------------------- + // build strings based on alleles at this positionInAlignment + + stringstream bases(""); + stringstream baseQualities(""); + stringstream mapQualities(""); + + // iterate over alignments at this pileup position + vector::const_iterator pileupIter = pileupData.PileupAlignments.begin(); + vector::const_iterator pileupEnd = pileupData.PileupAlignments.end(); + for ( ; pileupIter != pileupEnd; ++pileupIter ) { + const PileupAlignment pa = (*pileupIter); + const BamAlignment& ba = pa.Alignment; + + // if beginning of read segment + if ( pa.IsSegmentBegin ) + bases << '^' << ( ((int)ba.MapQuality > 93) ? (char)126 : (char)((int)ba.MapQuality+33) ); + + // if current base is not a DELETION + if ( !pa.IsCurrentDeletion ) { + + // get base at current position + char base = ba.QueryBases.at(pa.PositionInAlignment); + + // if base matches reference + if ( base == '=' || + toupper(base) == toupper(referenceBase) || + tolower(base) == tolower(referenceBase) ) + { + base = (ba.IsReverseStrand() ? ',' : '.' ); + } + + // mismatches reference + else base = (ba.IsReverseStrand() ? tolower(base) : toupper(base) ); + + // store base + bases << base; + + // if next position contains insertion + if ( pa.IsNextInsertion ) { + bases << '+' << pa.InsertionLength; + for (int i = 1; i <= pa.InsertionLength; ++i) { + char insertedBase = (char)ba.QueryBases.at(pa.PositionInAlignment + i); + bases << (ba.IsReverseStrand() ? (char)tolower(insertedBase) : (char)toupper(insertedBase) ); + } + } + + // if next position contains DELETION + else if ( pa.IsNextDeletion ) { + bases << '-' << pa.DeletionLength; + for (int i = 1; i <= pa.DeletionLength; ++i) { + char deletedBase('N'); + if ( m_hasFasta && (pileupData.Position+i < m_references[pileupData.RefId].RefLength) ) { + if ( !m_fasta.GetBase(pileupData.RefId, pileupData.Position+i, deletedBase ) ) { + cerr << "Pileup error : Could not read reference base from FASTA file" << endl; + return; + } + } + bases << (ba.IsReverseStrand() ? (char)tolower(deletedBase) : (char)toupper(deletedBase) ); + } + } + } + + // otherwise, DELETION + else bases << '*'; + + // if end of read segment + if ( pa.IsSegmentEnd ) bases << '$'; + + // store current base quality + baseQualities << ba.Qualities.at(pa.PositionInAlignment); + + // save alignment map quality if desired + if ( m_isPrintingMapQualities ) + mapQualities << ( ((int)ba.MapQuality > 93) ? (char)126 : (char)((int)ba.MapQuality+33) ); + } + + // ---------------------- + // print results + + // tab-delimited + // <1-based pos> [mapQuals] + + const string TAB = "\t"; + *m_out << referenceName << TAB + << position + 1 << TAB + << referenceBase << TAB + << numberAlleles << TAB + << bases.str() << TAB + << baseQualities.str() << TAB + << mapQualities.str() << endl; +} diff --git a/src/toolkit/bamtools_coverage.cpp b/src/toolkit/bamtools_coverage.cpp index b587445..5924edb 100644 --- a/src/toolkit/bamtools_coverage.cpp +++ b/src/toolkit/bamtools_coverage.cpp @@ -3,61 +3,174 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 1 June 2010 +// Last modified: 16 September 2010 // --------------------------------------------------------------------------- -// Prints coverage statistics for a single BAM file -// -// ** Expand to multiple?? -// +// Prints coverage data for a single BAM file // *************************************************************************** #include +#include #include #include - #include "bamtools_coverage.h" #include "bamtools_options.h" +#include "bamtools_pileup_engine.h" #include "BamReader.h" - using namespace std; using namespace BamTools; +namespace BamTools { + +// --------------------------------------------- +// CoverageVisitor implementation + +class CoverageVisitor : public PileupVisitor { + + public: + CoverageVisitor(const RefVector& references, ostream* out) + : PileupVisitor() + , m_references(references) + , m_out(out) + { } + ~CoverageVisitor(void) { } + + // PileupVisitor interface implementation + public: +// void Visit(const int& refId, const int& position, const vector& alignments) { + void Visit(const PileupPosition& pileupData) { + // ----------------------------------------- + // print coverage results ( tab-delimited ) + // <0-based pos> +// *m_out << m_references[refId].RefName << "\t" << position << "\t" << alignments.size() << endl; + *m_out << m_references[pileupData.RefId].RefName << "\t" + << pileupData.Position << "\t" + << pileupData.PileupAlignments.size() << endl; + } + + private: + RefVector m_references; + ostream* m_out; +}; + +} // namespace BamTools + // --------------------------------------------- // CoverageSettings implementation struct CoverageTool::CoverageSettings { // flags - bool HasInputBamFilename; + bool HasInputFile; + bool HasOutputFile; // filenames - std::string InputBamFilename; + string InputBamFilename; + string OutputFilename; // constructor CoverageSettings(void) - : HasInputBamFilename(false) + : HasInputFile(false) + , HasOutputFile(false) , InputBamFilename(Options::StandardIn()) + , OutputFilename(Options::StandardOut()) { } }; +// --------------------------------------------- +// CoverageToolPrivate implementation + +struct CoverageTool::CoverageToolPrivate { + + // ctor & dtor + public: + CoverageToolPrivate(CoverageTool::CoverageSettings* settings); + ~CoverageToolPrivate(void); + + // interface + public: + bool Run(void); + + // data members + private: + CoverageTool::CoverageSettings* m_settings; + ostream m_out; + RefVector m_references; +}; + +CoverageTool::CoverageToolPrivate::CoverageToolPrivate(CoverageTool::CoverageSettings* settings) + : m_settings(settings) + , m_out(cout.rdbuf()) // default output to cout +{ } + +CoverageTool::CoverageToolPrivate::~CoverageToolPrivate(void) { } + +bool CoverageTool::CoverageToolPrivate::Run(void) { + + // if output filename given + ofstream outFile; + if ( m_settings->HasOutputFile ) { + + // open output file stream + outFile.open(m_settings->OutputFilename.c_str()); + if ( !outFile ) { + cerr << "Could not open " << m_settings->OutputFilename << " for output." << endl; + return false; + } + + // set m_out to file's streambuf + m_out.rdbuf(outFile.rdbuf()); + } + + //open our BAM reader + BamReader reader; + reader.Open(m_settings->InputBamFilename); + m_references = reader.GetReferenceData(); + + // set up our output 'visitor' + CoverageVisitor* cv = new CoverageVisitor(m_references, &m_out); + + // set up pileup engine with 'visitor' + PileupEngine pileup; + pileup.AddVisitor(cv); + + // process input data + BamAlignment al; + while ( reader.GetNextAlignment(al) ) + pileup.AddAlignment(al); + + // clean up + reader.Close(); + if ( m_settings->HasOutputFile ) outFile.close(); + delete cv; + cv = 0; + + // return success + return true; +} + // --------------------------------------------- // CoverageTool implementation CoverageTool::CoverageTool(void) : AbstractTool() , m_settings(new CoverageSettings) + , m_impl(0) { // set program details - Options::SetProgramInfo("bamtools coverage", "prints coverage stats for a BAM file", "-in "); + Options::SetProgramInfo("bamtools coverage", "prints coverage data for a single BAM file", "[-in ] [-out ]"); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFile, m_settings->InputBamFilename, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-out", "filename", "the output file", "", m_settings->HasOutputFile, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); } CoverageTool::~CoverageTool(void) { delete m_settings; m_settings = 0; + + delete m_impl; + m_impl = 0; } int CoverageTool::Help(void) { @@ -69,16 +182,12 @@ int CoverageTool::Run(int argc, char* argv[]) { // parse command line arguments Options::Parse(argc, argv, 1); - - //open our BAM reader - BamReader reader; - reader.Open(m_settings->InputBamFilename); - - // generate coverage stats - cerr << "Generating coverage stats for " << m_settings->InputBamFilename << endl; - cerr << "FEATURE NOT YET IMPLEMENTED!" << endl; - - // clean & exit - reader.Close(); - return 0; -} \ No newline at end of file + + // run internal ConvertTool implementation, return success/fail + m_impl = new CoverageToolPrivate(m_settings); + + if ( m_impl->Run() ) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_coverage.h b/src/toolkit/bamtools_coverage.h index c3714c0..e9e8a71 100644 --- a/src/toolkit/bamtools_coverage.h +++ b/src/toolkit/bamtools_coverage.h @@ -3,12 +3,9 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 1 June 2010 +// Last modified: 1 August 2010 // --------------------------------------------------------------------------- -// Prints coverage statistics for a single BAM file -// -// ** Expand to multiple?? -// +// Prints coverage data for a single BAM file // *************************************************************************** #ifndef BAMTOOLS_COVERAGE_H @@ -31,6 +28,9 @@ class CoverageTool : public AbstractTool { private: struct CoverageSettings; CoverageSettings* m_settings; + + struct CoverageToolPrivate; + CoverageToolPrivate* m_impl; }; } // namespace BamTools diff --git a/src/utils/Makefile b/src/utils/Makefile index 7e13cdd..d56bfe0 100644 --- a/src/utils/Makefile +++ b/src/utils/Makefile @@ -17,7 +17,7 @@ INCLUDES = -I$(API_DIR)/ SOURCES = bamtools_fasta.cpp \ bamtools_filter_engine.cpp \ bamtools_options.cpp \ - bamtools_pileup.cpp \ + bamtools_pileup_engine.cpp \ bamtools_utilities.cpp OBJECTS= $(SOURCES:.cpp=.o) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) diff --git a/src/utils/bamtools_pileup.cpp b/src/utils/bamtools_pileup.cpp deleted file mode 100644 index 1862289..0000000 --- a/src/utils/bamtools_pileup.cpp +++ /dev/null @@ -1,231 +0,0 @@ -// *************************************************************************** -// bamtools_pileup.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 13 July 2010 -// --------------------------------------------------------------------------- -// Provides pileup conversion functionality. -// -// The 'assembly' aspect of pileup makes this more complicated than the -// simpler one-to-one conversion methods for other formats. -// *************************************************************************** - -#include -#include "BamMultiReader.h" -#include "bamtools_pileup.h" -using namespace std; -using namespace BamTools; - -struct Pileup::PileupPrivate { - - // --------------------- - // data members - - // IO & settings - BamMultiReader* Reader; - ostream* OutStream; - string FastaFilename; - bool IsPrintingMapQualities; - BamRegion Region; - - // parsing data - int CurrentId; - int CurrentPosition; - vector CurrentData; - RefVector References; - - // ---------------------- - // ctor - - PileupPrivate(BamMultiReader* reader, ostream* outStream) - : Reader(reader) - , OutStream(outStream) - , FastaFilename("") - , IsPrintingMapQualities(false) - { } - - // ---------------------- - // internal methods - - void PrintCurrentData(void); - bool Run(void); -}; - -void Pileup::PileupPrivate::PrintCurrentData(void) { - - // remove any data that ends before CurrentPosition - size_t i = 0; - while ( i < CurrentData.size() ) { - if ( CurrentData[i].GetEndPosition() < CurrentPosition ) - CurrentData.erase(CurrentData.begin() + i); - else - ++i; - } - - // if not data remains, return - if ( CurrentData.empty() ) return; - - // initialize empty strings - string bases = ""; - string baseQuals = ""; - string mapQuals = ""; - - // iterate over alignments - vector::const_iterator dataIter = CurrentData.begin(); - vector::const_iterator dataEnd = CurrentData.end(); - for ( ; dataIter != dataEnd; ++dataIter ) { - - // retrieve alignment - const BamAlignment& al = (*dataIter); - - // determine current base character & store - const char base = al.AlignedBases[CurrentPosition -al.Position]; - if ( al.IsReverseStrand() ) - bases.push_back( tolower(base) ); - else - bases.push_back( toupper(base) ); - - // determine current base quality & store - baseQuals.push_back( al.Qualities[CurrentPosition - al.Position] ); - - // if using mapQuals, determine current mapQual & store - if ( IsPrintingMapQualities ) { - int mapQuality = (int)(al.MapQuality + 33); - if ( mapQuality > 126 ) mapQuality = 126; - mapQuals.push_back((char)mapQuality); - } - } - - // print results to OutStream - const string& refName = References[CurrentId].RefName; - const char refBase = 'N'; - - *OutStream << refName << "\t" << CurrentPosition << "\t" << refBase << "\t" << CurrentData.size() << "\t" << bases << "\t" << baseQuals; - if ( IsPrintingMapQualities ) *OutStream << "\t" << mapQuals; - *OutStream << endl; -} - -bool Pileup::PileupPrivate::Run(void) { - - // ----------------------------- - // validate input & output - - if ( !Reader ) { - cerr << "Pileup::Run() : Invalid multireader" << endl; - return false; - } - - if ( !OutStream) { - cerr << "Pileup::Run() : Invalid output stream" << endl; - return false; - } - - References = Reader->GetReferenceData(); - - // ----------------------------- - // process input data - - // get first entry - BamAlignment al; - if ( !Reader->GetNextAlignment(al) ) { - cerr << "Pileup::Run() : Could not read from multireader" << endl; - return false; - } - - // set initial markers & store first entry - CurrentId = al.RefID; - CurrentPosition = al.Position; - CurrentData.clear(); - CurrentData.push_back(al); - - // iterate over remaining data - while ( Reader->GetNextAlignment(al) ) { - - // if same reference - if ( al.RefID == CurrentId ) { - - // if same position, store and move on - if ( al.Position == CurrentPosition ) - CurrentData.push_back(al); - - // if less than CurrentPosition - sorting error => ABORT - else if ( al.Position < CurrentPosition ) { - cerr << "Pileup::Run() : Data not sorted correctly!" << endl; - return false; - } - - // else print pileup data until 'catching up' to CurrentPosition - else { - while ( al.Position > CurrentPosition ) { - PrintCurrentData(); - ++CurrentPosition; - } - CurrentData.push_back(al); - } - } - - // if reference ID less than CurrentID - sorting error => ABORT - else if ( al.RefID < CurrentId ) { - cerr << "Pileup::Run() : Data not sorted correctly!" << endl; - return false; - } - - // else moved forward onto next reference - else { - - // print any remaining pileup data from previous reference - while ( !CurrentData.empty() ) { - PrintCurrentData(); - ++CurrentPosition; - } - - // store first entry on this new reference, update markers - CurrentData.clear(); - CurrentData.push_back(al); - CurrentId = al.RefID; - CurrentPosition = al.Position; - } - } - - // ------------------------------------ - // handle any remaining data entries - - while ( !CurrentData.empty() ) { - PrintCurrentData(); - ++CurrentPosition; - } - - // ------------------------- - // return success - - return true; -} - -// ---------------------------------------------------------- -// Pileup implementation - -Pileup::Pileup(BamMultiReader* reader, ostream* outStream) { - d = new PileupPrivate(reader, outStream); -} - -Pileup::~Pileup(void) { - delete d; - d = 0; -} - -bool Pileup::Run(void) { - return d->Run(); -} - -void Pileup::SetFastaFilename(const string& filename) { - d->FastaFilename = filename; -} - -void Pileup::SetIsPrintingMapQualities(bool ok) { - d->IsPrintingMapQualities = ok; -} - -void Pileup::SetRegion(const BamRegion& region) { - d->Region = region; -} diff --git a/src/utils/bamtools_pileup.h b/src/utils/bamtools_pileup.h deleted file mode 100644 index 3ffb030..0000000 --- a/src/utils/bamtools_pileup.h +++ /dev/null @@ -1,44 +0,0 @@ -// *************************************************************************** -// bamtools_pileup.h (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 13 July 2010 -// --------------------------------------------------------------------------- -// Provides pileup conversion functionality. -// -// The 'assembly' aspect of pileup makes this more complicated than the -// simpler one-to-one conversion methods for other formats. -// *************************************************************************** - -#ifndef BAMTOOLS_PILEUP_H -#define BAMTOOLS_PILEUP_H - -#include -#include - -namespace BamTools { - -class BamMultiReader; -class BamRegion; - -class Pileup { - - public: - Pileup(BamMultiReader* reader, std::ostream* outStream); - ~Pileup(void); - - public: - bool Run(void); - void SetFastaFilename(const std::string& filename); - void SetIsPrintingMapQualities(bool ok); - void SetRegion(const BamRegion& region); - - private: - struct PileupPrivate; - PileupPrivate* d; -}; - -} // namespace BamTools - -#endif // BAMTOOLS_PILEUP_H \ No newline at end of file diff --git a/src/utils/bamtools_pileup_engine.cpp b/src/utils/bamtools_pileup_engine.cpp new file mode 100644 index 0000000..70bace8 --- /dev/null +++ b/src/utils/bamtools_pileup_engine.cpp @@ -0,0 +1,327 @@ +// *************************************************************************** +// bamtools_pileup_engine.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 16 September 2010 +// --------------------------------------------------------------------------- +// Provides pileup at position functionality for various tools. +// *************************************************************************** + +#include +#include "bamtools_pileup_engine.h" +using namespace std; +using namespace BamTools; + +// --------------------------------------------- +// PileupEnginePrivate implementation + +struct PileupEngine::PileupEnginePrivate { + + // data members + int CurrentId; + int CurrentPosition; + vector CurrentAlignments; + PileupPosition CurrentPileupData; + + bool IsFirstAlignment; + vector Visitors; + + // ctor & dtor + PileupEnginePrivate(void) + : CurrentId(-1) + , CurrentPosition(-1) + , IsFirstAlignment(true) + { } + ~PileupEnginePrivate(void) { } + + // 'public' methods + bool AddAlignment(const BamAlignment& al); + void Flush(void); + + // internal methods + private: + void ApplyVisitors(void); + void ClearOldData(void); + void CreatePileupData(void); + void ParseAlignmentCigar(const BamAlignment& al); +}; + +bool PileupEngine::PileupEnginePrivate::AddAlignment(const BamAlignment& al) { + + // if first time + if ( IsFirstAlignment ) { + + // set initial markers + CurrentId = al.RefID; + CurrentPosition = al.Position; + + // store first entry + CurrentAlignments.clear(); + CurrentAlignments.push_back(al); + + // set flag & return + IsFirstAlignment = false; + return true; + } + + // if same reference + if ( al.RefID == CurrentId ) { + + // if same position, store and move on + if ( al.Position == CurrentPosition ) + CurrentAlignments.push_back(al); + + // if less than CurrentPosition - sorting error => ABORT + else if ( al.Position < CurrentPosition ) { + cerr << "Pileup::Run() : Data not sorted correctly!" << endl; + return false; + } + + // else print pileup data until 'catching up' to CurrentPosition + else { + while ( al.Position > CurrentPosition ) { + ApplyVisitors(); + ++CurrentPosition; + } + CurrentAlignments.push_back(al); + } + } + + // if reference ID less than CurrentId - sorting error => ABORT + else if ( al.RefID < CurrentId ) { + cerr << "Pileup::Run() : Data not sorted correctly!" << endl; + return false; + } + + // else moved forward onto next reference + else { + + // print any remaining pileup data from previous reference + while ( !CurrentAlignments.empty() ) { + ApplyVisitors(); + ++CurrentPosition; + } + + // store first entry on this new reference, update markers + CurrentAlignments.clear(); + CurrentAlignments.push_back(al); + CurrentId = al.RefID; + CurrentPosition = al.Position; + } + + return true; +} + +void PileupEngine::PileupEnginePrivate::ApplyVisitors(void) { + + // parse CIGAR data in BamAlignments to build up current pileup data + CreatePileupData(); + + // apply all visitors to current alignment set + vector::const_iterator visitorIter = Visitors.begin(); + vector::const_iterator visitorEnd = Visitors.end(); + for ( ; visitorIter != visitorEnd; ++visitorIter ) + (*visitorIter)->Visit(CurrentPileupData); +} + +void PileupEngine::PileupEnginePrivate::ClearOldData(void) { + + // remove any data that ends before CurrentPosition + size_t i = 0; + while ( i < CurrentAlignments.size() ) { + + // remove alignment if it ends before CurrentPosition + const int endPosition = CurrentAlignments[i].GetEndPosition(); + if ( endPosition < CurrentPosition ) + CurrentAlignments.erase(CurrentAlignments.begin() + i); + else + ++i; + } +} + +void PileupEngine::PileupEnginePrivate::CreatePileupData(void) { + + // remove any non-overlapping alignments + ClearOldData(); + + // set pileup refId, position to current markers + CurrentPileupData.RefId = CurrentId; + CurrentPileupData.Position = CurrentPosition; + CurrentPileupData.PileupAlignments.clear(); + + // parse CIGAR data in remaining alignments + vector::const_iterator alIter = CurrentAlignments.begin(); + vector::const_iterator alEnd = CurrentAlignments.end(); + for ( ; alIter != alEnd; ++alIter ) + ParseAlignmentCigar( (*alIter) ); +} + +void PileupEngine::PileupEnginePrivate::Flush(void) { + while ( !CurrentAlignments.empty() ) { + ApplyVisitors(); + ++CurrentPosition; + } +} + +void PileupEngine::PileupEnginePrivate::ParseAlignmentCigar(const BamAlignment& al) { + + // skip if unmapped + if ( !al.IsMapped() ) return; + + // intialize local variables + int genomePosition = al.Position; + int positionInAlignment = 0; + bool isNewReadSegment = true; + bool saveAlignment = true; + PileupAlignment pileupAlignment(al); + + // iterate over CIGAR operations + const int numCigarOps = (const int)al.CigarData.size(); + for (int i = 0; i < numCigarOps; ++i ) { + const CigarOp& op = al.CigarData.at(i); + + // if op is MATCH + if ( op.Type == 'M' ) { + + // if match op overlaps current position + if ( genomePosition + (int)op.Length > CurrentPosition ) { + + // set pileup data + pileupAlignment.IsCurrentDeletion = false; + pileupAlignment.IsNextDeletion = false; + pileupAlignment.IsNextInsertion = false; + pileupAlignment.PositionInAlignment = positionInAlignment + (CurrentPosition - genomePosition); + + // check for beginning of read segment + if ( genomePosition == CurrentPosition && isNewReadSegment ) + pileupAlignment.IsSegmentBegin = true; + + // if we're at the end of a match operation + if ( genomePosition + (int)op.Length - 1 == CurrentPosition ) { + + // if not last operation + if ( i < numCigarOps - 1 ) { + + // check next CIGAR op + const CigarOp& nextOp = al.CigarData.at(i+1); + + // if next CIGAR op is DELETION + if ( nextOp.Type == 'D') { + pileupAlignment.IsNextDeletion = true; + pileupAlignment.DeletionLength = nextOp.Length; + } + + // if next CIGAR op is INSERTION + else if ( nextOp.Type == 'I' ) { + pileupAlignment.IsNextInsertion = true; + pileupAlignment.InsertionLength = nextOp.Length; + } + + // if next CIGAR op is either DELETION or INSERTION + if ( nextOp.Type == 'D' || nextOp.Type == 'I' ) { + + // if there is a CIGAR op after the DEL/INS + if ( i < numCigarOps - 2 ) { + const CigarOp& nextNextOp = al.CigarData.at(i+2); + + // if next CIGAR op is clipping or ref_skip + if ( nextNextOp.Type == 'S' || + nextNextOp.Type == 'N' || + nextNextOp.Type == 'H' ) + pileupAlignment.IsSegmentEnd = true; + } + else { + pileupAlignment.IsSegmentEnd = true; + + // if next CIGAR op is clipping or ref_skip + if ( nextOp.Type == 'S' || + nextOp.Type == 'N' || + nextOp.Type == 'H' ) + pileupAlignment.IsSegmentEnd = true; + } + } + + // otherwise + else { + + // if next CIGAR op is clipping or ref_skip + if ( nextOp.Type == 'S' || + nextOp.Type == 'N' || + nextOp.Type == 'H' ) + pileupAlignment.IsSegmentEnd = true; + } + } + + // else this is last operation + else pileupAlignment.IsSegmentEnd = true; + } + } + + // increment markers + genomePosition += op.Length; + positionInAlignment += op.Length; + } + + // if op is DELETION + else if ( op.Type == 'D' ) { + + // if deletion op overlaps current position + if ( genomePosition + (int)op.Length > CurrentPosition ) { + + // set pileup data + pileupAlignment.IsCurrentDeletion = true; + pileupAlignment.IsNextDeletion = false; + pileupAlignment.IsNextInsertion = true; + pileupAlignment.PositionInAlignment = positionInAlignment + (CurrentPosition - genomePosition); + } + + // increment marker + genomePosition += op.Length; + } + + // if op is REF_SKIP + else if ( op.Type == 'N' ) { + genomePosition += op.Length; + } + + // if op is INSERTION or SOFT_CLIP + else if ( op.Type == 'I' || op.Type == 'S' ) { + positionInAlignment += op.Length; + } + + // checl for beginning of new read segment + if ( op.Type == 'N' || + op.Type == 'S' || + op.Type == 'H' ) + isNewReadSegment = true; + else + isNewReadSegment = false; + + // if we've moved beyond current position + if ( genomePosition > CurrentPosition ) { + if ( op.Type == 'N' ) saveAlignment = false; // ignore alignment if REF_SKIP + break; + } + } + + // save pileup position if flag is true + if ( saveAlignment ) + CurrentPileupData.PileupAlignments.push_back( pileupAlignment ); +} + +// --------------------------------------------- +// PileupEngine implementation + +PileupEngine::PileupEngine(void) + : d( new PileupEnginePrivate ) +{ } + +PileupEngine::~PileupEngine(void) { + delete d; + d = 0; +} + +bool PileupEngine::AddAlignment(const BamAlignment& al) { return d->AddAlignment(al); } +void PileupEngine::AddVisitor(PileupVisitor* visitor) { d->Visitors.push_back(visitor); } +void PileupEngine::Flush(void) { d->Flush(); } diff --git a/src/utils/bamtools_pileup_engine.h b/src/utils/bamtools_pileup_engine.h new file mode 100644 index 0000000..675b8e6 --- /dev/null +++ b/src/utils/bamtools_pileup_engine.h @@ -0,0 +1,94 @@ +// *************************************************************************** +// bamtools_pileup_engine.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 16 September 2010 +// --------------------------------------------------------------------------- +// Provides pileup at position functionality for various tools. +// *************************************************************************** + +#ifndef BAMTOOLS_PILEUP_ENGINE_H +#define BAMTOOLS_PILEUP_ENGINE_H + +#include +#include "BamAux.h" + +namespace BamTools { + +// contains auxiliary data about a single BamAlignment +// at current position considered +struct PileupAlignment { + + // data members + BamAlignment Alignment; + int32_t PositionInAlignment; + bool IsCurrentDeletion; + bool IsNextDeletion; + bool IsNextInsertion; + int DeletionLength; + int InsertionLength; + bool IsSegmentBegin; + bool IsSegmentEnd; + + // ctor + PileupAlignment(const BamAlignment& al) + : Alignment(al) + , PositionInAlignment(-1) + , IsCurrentDeletion(false) + , IsNextDeletion(false) + , IsNextInsertion(false) + , DeletionLength(0) + , InsertionLength(0) + , IsSegmentBegin(false) + , IsSegmentEnd(false) + { } +}; + +// contains all data at a position +struct PileupPosition { + + // data members + int RefId; + int Position; + std::vector PileupAlignments; + + // ctor + PileupPosition(const int& refId = 0, + const int& position = 0, + const std::vector& alignments = std::vector()) + : RefId(refId) + , Position(position) + , PileupAlignments(alignments) + { } +}; + +class PileupVisitor { + + public: + PileupVisitor(void) { } + virtual ~PileupVisitor(void) { } + + public: + virtual void Visit(const PileupPosition& pileupData) =0; +}; + +class PileupEngine { + + public: + PileupEngine(void); + ~PileupEngine(void); + + public: + bool AddAlignment(const BamAlignment& al); + void AddVisitor(PileupVisitor* visitor); + void Flush(void); + + private: + struct PileupEnginePrivate; + PileupEnginePrivate* d; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_PILEUP_ENGINE_H \ No newline at end of file