From: Derek Date: Tue, 13 Jul 2010 01:53:57 +0000 (-0400) Subject: Begun basic pileup conversion X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=4aea04656e2666bd568cd9bf59b22b635cf1aef5;p=bamtools.git Begun basic pileup conversion --- diff --git a/bamtools_convert.cpp b/bamtools_convert.cpp index d395563..771feb2 100644 --- a/bamtools_convert.cpp +++ b/bamtools_convert.cpp @@ -16,6 +16,7 @@ #include "bamtools_convert.h" #include "bamtools_options.h" +#include "bamtools_pileup.h" #include "bamtools_utilities.h" #include "BGZF.h" #include "BamReader.h" @@ -33,6 +34,7 @@ namespace BamTools { static const string FORMAT_FASTQ = "fastq"; static const string FORMAT_JSON = "json"; static const string FORMAT_SAM = "sam"; + static const string FORMAT_PILEUP = "pileup"; static const string FORMAT_WIGGLE = "wig"; // other constants @@ -79,11 +81,18 @@ struct ConvertTool::ConvertSettings { bool HasFormat; bool HasRegion; + // pileup flags + bool HasFastaFilename; + bool IsPrintingPileupMapQualities; + // options vector InputFiles; string OutputFilename; string Format; string Region; + + // pileup options + string FastaFilename; // constructor ConvertSettings(void) @@ -91,6 +100,8 @@ struct ConvertTool::ConvertSettings { , HasOutputFilename(false) , HasFormat(false) , HasRegion(false) + , HasFastaFilename(false) + , IsPrintingPileupMapQualities(false) , OutputFilename(Options::StandardOut()) { } }; @@ -114,6 +125,10 @@ ConvertTool::ConvertTool(void) OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as .bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts); + + OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options"); + Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "", m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts, ""); + Options::AddOption("-mapqual", "print the mapping qualities", m_settings->IsPrintingPileupMapQualities, PileupOpts); } ConvertTool::~ConvertTool(void) { @@ -155,6 +170,8 @@ ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { } bool ConvertTool::ConvertToolPrivate::Run(void) { + bool convertedOk = true; + // ------------------------------------ // initialize conversion input/output @@ -162,51 +179,84 @@ bool ConvertTool::ConvertToolPrivate::Run(void) { if ( !m_settings->HasInputFilenames ) m_settings->InputFiles.push_back(Options::StandardIn()); - // open files + // open input files BamMultiReader reader; reader.Open(m_settings->InputFiles); m_references = reader.GetReferenceData(); + // set region if specified BamRegion region; - if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - if ( !reader.SetRegion(region) ) { - cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl; + if ( m_settings->HasRegion ) { + if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { + if ( !reader.SetRegion(region) ) + cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl; } } - // if an output filename given, open outfile + // if output file given ofstream outFile; - if ( m_settings->HasOutputFilename ) { + if ( m_settings->HasOutputFilename ) { + + // open output file stream outFile.open(m_settings->OutputFilename.c_str()); - if (!outFile) { cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; return false; } - m_out.rdbuf(outFile.rdbuf()); // set m_out to file's streambuf + if ( !outFile ) { + cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; + return false; + } + + // set m_out to file's streambuf + m_out.rdbuf(outFile.rdbuf()); } // ------------------------ - // determine format type - bool formatError = false; - bool convertedOk = true; - void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0; - if ( m_settings->Format == FORMAT_BED ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed; - else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph; - else if ( m_settings->Format == FORMAT_FASTA ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta; - else if ( m_settings->Format == FORMAT_FASTQ ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq; - else if ( m_settings->Format == FORMAT_JSON ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson; - else if ( m_settings->Format == FORMAT_SAM ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam; - else if ( m_settings->Format == FORMAT_WIGGLE ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle; - else { - cerr << "Unrecognized format: " << m_settings->Format << endl; - cerr << "Please see help|README (?) for details on supported formats " << endl; - formatError = true; - convertedOk = false; + // pileup is special case + if ( m_settings->Format == FORMAT_PILEUP ) { + + // initialize pileup input/output + Pileup pileup(&reader, &m_out); + + // --------------------------- + // configure pileup settings + + if ( m_settings->HasRegion ) + pileup.SetRegion(region); + + if ( m_settings->HasFastaFilename ) + pileup.SetFastaFilename(m_settings->FastaFilename); + + pileup.SetIsPrintingMapQualities( m_settings->IsPrintingPileupMapQualities ); + + // run pileup + convertedOk = pileup.Run(); } - // ------------------------ - // do conversion - if ( !formatError ) { - BamAlignment a; - while ( reader.GetNextAlignment(a) ) { - (this->*pFunction)(a); + // ------------------------------------- + // else determine 'simpler' format type + else { + + bool formatError = false; + void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0; + if ( m_settings->Format == FORMAT_BED ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed; + else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph; + else if ( m_settings->Format == FORMAT_FASTA ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta; + else if ( m_settings->Format == FORMAT_FASTQ ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq; + else if ( m_settings->Format == FORMAT_JSON ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson; + else if ( m_settings->Format == FORMAT_SAM ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam; + else if ( m_settings->Format == FORMAT_WIGGLE ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle; + else { + cerr << "Unrecognized format: " << m_settings->Format << endl; + cerr << "Please see help|README (?) for details on supported formats " << endl; + formatError = true; + convertedOk = false; + } + + // ------------------------ + // do conversion + if ( !formatError ) { + BamAlignment a; + while ( reader.GetNextAlignment(a) ) { + (this->*pFunction)(a); + } } } diff --git a/bamtools_pileup.cpp b/bamtools_pileup.cpp new file mode 100644 index 0000000..5b84ead --- /dev/null +++ b/bamtools_pileup.cpp @@ -0,0 +1,218 @@ +#include +#include "BamMultiReader.h" +#include "bamtools_pileup.h" +using namespace std; +using namespace BamTools; + +struct Pileup::PileupPrivate { + + // --------------------- + // data members + + // IO & settings + BamMultiReader* Reader; + ostream* OutStream; + string FastaFilename; + bool IsPrintingMapQualities; + BamRegion Region; + + // parsing data + int CurrentId; + int CurrentPosition; + vector CurrentData; + RefVector References; + + // ---------------------- + // ctor + + PileupPrivate(BamMultiReader* reader, ostream* outStream) + : Reader(reader) + , OutStream(outStream) + , FastaFilename("") + , IsPrintingMapQualities(false) + { } + + // ---------------------- + // internal methods + + void PrintCurrentData(void); + bool Run(void); +}; + +void Pileup::PileupPrivate::PrintCurrentData(void) { + + // remove any data that ends before CurrentPosition + size_t i = 0; + while ( i < CurrentData.size() ) { + if ( CurrentData[i].GetEndPosition() < CurrentPosition ) + CurrentData.erase(CurrentData.begin() + i); + else + ++i; + } + + // if not data remains, return + if ( CurrentData.empty() ) return; + + // initialize empty strings + string bases = ""; + string baseQuals = ""; + string mapQuals = ""; + + // iterate over alignments + vector::const_iterator dataIter = CurrentData.begin(); + vector::const_iterator dataEnd = CurrentData.end(); + for ( ; dataIter != dataEnd; ++dataIter ) { + + // retrieve alignment + const BamAlignment& al = (*dataIter); + + // determine current base character & store + const char base = al.AlignedBases[CurrentPosition -al.Position]; + if ( al.IsReverseStrand() ) + bases.push_back( tolower(base) ); + else + bases.push_back( toupper(base) ); + + // determine current base quality & store + baseQuals.push_back( al.Qualities[CurrentPosition - al.Position] ); + + // if using mapQuals, determine current mapQual & store + if ( IsPrintingMapQualities ) { + int mapQuality = (int)(al.MapQuality + 33); + if ( mapQuality > 126 ) mapQuality = 126; + mapQuals.push_back((char)mapQuality); + } + } + + // print results to OutStream + const string& refName = References[CurrentId].RefName; + const char refBase = 'N'; + + *OutStream << refName << "\t" << CurrentPosition << "\t" << refBase << "\t" << CurrentData.size() << "\t" << bases << "\t" << baseQuals; + if ( IsPrintingMapQualities ) *OutStream << "\t" << mapQuals; + *OutStream << endl; +} + +bool Pileup::PileupPrivate::Run(void) { + + // ----------------------------- + // validate input & output + + if ( !Reader ) { + cerr << "Pileup::Run() : Invalid multireader" << endl; + return false; + } + + if ( !OutStream) { + cerr << "Pileup::Run() : Invalid output stream" << endl; + return false; + } + + References = Reader->GetReferenceData(); + + // ----------------------------- + // process input data + + // get first entry + BamAlignment al; + if ( !Reader->GetNextAlignment(al) ) { + cerr << "Pileup::Run() : Could not read from multireader" << endl; + return false; + } + + // set initial markers & store first entry + CurrentId = al.RefID; + CurrentPosition = al.Position; + CurrentData.clear(); + CurrentData.push_back(al); + + // iterate over remaining data + while ( Reader->GetNextAlignment(al) ) { + + // if same reference + if ( al.RefID == CurrentId ) { + + // if same position, store and move on + if ( al.Position == CurrentPosition ) + CurrentData.push_back(al); + + // if less than CurrentPosition - sorting error => ABORT + else if ( al.Position < CurrentPosition ) { + cerr << "Pileup::Run() : Data not sorted correctly!" << endl; + return false; + } + + // else print pileup data until 'catching up' to CurrentPosition + else { + while ( al.Position > CurrentPosition ) { + PrintCurrentData(); + ++CurrentPosition; + } + CurrentData.push_back(al); + } + } + + // if reference ID less than CurrentID - sorting error => ABORT + else if ( al.RefID < CurrentId ) { + cerr << "Pileup::Run() : Data not sorted correctly!" << endl; + return false; + } + + // else moved forward onto next reference + else { + + // print any remaining pileup data from previous reference + while ( !CurrentData.empty() ) { + PrintCurrentData(); + ++CurrentPosition; + } + + // store first entry on this new reference, update markers + CurrentData.clear(); + CurrentData.push_back(al); + CurrentId = al.RefID; + CurrentPosition = al.Position; + } + } + + // ------------------------------------ + // handle any remaining data entries + + while ( !CurrentData.empty() ) { + PrintCurrentData(); + ++CurrentPosition; + } + + // ------------------------- + // return success + + return true; +} + +// ---------------------------------------------------------- +// Pileup implementation + +Pileup::Pileup(BamMultiReader* reader, ostream* outStream) { + d = new PileupPrivate(reader, outStream); +} + +Pileup::~Pileup(void) { + delete d; + d = 0; +} + +bool Pileup::Run(void) { + return d->Run(); +} + +void Pileup::SetFastaFilename(const string& filename) { + d->FastaFilename = filename; +} + +void Pileup::SetIsPrintingMapQualities(bool ok) { + d->IsPrintingMapQualities = ok; +} + +void Pileup::SetRegion(const BamRegion& region) { + d->Region = region; +} diff --git a/bamtools_pileup.h b/bamtools_pileup.h new file mode 100644 index 0000000..b28059c --- /dev/null +++ b/bamtools_pileup.h @@ -0,0 +1,31 @@ +#ifndef BAMTOOLS_PILEUP_H +#define BAMTOOLS_PILEUP_H + +#include +#include + +namespace BamTools { + +class BamMultiReader; +class BamRegion; + +class Pileup { + + public: + Pileup(BamMultiReader* reader, std::ostream* outStream); + ~Pileup(void); + + public: + bool Run(void); + void SetFastaFilename(const std::string& filename); + void SetIsPrintingMapQualities(bool ok); + void SetRegion(const BamRegion& region); + + private: + struct PileupPrivate; + PileupPrivate* d; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_PILEUP_H \ No newline at end of file