X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Ftoolkit%2Fbamtools_convert.cpp;fp=src%2Ftoolkit%2Fbamtools_convert.cpp;h=1c5ba4884ac73412f9e45a38575d9fba6de4dc88;hb=0180d7256f5c781364b04559b39ed2db41585787;hp=0000000000000000000000000000000000000000;hpb=acdea352102b176dc519f6cbdd7980d1041079c2;p=bamtools.git diff --git a/src/toolkit/bamtools_convert.cpp b/src/toolkit/bamtools_convert.cpp new file mode 100644 index 0000000..1c5ba48 --- /dev/null +++ b/src/toolkit/bamtools_convert.cpp @@ -0,0 +1,619 @@ +// *************************************************************************** +// bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 July 2010 +// --------------------------------------------------------------------------- +// Converts between BAM and a number of other formats +// *************************************************************************** + +#include +#include +#include +#include +#include + +#include "bamtools_convert.h" +#include "bamtools_options.h" +#include "bamtools_pileup.h" +#include "bamtools_utilities.h" +#include "BGZF.h" +#include "BamReader.h" +#include "BamMultiReader.h" + +using namespace std; +using namespace BamTools; + +namespace BamTools { + + // format names + static const string FORMAT_BED = "bed"; + static const string FORMAT_BEDGRAPH = "bedgraph"; + static const string FORMAT_FASTA = "fasta"; + static const string FORMAT_FASTQ = "fastq"; + static const string FORMAT_JSON = "json"; + static const string FORMAT_SAM = "sam"; + static const string FORMAT_PILEUP = "pileup"; + static const string FORMAT_WIGGLE = "wig"; + + // other constants + static const unsigned int FASTA_LINE_MAX = 50; + +} // namespace BamTools + +struct ConvertTool::ConvertToolPrivate { + + // ctor & dtor + public: + ConvertToolPrivate(ConvertTool::ConvertSettings* settings); + ~ConvertToolPrivate(void); + + // interface + public: + bool Run(void); + + // internal methods + private: + void PrintBed(const BamAlignment& a); + void PrintBedGraph(const BamAlignment& a); + void PrintFasta(const BamAlignment& a); + void PrintFastq(const BamAlignment& a); + void PrintJson(const BamAlignment& a); + void PrintSam(const BamAlignment& a); + void PrintWiggle(const BamAlignment& a); + + // data members + private: + ConvertTool::ConvertSettings* m_settings; + RefVector m_references; + ostream m_out; +}; + +// --------------------------------------------- +// ConvertSettings implementation + +struct ConvertTool::ConvertSettings { + + // flags + bool HasInput; + bool HasOutput; + bool HasFormat; + bool HasRegion; + + // pileup flags + bool HasFastaFilename; + bool IsOmittingSamHeader; + bool IsPrintingPileupMapQualities; + + // options + vector InputFiles; + string OutputFilename; + string Format; + string Region; + + // pileup options + string FastaFilename; + + // constructor + ConvertSettings(void) + : HasInput(false) + , HasOutput(false) + , HasFormat(false) + , HasRegion(false) + , HasFastaFilename(false) + , IsOmittingSamHeader(false) + , IsPrintingPileupMapQualities(false) + , OutputFilename(Options::StandardOut()) + { } +}; + +// --------------------------------------------- +// ConvertTool implementation + +ConvertTool::ConvertTool(void) + : AbstractTool() + , m_settings(new ConvertSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats", "-format [-in -in ...] [-out ] [other options]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); + Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts); + + OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); + Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as .bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts); + + OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options"); + Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "", m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts, ""); + Options::AddOption("-mapqual", "print the mapping qualities", m_settings->IsPrintingPileupMapQualities, PileupOpts); + + OptionGroup* SamOpts = Options::CreateOptionGroup("SAM Options"); + Options::AddOption("-noheader", "omit the SAM header from output", m_settings->IsOmittingSamHeader, SamOpts); +} + +ConvertTool::~ConvertTool(void) { + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int ConvertTool::Help(void) { + Options::DisplayHelp(); + return 0; +} + +int ConvertTool::Run(int argc, char* argv[]) { + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // run internal ConvertTool implementation, return success/fail + m_impl = new ConvertToolPrivate(m_settings); + + if ( m_impl->Run() ) + return 0; + else + return 1; +} + +// --------------------------------------------- +// ConvertToolPrivate implementation + +ConvertTool::ConvertToolPrivate::ConvertToolPrivate(ConvertTool::ConvertSettings* settings) + : m_settings(settings) + , m_out(cout.rdbuf()) // default output to cout +{ } + +ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { } + +bool ConvertTool::ConvertToolPrivate::Run(void) { + + bool convertedOk = true; + + // ------------------------------------ + // initialize conversion input/output + + // set to default input if none provided + if ( !m_settings->HasInput ) + m_settings->InputFiles.push_back(Options::StandardIn()); + + // open input files + BamMultiReader reader; + reader.Open(m_settings->InputFiles, false); + m_references = reader.GetReferenceData(); + + // set region if specified + BamRegion region; + if ( m_settings->HasRegion ) { + if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { + if ( !reader.SetRegion(region) ) + cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl; + } + } + + // if output file given + ofstream outFile; + if ( m_settings->HasOutput ) { + + // open output file stream + outFile.open(m_settings->OutputFilename.c_str()); + if ( !outFile ) { + cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; + return false; + } + + // set m_out to file's streambuf + m_out.rdbuf(outFile.rdbuf()); + } + + // ------------------------ + // pileup is special case + if ( m_settings->Format == FORMAT_PILEUP ) { + + // initialize pileup input/output + Pileup pileup(&reader, &m_out); + + // --------------------------- + // configure pileup settings + + if ( m_settings->HasRegion ) + pileup.SetRegion(region); + + if ( m_settings->HasFastaFilename ) + pileup.SetFastaFilename(m_settings->FastaFilename); + + pileup.SetIsPrintingMapQualities( m_settings->IsPrintingPileupMapQualities ); + + // run pileup + convertedOk = pileup.Run(); + } + + // ------------------------------------- + // else determine 'simpler' format type + else { + + bool formatError = false; + void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0; + if ( m_settings->Format == FORMAT_BED ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed; + else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph; + else if ( m_settings->Format == FORMAT_FASTA ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta; + else if ( m_settings->Format == FORMAT_FASTQ ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq; + else if ( m_settings->Format == FORMAT_JSON ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson; + else if ( m_settings->Format == FORMAT_SAM ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam; + else if ( m_settings->Format == FORMAT_WIGGLE ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle; + else { + cerr << "Unrecognized format: " << m_settings->Format << endl; + cerr << "Please see help|README (?) for details on supported formats " << endl; + formatError = true; + convertedOk = false; + } + + // if SAM format & not omitting header, print SAM header + if ( (m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader ) { + string headerText = reader.GetHeaderText(); + m_out << headerText; + } + + // ------------------------ + // do conversion + if ( !formatError ) { + BamAlignment a; + while ( reader.GetNextAlignment(a) ) { + (this->*pFunction)(a); + } + } + } + + // ------------------------ + // clean up & exit + reader.Close(); + if ( m_settings->HasOutput ) outFile.close(); + return convertedOk; +} + +// ---------------------------------------------------------- +// Conversion/output methods +// ---------------------------------------------------------- + +void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a) { + + // tab-delimited, 0-based half-open + // (e.g. a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) ) + // + + m_out << m_references.at(a.RefID).RefName << "\t" + << a.Position << "\t" + << a.GetEndPosition() + 1 << "\t" + << a.Name << "\t" + << a.MapQuality << "\t" + << (a.IsReverseStrand() ? "-" : "+") << endl; +} + +void ConvertTool::ConvertToolPrivate::PrintBedGraph(const BamAlignment& a) { + ; +} + +// print BamAlignment in FASTA format +// N.B. - uses QueryBases NOT AlignedBases +void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) { + + // >BamAlignment.Name + // BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line) + // ... + + // print header + m_out << "> " << a.Name << endl; + + // if sequence fits on single line + if ( a.QueryBases.length() <= FASTA_LINE_MAX ) + m_out << a.QueryBases << endl; + + // else split over multiple lines + else { + + size_t position = 0; + size_t seqLength = a.QueryBases.length(); + + // write subsequences to each line + while ( position < (seqLength - FASTA_LINE_MAX) ) { + m_out << a.QueryBases.substr(position, FASTA_LINE_MAX) << endl; + position += FASTA_LINE_MAX; + } + + // write final subsequence + m_out << a.QueryBases.substr(position) << endl; + } +} + +// print BamAlignment in FASTQ format +// N.B. - uses QueryBases NOT AlignedBases +void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) { + + // @BamAlignment.Name + // BamAlignment.QueryBases + // + + // BamAlignment.Qualities + + m_out << "@" << a.Name << endl + << a.QueryBases << endl + << "+" << endl + << a.Qualities << endl; +} + +// print BamAlignment in JSON format +void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) { + + // write name & alignment flag + m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\","; + + // write reference name + if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) + m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\","; + + // write position & map quality + m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ","; + + // write CIGAR + const vector& cigarData = a.CigarData; + if ( !cigarData.empty() ) { + m_out << "\"cigar\":["; + vector::const_iterator cigarBegin = cigarData.begin(); + vector::const_iterator cigarIter = cigarBegin; + vector::const_iterator cigarEnd = cigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter ) { + const CigarOp& op = (*cigarIter); + if (cigarIter != cigarBegin) m_out << ","; + m_out << "\"" << op.Length << op.Type << "\""; + } + m_out << "],"; + } + + // write mate reference name, mate position, & insert size + if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) { + m_out << "\"mate\":{" + << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\"," + << "\"position\":" << a.MatePosition+1 + << ",\"insertSize\":" << a.InsertSize << "},"; + } + + // write sequence + if ( !a.QueryBases.empty() ) + m_out << "\"queryBases\":\"" << a.QueryBases << "\","; + + // write qualities + if ( !a.Qualities.empty() ) { + string::const_iterator s = a.Qualities.begin(); + m_out << "\"qualities\":[" << static_cast(*s) - 33; + ++s; + for (; s != a.Qualities.end(); ++s) { + m_out << "," << static_cast(*s) - 33; + } + m_out << "],"; + } + + // write tag data + const char* tagData = a.TagData.c_str(); + const size_t tagDataLength = a.TagData.length(); + size_t index = 0; + if (index < tagDataLength) { + + m_out << "\"tags\":{"; + + while ( index < tagDataLength ) { + + if (index > 0) + m_out << ","; + + // write tag name + m_out << "\"" << a.TagData.substr(index, 2) << "\":"; + index += 2; + + // get data type + char type = a.TagData.at(index); + ++index; + + switch (type) { + case('A') : + m_out << "\"" << tagData[index] << "\""; + ++index; + break; + + case('C') : + m_out << (int)tagData[index]; + ++index; + break; + + case('c') : + m_out << (int)tagData[index]; + ++index; + break; + + case('S') : + m_out << BgzfData::UnpackUnsignedShort(&tagData[index]); + index += 2; + break; + + case('s') : + m_out << BgzfData::UnpackSignedShort(&tagData[index]); + index += 2; + break; + + case('I') : + m_out << BgzfData::UnpackUnsignedInt(&tagData[index]); + index += 4; + break; + + case('i') : + m_out << BgzfData::UnpackSignedInt(&tagData[index]); + index += 4; + break; + + case('f') : + m_out << BgzfData::UnpackFloat(&tagData[index]); + index += 4; + break; + + case('d') : + m_out << BgzfData::UnpackDouble(&tagData[index]); + index += 8; + break; + + case('Z') : + case('H') : + m_out << "\""; + while (tagData[index]) { + m_out << tagData[index]; + ++index; + } + m_out << "\""; + ++index; + break; + } + + if ( tagData[index] == '\0') + break; + } + + m_out << "}"; + } + + m_out << "}" << endl; + +} + +// print BamAlignment in SAM format +void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) { + + // tab-delimited + // [ :: [...] ] + + // write name & alignment flag + m_out << a.Name << "\t" << a.AlignmentFlag << "\t"; + + // write reference name + if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) + m_out << m_references[a.RefID].RefName << "\t"; + else + m_out << "*\t"; + + // write position & map quality + m_out << a.Position+1 << "\t" << a.MapQuality << "\t"; + + // write CIGAR + const vector& cigarData = a.CigarData; + if ( cigarData.empty() ) m_out << "*\t"; + else { + vector::const_iterator cigarIter = cigarData.begin(); + vector::const_iterator cigarEnd = cigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter ) { + const CigarOp& op = (*cigarIter); + m_out << op.Length << op.Type; + } + m_out << "\t"; + } + + // write mate reference name, mate position, & insert size + if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) { + if ( a.MateRefID == a.RefID ) m_out << "=\t"; + else m_out << m_references[a.MateRefID].RefName << "\t"; + m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t"; + } + else m_out << "*\t0\t0\t"; + + // write sequence + if ( a.QueryBases.empty() ) m_out << "*\t"; + else m_out << a.QueryBases << "\t"; + + // write qualities + if ( a.Qualities.empty() ) m_out << "*"; + else m_out << a.Qualities; + + // write tag data + const char* tagData = a.TagData.c_str(); + const size_t tagDataLength = a.TagData.length(); + + size_t index = 0; + while ( index < tagDataLength ) { + + // write tag name + string tagName = a.TagData.substr(index, 2); + m_out << "\t" << tagName << ":"; + index += 2; + + // get data type + char type = a.TagData.at(index); + ++index; + switch (type) { + case('A') : + m_out << "A:" << tagData[index]; + ++index; + break; + + case('C') : + m_out << "i:" << (int)tagData[index]; + ++index; + break; + + case('c') : + m_out << "i:" << (int)tagData[index]; + ++index; + break; + + case('S') : + m_out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]); + index += 2; + break; + + case('s') : + m_out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]); + index += 2; + break; + + case('I') : + m_out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]); + index += 4; + break; + + case('i') : + m_out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]); + index += 4; + break; + + case('f') : + m_out << "f:" << BgzfData::UnpackFloat(&tagData[index]); + index += 4; + break; + + case('d') : + m_out << "d:" << BgzfData::UnpackDouble(&tagData[index]); + index += 8; + break; + + case('Z') : + case('H') : + m_out << type << ":"; + while (tagData[index]) { + m_out << tagData[index]; + ++index; + } + ++index; + break; + } + + if ( tagData[index] == '\0') + break; + } + + m_out << endl; +} + +void ConvertTool::ConvertToolPrivate::PrintWiggle(const BamAlignment& a) { + ; +}