1 // ***************************************************************************
2 // bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 22 July 2010
7 // ---------------------------------------------------------------------------
8 // Converts between BAM and a number of other formats
9 // ***************************************************************************
17 #include "bamtools_convert.h"
18 #include "bamtools_options.h"
19 #include "bamtools_pileup.h"
20 #include "bamtools_utilities.h"
22 #include "BamReader.h"
23 #include "BamMultiReader.h"
26 using namespace BamTools;
31 static const string FORMAT_BED = "bed";
32 static const string FORMAT_BEDGRAPH = "bedgraph";
33 static const string FORMAT_FASTA = "fasta";
34 static const string FORMAT_FASTQ = "fastq";
35 static const string FORMAT_JSON = "json";
36 static const string FORMAT_SAM = "sam";
37 static const string FORMAT_PILEUP = "pileup";
38 static const string FORMAT_WIGGLE = "wig";
41 static const unsigned int FASTA_LINE_MAX = 50;
43 } // namespace BamTools
45 struct ConvertTool::ConvertToolPrivate {
49 ConvertToolPrivate(ConvertTool::ConvertSettings* settings);
50 ~ConvertToolPrivate(void);
58 void PrintBed(const BamAlignment& a);
59 void PrintBedGraph(const BamAlignment& a);
60 void PrintFasta(const BamAlignment& a);
61 void PrintFastq(const BamAlignment& a);
62 void PrintJson(const BamAlignment& a);
63 void PrintSam(const BamAlignment& a);
64 void PrintWiggle(const BamAlignment& a);
68 ConvertTool::ConvertSettings* m_settings;
69 RefVector m_references;
73 // ---------------------------------------------
74 // ConvertSettings implementation
76 struct ConvertTool::ConvertSettings {
85 bool HasFastaFilename;
86 bool IsOmittingSamHeader;
87 bool IsPrintingPileupMapQualities;
90 vector<string> InputFiles;
91 string OutputFilename;
104 , HasFastaFilename(false)
105 , IsOmittingSamHeader(false)
106 , IsPrintingPileupMapQualities(false)
107 , OutputFilename(Options::StandardOut())
111 // ---------------------------------------------
112 // ConvertTool implementation
114 ConvertTool::ConvertTool(void)
116 , m_settings(new ConvertSettings)
119 // set program details
120 Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats", "-format <FORMAT> [-in <filename> -in <filename> ...] [-out <filename>] [other options]");
123 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
124 Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
125 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
126 Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts);
128 OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
129 Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
131 OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options");
132 Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "", m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts, "");
133 Options::AddOption("-mapqual", "print the mapping qualities", m_settings->IsPrintingPileupMapQualities, PileupOpts);
135 OptionGroup* SamOpts = Options::CreateOptionGroup("SAM Options");
136 Options::AddOption("-noheader", "omit the SAM header from output", m_settings->IsOmittingSamHeader, SamOpts);
139 ConvertTool::~ConvertTool(void) {
147 int ConvertTool::Help(void) {
148 Options::DisplayHelp();
152 int ConvertTool::Run(int argc, char* argv[]) {
154 // parse command line arguments
155 Options::Parse(argc, argv, 1);
157 // run internal ConvertTool implementation, return success/fail
158 m_impl = new ConvertToolPrivate(m_settings);
166 // ---------------------------------------------
167 // ConvertToolPrivate implementation
169 ConvertTool::ConvertToolPrivate::ConvertToolPrivate(ConvertTool::ConvertSettings* settings)
170 : m_settings(settings)
171 , m_out(cout.rdbuf()) // default output to cout
174 ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { }
176 bool ConvertTool::ConvertToolPrivate::Run(void) {
178 bool convertedOk = true;
180 // ------------------------------------
181 // initialize conversion input/output
183 // set to default input if none provided
184 if ( !m_settings->HasInput )
185 m_settings->InputFiles.push_back(Options::StandardIn());
188 BamMultiReader reader;
189 reader.Open(m_settings->InputFiles, false);
190 m_references = reader.GetReferenceData();
192 // set region if specified
194 if ( m_settings->HasRegion ) {
195 if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
196 if ( !reader.SetRegion(region) )
197 cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
201 // if output file given
203 if ( m_settings->HasOutput ) {
205 // open output file stream
206 outFile.open(m_settings->OutputFilename.c_str());
208 cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl;
212 // set m_out to file's streambuf
213 m_out.rdbuf(outFile.rdbuf());
216 // ------------------------
217 // pileup is special case
218 if ( m_settings->Format == FORMAT_PILEUP ) {
220 // initialize pileup input/output
221 Pileup pileup(&reader, &m_out);
223 // ---------------------------
224 // configure pileup settings
226 if ( m_settings->HasRegion )
227 pileup.SetRegion(region);
229 if ( m_settings->HasFastaFilename )
230 pileup.SetFastaFilename(m_settings->FastaFilename);
232 pileup.SetIsPrintingMapQualities( m_settings->IsPrintingPileupMapQualities );
235 convertedOk = pileup.Run();
238 // -------------------------------------
239 // else determine 'simpler' format type
242 bool formatError = false;
243 void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0;
244 if ( m_settings->Format == FORMAT_BED ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed;
245 else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph;
246 else if ( m_settings->Format == FORMAT_FASTA ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta;
247 else if ( m_settings->Format == FORMAT_FASTQ ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq;
248 else if ( m_settings->Format == FORMAT_JSON ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson;
249 else if ( m_settings->Format == FORMAT_SAM ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam;
250 else if ( m_settings->Format == FORMAT_WIGGLE ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle;
252 cerr << "Unrecognized format: " << m_settings->Format << endl;
253 cerr << "Please see help|README (?) for details on supported formats " << endl;
258 // if SAM format & not omitting header, print SAM header
259 if ( (m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader ) {
260 string headerText = reader.GetHeaderText();
264 // ------------------------
266 if ( !formatError ) {
268 while ( reader.GetNextAlignment(a) ) {
269 (this->*pFunction)(a);
274 // ------------------------
277 if ( m_settings->HasOutput ) outFile.close();
281 // ----------------------------------------------------------
282 // Conversion/output methods
283 // ----------------------------------------------------------
285 void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a) {
287 // tab-delimited, 0-based half-open
288 // (e.g. a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) )
289 // <chromName> <chromStart> <chromEnd> <readName> <score> <strand>
291 m_out << m_references.at(a.RefID).RefName << "\t"
292 << a.Position << "\t"
293 << a.GetEndPosition() + 1 << "\t"
295 << a.MapQuality << "\t"
296 << (a.IsReverseStrand() ? "-" : "+") << endl;
299 void ConvertTool::ConvertToolPrivate::PrintBedGraph(const BamAlignment& a) {
303 // print BamAlignment in FASTA format
304 // N.B. - uses QueryBases NOT AlignedBases
305 void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) {
307 // >BamAlignment.Name
308 // BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line)
312 m_out << "> " << a.Name << endl;
314 // if sequence fits on single line
315 if ( a.QueryBases.length() <= FASTA_LINE_MAX )
316 m_out << a.QueryBases << endl;
318 // else split over multiple lines
322 size_t seqLength = a.QueryBases.length();
324 // write subsequences to each line
325 while ( position < (seqLength - FASTA_LINE_MAX) ) {
326 m_out << a.QueryBases.substr(position, FASTA_LINE_MAX) << endl;
327 position += FASTA_LINE_MAX;
330 // write final subsequence
331 m_out << a.QueryBases.substr(position) << endl;
335 // print BamAlignment in FASTQ format
336 // N.B. - uses QueryBases NOT AlignedBases
337 void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) {
339 // @BamAlignment.Name
340 // BamAlignment.QueryBases
342 // BamAlignment.Qualities
344 m_out << "@" << a.Name << endl
345 << a.QueryBases << endl
347 << a.Qualities << endl;
350 // print BamAlignment in JSON format
351 void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) {
353 // write name & alignment flag
354 m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\",";
356 // write reference name
357 if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) )
358 m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\",";
360 // write position & map quality
361 m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ",";
364 const vector<CigarOp>& cigarData = a.CigarData;
365 if ( !cigarData.empty() ) {
366 m_out << "\"cigar\":[";
367 vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
368 vector<CigarOp>::const_iterator cigarIter = cigarBegin;
369 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
370 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
371 const CigarOp& op = (*cigarIter);
372 if (cigarIter != cigarBegin) m_out << ",";
373 m_out << "\"" << op.Length << op.Type << "\"";
378 // write mate reference name, mate position, & insert size
379 if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
380 m_out << "\"mate\":{"
381 << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\","
382 << "\"position\":" << a.MatePosition+1
383 << ",\"insertSize\":" << a.InsertSize << "},";
387 if ( !a.QueryBases.empty() )
388 m_out << "\"queryBases\":\"" << a.QueryBases << "\",";
391 if ( !a.Qualities.empty() ) {
392 string::const_iterator s = a.Qualities.begin();
393 m_out << "\"qualities\":[" << static_cast<short>(*s) - 33;
395 for (; s != a.Qualities.end(); ++s) {
396 m_out << "," << static_cast<short>(*s) - 33;
402 const char* tagData = a.TagData.c_str();
403 const size_t tagDataLength = a.TagData.length();
405 if (index < tagDataLength) {
407 m_out << "\"tags\":{";
409 while ( index < tagDataLength ) {
415 m_out << "\"" << a.TagData.substr(index, 2) << "\":";
419 char type = a.TagData.at(index);
424 m_out << "\"" << tagData[index] << "\"";
429 m_out << (int)tagData[index];
434 m_out << (int)tagData[index];
439 m_out << BgzfData::UnpackUnsignedShort(&tagData[index]);
444 m_out << BgzfData::UnpackSignedShort(&tagData[index]);
449 m_out << BgzfData::UnpackUnsignedInt(&tagData[index]);
454 m_out << BgzfData::UnpackSignedInt(&tagData[index]);
459 m_out << BgzfData::UnpackFloat(&tagData[index]);
464 m_out << BgzfData::UnpackDouble(&tagData[index]);
471 while (tagData[index]) {
472 m_out << tagData[index];
480 if ( tagData[index] == '\0')
487 m_out << "}" << endl;
491 // print BamAlignment in SAM format
492 void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) {
495 // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
497 // write name & alignment flag
498 m_out << a.Name << "\t" << a.AlignmentFlag << "\t";
500 // write reference name
501 if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) )
502 m_out << m_references[a.RefID].RefName << "\t";
506 // write position & map quality
507 m_out << a.Position+1 << "\t" << a.MapQuality << "\t";
510 const vector<CigarOp>& cigarData = a.CigarData;
511 if ( cigarData.empty() ) m_out << "*\t";
513 vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
514 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
515 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
516 const CigarOp& op = (*cigarIter);
517 m_out << op.Length << op.Type;
522 // write mate reference name, mate position, & insert size
523 if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
524 if ( a.MateRefID == a.RefID ) m_out << "=\t";
525 else m_out << m_references[a.MateRefID].RefName << "\t";
526 m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t";
528 else m_out << "*\t0\t0\t";
531 if ( a.QueryBases.empty() ) m_out << "*\t";
532 else m_out << a.QueryBases << "\t";
535 if ( a.Qualities.empty() ) m_out << "*";
536 else m_out << a.Qualities;
539 const char* tagData = a.TagData.c_str();
540 const size_t tagDataLength = a.TagData.length();
543 while ( index < tagDataLength ) {
546 string tagName = a.TagData.substr(index, 2);
547 m_out << "\t" << tagName << ":";
551 char type = a.TagData.at(index);
555 m_out << "A:" << tagData[index];
560 m_out << "i:" << (int)tagData[index];
565 m_out << "i:" << (int)tagData[index];
570 m_out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]);
575 m_out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]);
580 m_out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]);
585 m_out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]);
590 m_out << "f:" << BgzfData::UnpackFloat(&tagData[index]);
595 m_out << "d:" << BgzfData::UnpackDouble(&tagData[index]);
601 m_out << type << ":";
602 while (tagData[index]) {
603 m_out << tagData[index];
610 if ( tagData[index] == '\0')
617 void ConvertTool::ConvertToolPrivate::PrintWiggle(const BamAlignment& a) {