1 // ***************************************************************************
2 // bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 23 September 2010
7 // ---------------------------------------------------------------------------
8 // Converts between BAM and a number of other formats
9 // ***************************************************************************
16 #include "bamtools_convert.h"
17 #include "bamtools_fasta.h"
18 #include "bamtools_options.h"
19 #include "bamtools_pileup_engine.h"
20 #include "bamtools_utilities.h"
22 #include "BamMultiReader.h"
24 using namespace BamTools;
28 // ---------------------------------------------
29 // ConvertTool constants
31 // supported conversion format command-line names
32 static const string FORMAT_BED = "bed";
33 static const string FORMAT_BEDGRAPH = "bedgraph";
34 static const string FORMAT_FASTA = "fasta";
35 static const string FORMAT_FASTQ = "fastq";
36 static const string FORMAT_JSON = "json";
37 static const string FORMAT_SAM = "sam";
38 static const string FORMAT_PILEUP = "pileup";
39 static const string FORMAT_WIGGLE = "wig";
42 static const unsigned int FASTA_LINE_MAX = 50;
44 // ---------------------------------------------
45 // ConvertPileupFormatVisitor declaration
47 class ConvertPileupFormatVisitor : public PileupVisitor {
51 ConvertPileupFormatVisitor(const RefVector& references,
52 const string& fastaFilename,
53 const bool isPrintingMapQualities,
55 ~ConvertPileupFormatVisitor(void);
57 // PileupVisitor interface implementation
59 void Visit(const PileupPosition& pileupData);
65 bool m_isPrintingMapQualities;
67 RefVector m_references;
70 } // namespace BamTools
72 // ---------------------------------------------
73 // ConvertSettings implementation
75 struct ConvertTool::ConvertSettings {
84 bool HasFastaFilename;
85 bool IsOmittingSamHeader;
86 bool IsPrintingPileupMapQualities;
89 vector<string> InputFiles;
90 string OutputFilename;
103 , HasFastaFilename(false)
104 , IsOmittingSamHeader(false)
105 , IsPrintingPileupMapQualities(false)
106 , OutputFilename(Options::StandardOut())
111 // ---------------------------------------------
112 // ConvertToolPrivate implementation
114 struct ConvertTool::ConvertToolPrivate {
118 ConvertToolPrivate(ConvertTool::ConvertSettings* settings);
119 ~ConvertToolPrivate(void);
127 void PrintBed(const BamAlignment& a);
128 void PrintBedGraph(const BamAlignment& a);
129 void PrintFasta(const BamAlignment& a);
130 void PrintFastq(const BamAlignment& a);
131 void PrintJson(const BamAlignment& a);
132 void PrintSam(const BamAlignment& a);
133 void PrintWiggle(const BamAlignment& a);
135 // special case - uses the PileupEngine
136 bool RunPileupConversion(BamMultiReader* reader);
140 ConvertTool::ConvertSettings* m_settings;
141 RefVector m_references;
145 ConvertTool::ConvertToolPrivate::ConvertToolPrivate(ConvertTool::ConvertSettings* settings)
146 : m_settings(settings)
147 , m_out(cout.rdbuf()) // default output to cout
150 ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { }
152 bool ConvertTool::ConvertToolPrivate::Run(void) {
154 // ------------------------------------
155 // initialize conversion input/output
157 // set to default input if none provided
158 if ( !m_settings->HasInput )
159 m_settings->InputFiles.push_back(Options::StandardIn());
162 BamMultiReader reader;
163 if ( !m_settings->HasInput ) { // don't attempt to open index for stdin
164 if ( !reader.Open(m_settings->InputFiles, false) ) {
165 cerr << "Could not open input files" << endl;
169 if ( !reader.Open(m_settings->InputFiles, true) ) {
170 if ( !reader.Open(m_settings->InputFiles, false) ) {
171 cerr << "Could not open input files" << endl;
174 cerr << "Opened reader without index file, jumping is disabled." << endl;
178 m_references = reader.GetReferenceData();
180 // set region if specified
182 if ( m_settings->HasRegion ) {
183 if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
184 if ( !reader.SetRegion(region) ) {
185 cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
189 cerr << "Could not parse REGION: " << m_settings->Region << endl;
194 // if output file given
196 if ( m_settings->HasOutput ) {
198 // open output file stream
199 outFile.open(m_settings->OutputFilename.c_str());
201 cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl;
205 // set m_out to file's streambuf
206 m_out.rdbuf(outFile.rdbuf());
209 // -------------------------------------
210 // do conversion based on format
212 bool convertedOk = true;
214 // pileup is special case
215 // conversion not done per alignment, like the other formats
216 if ( m_settings->Format == FORMAT_PILEUP )
217 convertedOk = RunPileupConversion(&reader);
222 bool formatError = false;
224 // set function pointer to proper conversion method
225 void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0;
226 if ( m_settings->Format == FORMAT_BED ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed;
227 else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph;
228 else if ( m_settings->Format == FORMAT_FASTA ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta;
229 else if ( m_settings->Format == FORMAT_FASTQ ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq;
230 else if ( m_settings->Format == FORMAT_JSON ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson;
231 else if ( m_settings->Format == FORMAT_SAM ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam;
232 else if ( m_settings->Format == FORMAT_WIGGLE ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle;
234 cerr << "Unrecognized format: " << m_settings->Format << endl;
235 cerr << "Please see help|README (?) for details on supported formats " << endl;
240 // if format selected ok
241 if ( !formatError ) {
243 // if SAM format & not omitting header, print SAM header first
244 if ( (m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader )
245 m_out << reader.GetHeaderText();
247 // iterate through file, doing conversion
249 while ( reader.GetNextAlignment(a) )
250 (this->*pFunction)(a);
252 // set flag for successful conversion
257 // ------------------------
261 if ( m_settings->HasOutput ) outFile.close();
265 // ----------------------------------------------------------
266 // Conversion/output methods
267 // ----------------------------------------------------------
269 void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a) {
271 // tab-delimited, 0-based half-open
272 // (e.g. a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) )
273 // <chromName> <chromStart> <chromEnd> <readName> <score> <strand>
275 m_out << m_references.at(a.RefID).RefName << "\t"
276 << a.Position << "\t"
277 << a.GetEndPosition() + 1 << "\t"
279 << a.MapQuality << "\t"
280 << (a.IsReverseStrand() ? "-" : "+") << endl;
283 void ConvertTool::ConvertToolPrivate::PrintBedGraph(const BamAlignment& a) {
287 // print BamAlignment in FASTA format
288 // N.B. - uses QueryBases NOT AlignedBases
289 void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) {
291 // >BamAlignment.Name
292 // BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line)
295 // N.B. - QueryBases are reverse-complemented if aligned to reverse strand
298 m_out << "> " << a.Name << endl;
300 // handle reverse strand alignment - bases
301 string sequence = a.QueryBases;
302 if ( a.IsReverseStrand() )
303 Utilities::ReverseComplement(sequence);
305 // if sequence fits on single line
306 if ( sequence.length() <= FASTA_LINE_MAX )
307 m_out << sequence << endl;
309 // else split over multiple lines
313 size_t seqLength = sequence.length(); // handle reverse strand alignment - bases & qualitiesth();
315 // write subsequences to each line
316 while ( position < (seqLength - FASTA_LINE_MAX) ) {
317 m_out << sequence.substr(position, FASTA_LINE_MAX) << endl;
318 position += FASTA_LINE_MAX;
321 // write final subsequence
322 m_out << sequence.substr(position) << endl;
326 // print BamAlignment in FASTQ format
327 // N.B. - uses QueryBases NOT AlignedBases
328 void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) {
330 // @BamAlignment.Name
331 // BamAlignment.QueryBases
333 // BamAlignment.Qualities
335 // N.B. - QueryBases are reverse-complemented (& Qualities reversed) if aligned to reverse strand .
336 // Name is appended "/1" or "/2" if paired-end, to reflect which mate this entry is.
338 // handle paired-end alignments
339 string name = a.Name;
341 name.append( (a.IsFirstMate() ? "/1" : "/2") );
343 // handle reverse strand alignment - bases & qualities
344 string qualities = a.Qualities;
345 string sequence = a.QueryBases;
346 if ( a.IsReverseStrand() ) {
347 Utilities::Reverse(qualities);
348 Utilities::ReverseComplement(sequence);
351 // write to output stream
352 m_out << "@" << name << endl
355 << qualities << endl;
358 // print BamAlignment in JSON format
359 void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) {
361 // write name & alignment flag
362 m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\",";
364 // write reference name
365 if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) )
366 m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\",";
368 // write position & map quality
369 m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ",";
372 const vector<CigarOp>& cigarData = a.CigarData;
373 if ( !cigarData.empty() ) {
374 m_out << "\"cigar\":[";
375 vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
376 vector<CigarOp>::const_iterator cigarIter = cigarBegin;
377 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
378 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
379 const CigarOp& op = (*cigarIter);
380 if (cigarIter != cigarBegin) m_out << ",";
381 m_out << "\"" << op.Length << op.Type << "\"";
386 // write mate reference name, mate position, & insert size
387 if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
388 m_out << "\"mate\":{"
389 << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\","
390 << "\"position\":" << a.MatePosition+1
391 << ",\"insertSize\":" << a.InsertSize << "},";
395 if ( !a.QueryBases.empty() )
396 m_out << "\"queryBases\":\"" << a.QueryBases << "\",";
399 if ( !a.Qualities.empty() ) {
400 string::const_iterator s = a.Qualities.begin();
401 m_out << "\"qualities\":[" << static_cast<short>(*s) - 33;
403 for (; s != a.Qualities.end(); ++s) {
404 m_out << "," << static_cast<short>(*s) - 33;
410 const char* tagData = a.TagData.c_str();
411 const size_t tagDataLength = a.TagData.length();
413 if (index < tagDataLength) {
415 m_out << "\"tags\":{";
417 while ( index < tagDataLength ) {
423 m_out << "\"" << a.TagData.substr(index, 2) << "\":";
427 char type = a.TagData.at(index);
432 m_out << "\"" << tagData[index] << "\"";
437 m_out << (int)tagData[index];
442 m_out << (int)tagData[index];
447 m_out << BgzfData::UnpackUnsignedShort(&tagData[index]);
452 m_out << BgzfData::UnpackSignedShort(&tagData[index]);
457 m_out << BgzfData::UnpackUnsignedInt(&tagData[index]);
462 m_out << BgzfData::UnpackSignedInt(&tagData[index]);
467 m_out << BgzfData::UnpackFloat(&tagData[index]);
472 m_out << BgzfData::UnpackDouble(&tagData[index]);
479 while (tagData[index]) {
480 if (tagData[index] == '\"')
481 m_out << "\\\""; // escape for json
483 m_out << tagData[index];
491 if ( tagData[index] == '\0')
498 m_out << "}" << endl;
501 // print BamAlignment in SAM format
502 void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) {
505 // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
507 // write name & alignment flag
508 m_out << a.Name << "\t" << a.AlignmentFlag << "\t";
510 // write reference name
511 if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) )
512 m_out << m_references[a.RefID].RefName << "\t";
516 // write position & map quality
517 m_out << a.Position+1 << "\t" << a.MapQuality << "\t";
520 const vector<CigarOp>& cigarData = a.CigarData;
521 if ( cigarData.empty() ) m_out << "*\t";
523 vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
524 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
525 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
526 const CigarOp& op = (*cigarIter);
527 m_out << op.Length << op.Type;
532 // write mate reference name, mate position, & insert size
533 if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
534 if ( a.MateRefID == a.RefID ) m_out << "=\t";
535 else m_out << m_references[a.MateRefID].RefName << "\t";
536 m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t";
538 else m_out << "*\t0\t0\t";
541 if ( a.QueryBases.empty() ) m_out << "*\t";
542 else m_out << a.QueryBases << "\t";
545 if ( a.Qualities.empty() ) m_out << "*";
546 else m_out << a.Qualities;
549 const char* tagData = a.TagData.c_str();
550 const size_t tagDataLength = a.TagData.length();
553 while ( index < tagDataLength ) {
556 string tagName = a.TagData.substr(index, 2);
557 m_out << "\t" << tagName << ":";
561 char type = a.TagData.at(index);
565 m_out << "A:" << tagData[index];
570 m_out << "i:" << (int)tagData[index];
575 m_out << "i:" << (int)tagData[index];
580 m_out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]);
585 m_out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]);
590 m_out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]);
595 m_out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]);
600 m_out << "f:" << BgzfData::UnpackFloat(&tagData[index]);
605 m_out << "d:" << BgzfData::UnpackDouble(&tagData[index]);
611 m_out << type << ":";
612 while (tagData[index]) {
613 m_out << tagData[index];
620 if ( tagData[index] == '\0')
627 void ConvertTool::ConvertToolPrivate::PrintWiggle(const BamAlignment& a) {
631 bool ConvertTool::ConvertToolPrivate::RunPileupConversion(BamMultiReader* reader) {
633 // check for valid BamMultiReader
634 if ( reader == 0 ) return false;
636 // set up our pileup format 'visitor'
637 ConvertPileupFormatVisitor* v = new ConvertPileupFormatVisitor(m_references,
638 m_settings->FastaFilename,
639 m_settings->IsPrintingPileupMapQualities,
642 // set up PileupEngine
644 pileup.AddVisitor(v);
646 // iterate through data
648 while ( reader->GetNextAlignment(al) )
649 pileup.AddAlignment(al);
660 // ---------------------------------------------
661 // ConvertTool implementation
663 ConvertTool::ConvertTool(void)
665 , m_settings(new ConvertSettings)
668 // set program details
669 Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats", "-format <FORMAT> [-in <filename> -in <filename> ...] [-out <filename>] [other options]");
672 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
673 Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
674 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
675 Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts);
677 OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
678 Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
680 OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options");
681 Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "", m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts, "");
682 Options::AddOption("-mapqual", "print the mapping qualities", m_settings->IsPrintingPileupMapQualities, PileupOpts);
684 OptionGroup* SamOpts = Options::CreateOptionGroup("SAM Options");
685 Options::AddOption("-noheader", "omit the SAM header from output", m_settings->IsOmittingSamHeader, SamOpts);
688 ConvertTool::~ConvertTool(void) {
696 int ConvertTool::Help(void) {
697 Options::DisplayHelp();
701 int ConvertTool::Run(int argc, char* argv[]) {
703 // parse command line arguments
704 Options::Parse(argc, argv, 1);
706 // run internal ConvertTool implementation, return success/fail
707 m_impl = new ConvertToolPrivate(m_settings);
715 // ---------------------------------------------
716 // ConvertPileupFormatVisitor implementation
718 ConvertPileupFormatVisitor::ConvertPileupFormatVisitor(const RefVector& references,
719 const string& fastaFilename,
720 const bool isPrintingMapQualities,
724 , m_isPrintingMapQualities(isPrintingMapQualities)
726 , m_references(references)
728 // set up Fasta reader if file is provided
729 if ( !fastaFilename.empty() ) {
731 // check for FASTA index
732 string indexFilename = "";
733 if ( Utilities::FileExists(fastaFilename + ".fai") )
734 indexFilename = fastaFilename + ".fai";
737 if ( m_fasta.Open(fastaFilename, indexFilename) )
742 ConvertPileupFormatVisitor::~ConvertPileupFormatVisitor(void) {
743 // be sure to close Fasta reader
750 void ConvertPileupFormatVisitor::Visit(const PileupPosition& pileupData ) {
752 // skip if no alignments at this position
753 if ( pileupData.PileupAlignments.empty() ) return;
755 // retrieve reference name
756 const string& referenceName = m_references[pileupData.RefId].RefName;
757 const int& position = pileupData.Position;
759 // retrieve reference base from FASTA file, if one provided; otherwise default to 'N'
760 char referenceBase('N');
761 if ( m_hasFasta && (pileupData.Position < m_references[pileupData.RefId].RefLength) ) {
762 if ( !m_fasta.GetBase(pileupData.RefId, pileupData.Position, referenceBase ) ) {
763 cerr << "Pileup error : Could not read reference base from FASTA file" << endl;
768 // get count of alleles at this position
769 const int numberAlleles = pileupData.PileupAlignments.size();
771 // -----------------------------------------------------------
772 // build strings based on alleles at this positionInAlignment
774 stringstream bases("");
775 stringstream baseQualities("");
776 stringstream mapQualities("");
778 // iterate over alignments at this pileup position
779 vector<PileupAlignment>::const_iterator pileupIter = pileupData.PileupAlignments.begin();
780 vector<PileupAlignment>::const_iterator pileupEnd = pileupData.PileupAlignments.end();
781 for ( ; pileupIter != pileupEnd; ++pileupIter ) {
782 const PileupAlignment pa = (*pileupIter);
783 const BamAlignment& ba = pa.Alignment;
785 // if beginning of read segment
786 if ( pa.IsSegmentBegin )
787 bases << '^' << ( ((int)ba.MapQuality > 93) ? (char)126 : (char)((int)ba.MapQuality+33) );
789 // if current base is not a DELETION
790 if ( !pa.IsCurrentDeletion ) {
792 // get base at current position
793 char base = ba.QueryBases.at(pa.PositionInAlignment);
795 // if base matches reference
797 toupper(base) == toupper(referenceBase) ||
798 tolower(base) == tolower(referenceBase) )
800 base = (ba.IsReverseStrand() ? ',' : '.' );
803 // mismatches reference
804 else base = (ba.IsReverseStrand() ? tolower(base) : toupper(base) );
809 // if next position contains insertion
810 if ( pa.IsNextInsertion ) {
811 bases << '+' << pa.InsertionLength;
812 for (int i = 1; i <= pa.InsertionLength; ++i) {
813 char insertedBase = (char)ba.QueryBases.at(pa.PositionInAlignment + i);
814 bases << (ba.IsReverseStrand() ? (char)tolower(insertedBase) : (char)toupper(insertedBase) );
818 // if next position contains DELETION
819 else if ( pa.IsNextDeletion ) {
820 bases << '-' << pa.DeletionLength;
821 for (int i = 1; i <= pa.DeletionLength; ++i) {
822 char deletedBase('N');
823 if ( m_hasFasta && (pileupData.Position+i < m_references[pileupData.RefId].RefLength) ) {
824 if ( !m_fasta.GetBase(pileupData.RefId, pileupData.Position+i, deletedBase ) ) {
825 cerr << "Pileup error : Could not read reference base from FASTA file" << endl;
829 bases << (ba.IsReverseStrand() ? (char)tolower(deletedBase) : (char)toupper(deletedBase) );
834 // otherwise, DELETION
837 // if end of read segment
838 if ( pa.IsSegmentEnd ) bases << '$';
840 // store current base quality
841 baseQualities << ba.Qualities.at(pa.PositionInAlignment);
843 // save alignment map quality if desired
844 if ( m_isPrintingMapQualities )
845 mapQualities << ( ((int)ba.MapQuality > 93) ? (char)126 : (char)((int)ba.MapQuality+33) );
848 // ----------------------
852 // <refName> <1-based pos> <refBase> <numberAlleles> <bases> <qualities> [mapQuals]
854 const string TAB = "\t";
855 *m_out << referenceName << TAB
856 << position + 1 << TAB
857 << referenceBase << TAB
858 << numberAlleles << TAB
859 << bases.str() << TAB
860 << baseQualities.str() << TAB
861 << mapQualities.str() << endl;