1 // ***************************************************************************
2 // bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 7 June 2010
7 // ---------------------------------------------------------------------------
8 // Converts between BAM and a number of other formats
9 // ***************************************************************************
16 #include "bamtools_convert.h"
17 //#include "bamtools_format.h"
18 #include "bamtools_options.h"
20 #include "BamReader.h"
21 #include "BamMultiReader.h"
24 using namespace BamTools;
// Reference sequence table for the input file. Assigned in ConvertTool::Run from
// reader.GetReferenceData() and read by PrintJSON / PrintSAM to turn RefID and
// MateRefID into reference names.
26 static RefVector references;
// Recognized values of the -format option; ConvertTool::Run compares
// m_settings->Format against these to pick a conversion branch.
30 static const string FORMAT_FASTA = "fasta";
31 static const string FORMAT_FASTQ = "fastq";
32 static const string FORMAT_JSON = "json";
33 static const string FORMAT_SAM = "sam";
// Forward declarations of the per-format writers. Each takes the destination
// stream and the alignment to write.
35 void PrintFASTA(ostream& out, const BamAlignment& a);
36 void PrintFASTQ(ostream& out, const BamAlignment& a);
37 void PrintJSON(ostream& out, const BamAlignment& a);
38 void PrintSAM(ostream& out, const BamAlignment& a);
40 } // namespace BamTools
42 // ---------------------------------------------
43 // ConvertSettings implementation
// Command-line state for ConvertTool. The ConvertTool constructor hands these
// members to Options::AddValueOption for the -in, -out and -format options.
45 struct ConvertTool::ConvertSettings {
// Flags paired with the -in / -out options (presumably set by Options when the
// option appears on the command line — confirm against Options::AddValueOption).
48 bool HasInputBamFilename;
49 bool HasOutputBamFilename;
// Value paired with the -out option.
54 string OutputFilename;
// Constructor initializer list: both flags start false, and the filenames start
// as Options::StandardIn() / Options::StandardOut().
59 : HasInputBamFilename(false)
60 , HasOutputBamFilename(false)
61 , InputFilename(Options::StandardIn())
62 , OutputFilename(Options::StandardOut())
66 // ---------------------------------------------
67 // ConvertTool implementation
// Constructs the tool: allocates the settings object with `new`, registers the
// program name / description / usage line, and declares the -in, -out and
// -format options in an "Input & Output" option group.
// NOTE(review): -out is registered here, but the JSON and SAM branches of Run
// pass `cout` to the writers and m_settings->OutputFilename is not read in the
// visible code — confirm whether -out is honoured.
69 ConvertTool::ConvertTool(void)
71 , m_settings(new ConvertSettings)
73 // set program details
74 Options::SetProgramInfo("bamtools convert", "converts between BAM and a number of other formats", "-in <filename> -out <filename> -format <FORMAT>");
// Each option stores into a Has* flag and a value member of m_settings;
// -in and -out additionally pass Options::StandardIn() / StandardOut() as a last argument.
77 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
78 Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
79 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
80 Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts);
// Destructor. m_settings is allocated with `new` in the constructor —
// NOTE(review): confirm it is released here.
83 ConvertTool::~ConvertTool(void) {
// Displays the tool's help text by delegating to Options::DisplayHelp().
88 int ConvertTool::Help(void) {
89 Options::DisplayHelp();
// Entry point of the convert tool: parses the command line, opens the input
// named by m_settings->InputFilename, caches its reference data in the
// file-scope `references` table, then dispatches on m_settings->Format.
// The JSON and SAM branches read alignments one at a time and write each to cout.
// Returns convertedOk cast to int.
// NOTE(review): that yields 1 when convertedOk is true — confirm this matches
// the return convention the caller expects (e.g. if it is used as an exit status).
93 int ConvertTool::Run(int argc, char* argv[]) {
95 bool convertedOk = true;
97 // parse command line arguments
98 Options::Parse(argc, argv, 1);
// NOTE(review): the result of Open() is not checked on this line — confirm a
// failed open is handled before the reference data is read.
102 reader.Open(m_settings->InputFilename);
103 references = reader.GetReferenceData();
105 // ----------------------------------------
106 // do conversion, depending on desired output format
109 if ( m_settings->Format == FORMAT_FASTA ) {
110 //cout << "Converting to FASTA" << endl;
114 else if ( m_settings->Format == FORMAT_FASTQ) {
115 //cout << "Converting to FASTQ" << endl;
119 else if ( m_settings->Format == FORMAT_JSON ) {
120 //cout << "Converting to JSON" << endl;
// stream every alignment from the reader through the JSON writer
121 BamAlignment alignment;
122 while ( reader.GetNextAlignment(alignment) ) {
123 PrintJSON(cout, alignment);
129 else if ( m_settings->Format == FORMAT_SAM ) {
// stream every alignment from the reader through the SAM writer
130 BamAlignment alignment;
131 while ( reader.GetNextAlignment(alignment) ) {
132 PrintSAM(cout, alignment);
136 // unrecognized format
// report the offending -format value on stderr
138 cerr << "Unrecognized format: " << m_settings->Format << endl;
139 cerr << "Please see help|README (?) for details on supported formats " << endl;
143 // ------------------------
146 return (int)convertedOk;
149 // ----------------------------------------------------------
150 // Conversion/output methods
151 // ----------------------------------------------------------
153 // print BamAlignment in FASTA format
// Parameters: `out` is the destination stream, `a` the alignment to write.
// NOTE(review): the FASTA branch of ConvertTool::Run shows no call to this
// function in the visible code — confirm it is wired up.
154 void BamTools::PrintFASTA(ostream& out, const BamAlignment& a) {
158 // print BamAlignment in FASTQ format
// Parameters: `out` is the destination stream, `a` the alignment to write.
// NOTE(review): the FASTQ branch of ConvertTool::Run shows no call to this
// function in the visible code — confirm it is wired up.
159 void BamTools::PrintFASTQ(ostream& out, const BamAlignment& a) {
163 // print BamAlignment in JSON format
// Writes alignment `a` to `out` as JSON text, in this order: name and
// alignmentFlag, the reference name (only when RefID indexes into the
// file-scope `references` table), position and mapQuality, the CIGAR array
// (only when non-empty), the mate fields (only for paired reads whose MateRefID
// is a valid index), queryBases and qualities (only when non-empty), and
// finally the tag data decoded according to each tag's type byte.
// Parameters: `out` is the destination stream, `a` the alignment to write.
// NOTE(review): Name, QueryBases, Qualities and string tag values are streamed
// without JSON escaping; a '"' or '\' in any of them would break the output.
164 void BamTools::PrintJSON(ostream& out, const BamAlignment& a) {
167 // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
169 // write name & alignment flag
170 out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\""
171 << a.AlignmentFlag << "\",";
173 // write reference name
// skipped entirely when RefID is negative or beyond the reference table
174 if ( (a.RefID >= 0) && (a.RefID < (int)references.size()) ) out << "\"reference\":\"" << references[a.RefID].RefName << "\",";
177 // write position & map quality
// Position is printed with 1 added (presumably 0-based -> 1-based; confirm)
178 out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ",";
// write CIGAR as an array of [length,"type"] pairs
181 const vector<CigarOp>& cigarData = a.CigarData;
182 if ( !cigarData.empty() ) {
183 out << "\"cigar\":[";
184 vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
185 vector<CigarOp>::const_iterator cigarIter = cigarBegin;
186 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
187 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
188 const CigarOp& op = (*cigarIter);
// comma goes before every element except the first
189 if (cigarIter != cigarBegin) out << ",";
// The length is emitted as a bare number and the type as a quoted string.
// Opening a quote before the length without closing it yields ["76,"M"],
// which is not valid JSON.
190 out << "[" << op.Length << ",\"" << op.Type << "\"]";
195 // write mate reference name, mate position, & insert size
196 if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)references.size()) ) {
198 << "\"reference\":\"" << references[a.MateRefID].RefName << "\","
199 << "\"position\":" << a.MatePosition+1
200 << ",\"insertSize\":" << a.InsertSize << "},";
// write sequence, omitted when empty
204 if ( !a.QueryBases.empty() ) {
205 out << "\"queryBases\":\"" << a.QueryBases << "\",";
// write qualities, omitted when empty
209 if ( !a.Qualities.empty() ) {
210 out << "\"qualities\":\"" << a.Qualities << "\",";
// write tag data: walk the raw TagData buffer with `index`
214 const char* tagData = a.TagData.c_str();
215 const size_t tagDataLength = a.TagData.length();
217 if (index < tagDataLength) {
221 while ( index < tagDataLength ) {
// two-character tag name becomes the JSON key
227 out << "\"" << a.TagData.substr(index, 2) << "\":";
// type byte selects which of the output statements below applies
231 char type = a.TagData.at(index);
// single character, quoted
236 out << "\"" << tagData[index] << "\"";
// one-byte values, printed as integers
// NOTE(review): (int) on a plain char sign-extends where char is signed —
// confirm this is right for the unsigned one-byte tag type.
241 out << (int)tagData[index];
246 out << (int)tagData[index];
// multi-byte numeric values are decoded from the raw bytes by the BgzfData::Unpack* helpers
251 out << BgzfData::UnpackUnsignedShort(&tagData[index]);
256 out << BgzfData::UnpackSignedShort(&tagData[index]);
261 out << BgzfData::UnpackUnsignedInt(&tagData[index]);
266 out << BgzfData::UnpackSignedInt(&tagData[index]);
271 out << BgzfData::UnpackFloat(&tagData[index]);
276 out << BgzfData::UnpackDouble(&tagData[index]);
// NUL-terminated value: copy characters until the terminator
283 while (tagData[index]) {
284 out << tagData[index];
// a NUL where the next tag would start ends the tag loop
292 if ( tagData[index] == '\0') break;
302 // print BamAlignment in SAM format
// Writes alignment `a` to `out` as tab-separated SAM fields in the column
// order listed below. Reads the file-scope `references` table to turn
// RefID / MateRefID into reference names.
// Parameters: `out` is the destination stream, `a` the alignment to write.
303 void BamTools::PrintSAM(ostream& out, const BamAlignment& a) {
306 // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
308 // write name & alignment flag
309 out << a.Name << "\t" << a.AlignmentFlag << "\t";
311 // write reference name
// only when RefID is a valid index into the reference table
312 if ( (a.RefID >= 0) && (a.RefID < (int)references.size()) ) out << references[a.RefID].RefName << "\t";
315 // write position & map quality
// Position is printed with 1 added (presumably 0-based -> 1-based; confirm)
316 out << a.Position+1 << "\t" << a.MapQuality << "\t";
// write CIGAR: "*" when there are no operations, otherwise each op as <length><type> with no separator
319 const vector<CigarOp>& cigarData = a.CigarData;
320 if ( cigarData.empty() ) out << "*\t";
322 vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
323 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
324 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
325 const CigarOp& op = (*cigarIter);
326 out << op.Length << op.Type;
331 // write mate reference name, mate position, & insert size
332 if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)references.size()) ) {
// "=" stands in for the mate reference name when it equals the read's own reference
333 if ( a.MateRefID == a.RefID ) out << "=\t";
334 else out << references[a.MateRefID].RefName << "\t";
335 out << a.MatePosition+1 << "\t" << a.InsertSize << "\t";
// unpaired read or out-of-range mate reference: placeholder mate columns
337 else out << "*\t0\t0\t";
// write sequence, "*" when empty
340 if ( a.QueryBases.empty() ) out << "*\t";
341 else out << a.QueryBases << "\t";
// write qualities, "*" when empty; no trailing tab because each tag below writes its own leading tab
344 if ( a.Qualities.empty() ) out << "*";
345 else out << a.Qualities;
// write tag data: walk the raw TagData buffer with `index`
348 const char* tagData = a.TagData.c_str();
349 const size_t tagDataLength = a.TagData.length();
352 while ( index < tagDataLength ) {
// tab, then the two-character tag name and a colon
355 out << "\t" << a.TagData.substr(index, 2) << ":";
// type byte selects which of the output statements below applies
359 char type = a.TagData.at(index);
// single character, written with the "A:" type prefix
364 out << "A:" << tagData[index];
// one-byte values, printed as integers with the "i:" prefix
// NOTE(review): (int) on a plain char sign-extends where char is signed —
// confirm this is right for the unsigned one-byte tag type.
369 out << "i:" << (int)tagData[index];
374 out << "i:" << (int)tagData[index];
// multi-byte numeric values are decoded by the BgzfData::Unpack* helpers;
// the integer widths all use the "i:" prefix, float "f:", double "d:"
379 out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]);
384 out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]);
389 out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]);
394 out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]);
399 out << "f:" << BgzfData::UnpackFloat(&tagData[index]);
404 out << "d:" << BgzfData::UnpackDouble(&tagData[index]);
// NUL-terminated value: copy characters until the terminator
411 while (tagData[index]) {
412 out << tagData[index];
// a NUL where the next tag would start ends the tag loop
419 if ( tagData[index] == '\0') break;