1 // ***************************************************************************
2 // bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 7 June 2010
7 // ---------------------------------------------------------------------------
8 // Converts between BAM and a number of other formats
9 // ***************************************************************************
16 #include "bamtools_convert.h"
17 //#include "bamtools_format.h"
18 #include "bamtools_options.h"
19 #include "bamtools_utilities.h"
21 #include "BamReader.h"
22 #include "BamMultiReader.h"
25 using namespace BamTools;
// Reference-sequence table shared by the print routines below. ConvertTool::Run()
// assigns it from reader.GetReferenceData() before it starts printing alignments.
27 static RefVector references;
// Recognized values of the -format option; ConvertTool::Run() compares
// m_settings->Format against these to pick the output routine.
31 static const string FORMAT_FASTA = "fasta";
32 static const string FORMAT_FASTQ = "fastq";
33 static const string FORMAT_JSON = "json";
34 static const string FORMAT_SAM = "sam";
// Forward declarations of the per-format printers defined later in this file.
36 void PrintFASTA(ostream& out, const BamAlignment& a);
37 void PrintFASTQ(ostream& out, const BamAlignment& a);
38 void PrintJSON(ostream& out, const BamAlignment& a);
39 void PrintSAM(ostream& out, const BamAlignment& a);
41 } // namespace BamTools
43 // ---------------------------------------------
44 // ConvertSettings implementation
// Command-line settings for ConvertTool. The ConvertTool constructor binds
// these members to the -in and -out options via Options::AddValueOption.
46 struct ConvertTool::ConvertSettings {
// Flags handed to Options::AddValueOption together with the matching value.
// NOTE(review): presumably set by the option parser when the option is given - confirm.
49 bool HasInputBamFilenames;
50 bool HasOutputBamFilename;
55 //string InputFilename;
// input filenames, bound to -in
56 vector<string> InputFiles;
// output filename, bound to -out
57 string OutputFilename;
// initial values: both flags start false; the output filename starts as
// Options::StandardOut()
65 : HasInputBamFilenames(false)
66 , HasOutputBamFilename(false)
67 //, InputFilename(Options::StandardIn())
68 , OutputFilename(Options::StandardOut())
72 // ---------------------------------------------
73 // ConvertTool implementation
// Constructor: allocates the settings object and registers the program
// description plus the -in, -out, -format and -region options, each bound to
// a flag/value pair inside m_settings.
75 ConvertTool::ConvertTool(void)
// settings are heap-allocated here with new
77 , m_settings(new ConvertSettings)
79 // set program details
80 Options::SetProgramInfo("bamtools convert", "converts between BAM and a number of other formats", "-in <filename> -out <filename> -format <FORMAT>");
// "Input & Output" group: input file(s), output file, and output format
83 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
84 //Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
// -in and -out pass Options::StandardIn() / Options::StandardOut() as their final argument
// NOTE(review): presumably the default used when the option is omitted - confirm against Options::AddValueOption
85 Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilenames, m_settings->InputFiles, IO_Opts, Options::StandardIn());
86 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
// -format is registered without a trailing default argument
87 Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts);
// "Filters" group: optional genomic region restriction
88 OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
89 Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
// Destructor.
// NOTE(review): m_settings is allocated with new in the constructor - confirm it is released here.
92 ConvertTool::~ConvertTool(void) {
// Shows the help text for this tool by delegating to Options::DisplayHelp().
97 int ConvertTool::Help(void) {
98 Options::DisplayHelp();
// Runs the convert tool: parses the command line, opens the input file(s),
// optionally restricts reading to a region, then dispatches on the requested
// output format. For JSON and SAM every alignment read is printed to cout.
// NOTE(review): the return value is (int)convertedOk, i.e. 1 while the flag is
// still true - confirm the caller does not use it directly as a process exit status.
102 int ConvertTool::Run(int argc, char* argv[]) {
104 bool convertedOk = true;
106 // parse command line arguments
// NOTE(review): the third argument (1) presumably skips a leading token - confirm against Options::Parse
107 Options::Parse(argc, argv, 1);
// open all input files through a single multi-reader and cache its reference
// data in the file-level 'references' table used by the print routines
110 BamMultiReader reader;
// NOTE(review): if Open() reports failure through its return value, it is discarded here - confirm
111 reader.Open(m_settings->InputFiles);
112 references = reader.GetReferenceData();
// apply the region only when the region string parses; a SetRegion() failure
// is reported on cerr
115 if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
116 if ( !reader.SetRegion(region) ) {
117 cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
121 // ----------------------------------------
122 // do conversion, depending on desired output format
125 if ( m_settings->Format == FORMAT_FASTA ) {
126 //cout << "Converting to FASTA" << endl;
130 else if ( m_settings->Format == FORMAT_FASTQ) {
131 //cout << "Converting to FASTQ" << endl;
135 else if ( m_settings->Format == FORMAT_JSON ) {
136 //cout << "Converting to JSON" << endl;
// stream every alignment from the reader to stdout as JSON
137 BamAlignment alignment;
138 while ( reader.GetNextAlignment(alignment) ) {
139 PrintJSON(cout, alignment);
145 else if ( m_settings->Format == FORMAT_SAM ) {
// stream every alignment from the reader to stdout as SAM
146 BamAlignment alignment;
147 while ( reader.GetNextAlignment(alignment) ) {
148 PrintSAM(cout, alignment);
152 // unrecognized format
154 cerr << "Unrecognized format: " << m_settings->Format << endl;
155 cerr << "Please see help|README (?) for details on supported formats " << endl;
159 // ------------------------
162 return (int)convertedOk;
165 // ----------------------------------------------------------
166 // Conversion/output methods
167 // ----------------------------------------------------------
169 // print BamAlignment in FASTA format
// NOTE(review): ConvertTool::Run() shows no visible call to this in its FASTA branch - confirm whether this is still a stub
170 void BamTools::PrintFASTA(ostream& out, const BamAlignment& a) {
174 // print BamAlignment in FASTQ format
// NOTE(review): ConvertTool::Run() shows no visible call to this in its FASTQ branch - confirm whether this is still a stub
175 void BamTools::PrintFASTQ(ostream& out, const BamAlignment& a) {
179 // print BamAlignment in JSON format
// Writes alignment 'a' to 'out' as a JSON object: name, alignmentFlag,
// reference, position, mapQuality, cigar, mate information, queryBases,
// qualities, and the decoded tag data. Reference names are looked up in the
// file-level 'references' table.
// NOTE(review): string fields (Name, QueryBases, Qualities, RefName) are
// streamed as-is on the visible lines, with no JSON escaping - confirm that
// inputs cannot contain quotes or backslashes.
180 void BamTools::PrintJSON(ostream& out, const BamAlignment& a) {
183 // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
185 // write name & alignment flag
// the flag is emitted as a quoted string, unlike position and mapQuality below
186 out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\""
187 << a.AlignmentFlag << "\",";
189 // write reference name
// only written when RefID is a valid index into 'references'
190 if ( (a.RefID >= 0) && (a.RefID < (int)references.size()) ) out << "\"reference\":\"" << references[a.RefID].RefName << "\",";
193 // write position & map quality
// position is written as Position+1 (presumably converting a 0-based coordinate to 1-based - confirm)
194 out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ",";
// CIGAR: emitted as an array of "<length><type>" strings, omitted when empty
197 const vector<CigarOp>& cigarData = a.CigarData;
198 if ( !cigarData.empty() ) {
199 out << "\"cigar\":[";
200 vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
201 vector<CigarOp>::const_iterator cigarIter = cigarBegin;
202 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
203 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
204 const CigarOp& op = (*cigarIter);
// comma separator before every element except the first
205 if (cigarIter != cigarBegin) out << ",";
206 out << "\"" << op.Length << op.Type << "\"";
211 // write mate reference name, mate position, & insert size
// only for paired reads whose MateRefID is a valid index into 'references'
212 if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)references.size()) ) {
214 << "\"reference\":\"" << references[a.MateRefID].RefName << "\","
215 << "\"position\":" << a.MatePosition+1
216 << ",\"insertSize\":" << a.InsertSize << "},";
// query bases and qualities are each omitted when empty
220 if ( !a.QueryBases.empty() ) {
221 out << "\"queryBases\":\"" << a.QueryBases << "\",";
225 if ( !a.Qualities.empty() ) {
226 out << "\"qualities\":\"" << a.Qualities << "\",";
// tag data: walk the raw TagData buffer; each entry is a 2-character tag
// name followed by a type character and a value decoded according to that type
230 const char* tagData = a.TagData.c_str();
231 const size_t tagDataLength = a.TagData.length();
233 if (index < tagDataLength) {
237 while ( index < tagDataLength ) {
// tag name becomes the JSON key
243 out << "\"" << a.TagData.substr(index, 2) << "\":";
// .at() is bounds-checked, unlike the raw tagData[index] reads below
247 char type = a.TagData.at(index);
// single-character value, written quoted
252 out << "\"" << tagData[index] << "\"";
// one-byte integer values
// NOTE(review): both statements cast a plain char to int; if one of them
// handles an unsigned byte, values above 127 may print as negative where
// char is signed - confirm against the case labels
257 out << (int)tagData[index];
262 out << (int)tagData[index];
// multi-byte numeric values are decoded from the buffer by the BgzfData unpack helpers
267 out << BgzfData::UnpackUnsignedShort(&tagData[index]);
272 out << BgzfData::UnpackSignedShort(&tagData[index]);
277 out << BgzfData::UnpackUnsignedInt(&tagData[index]);
282 out << BgzfData::UnpackSignedInt(&tagData[index]);
287 out << BgzfData::UnpackFloat(&tagData[index]);
292 out << BgzfData::UnpackDouble(&tagData[index]);
// NUL-terminated value: copy characters until the terminating '\0'
299 while (tagData[index]) {
300 out << tagData[index];
// a NUL byte at this point ends tag processing
308 if ( tagData[index] == '\0') break;
318 // print BamAlignment in SAM format
// Writes alignment 'a' to 'out' as tab-separated SAM fields in the order
// given by the field-list comment below, followed by one TAG:TYPE:VALUE
// column per entry in the raw tag data. Reference names are looked up in the
// file-level 'references' table.
319 void BamTools::PrintSAM(ostream& out, const BamAlignment& a) {
322 // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
324 // write name & alignment flag
325 out << a.Name << "\t" << a.AlignmentFlag << "\t";
327 // write reference name
// written when RefID is a valid index into 'references'
// NOTE(review): confirm a placeholder column is emitted otherwise, so later fields do not shift
328 if ( (a.RefID >= 0) && (a.RefID < (int)references.size()) ) out << references[a.RefID].RefName << "\t";
331 // write position & map quality
// position is written as Position+1 (presumably converting a 0-based coordinate to 1-based - confirm)
332 out << a.Position+1 << "\t" << a.MapQuality << "\t";
// CIGAR: "*" when there are no operations, otherwise the concatenated <length><type> pairs
335 const vector<CigarOp>& cigarData = a.CigarData;
336 if ( cigarData.empty() ) out << "*\t";
338 vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
339 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
340 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
341 const CigarOp& op = (*cigarIter);
342 out << op.Length << op.Type;
347 // write mate reference name, mate position, & insert size
// only for paired reads whose MateRefID is a valid index into 'references';
// "=" is written when the mate has the same RefID as this alignment
348 if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)references.size()) ) {
349 if ( a.MateRefID == a.RefID ) out << "=\t";
350 else out << references[a.MateRefID].RefName << "\t";
351 out << a.MatePosition+1 << "\t" << a.InsertSize << "\t";
// otherwise write placeholders for mate reference, mate position and insert size
353 else out << "*\t0\t0\t";
// query bases: "*" when empty
356 if ( a.QueryBases.empty() ) out << "*\t";
357 else out << a.QueryBases << "\t";
// qualities: "*" when empty; no trailing tab, because each tag column
// below starts with its own leading tab
360 if ( a.Qualities.empty() ) out << "*";
361 else out << a.Qualities;
// tag data: walk the raw TagData buffer; each entry is a 2-character tag
// name followed by a type character and a value decoded according to that type
364 const char* tagData = a.TagData.c_str();
365 const size_t tagDataLength = a.TagData.length();
368 while ( index < tagDataLength ) {
371 out << "\t" << a.TagData.substr(index, 2) << ":";
// .at() is bounds-checked, unlike the raw tagData[index] reads below
375 char type = a.TagData.at(index);
// single-character value, SAM type 'A'
380 out << "A:" << tagData[index];
// every integer width is reported with SAM type 'i'
// NOTE(review): the two one-byte statements cast a plain char to int; if one
// of them handles an unsigned byte, values above 127 may print as negative
// where char is signed - confirm against the case labels
385 out << "i:" << (int)tagData[index];
390 out << "i:" << (int)tagData[index];
// multi-byte integers are decoded from the buffer by the BgzfData unpack helpers
395 out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]);
400 out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]);
405 out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]);
410 out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]);
// floating-point values: type 'f' for float, 'd' for double
415 out << "f:" << BgzfData::UnpackFloat(&tagData[index]);
420 out << "d:" << BgzfData::UnpackDouble(&tagData[index]);
// NUL-terminated value: copy characters until the terminating '\0'
427 while (tagData[index]) {
428 out << tagData[index];
// a NUL byte at this point ends tag processing
435 if ( tagData[index] == '\0') break;