1 // ***************************************************************************
2 // bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 9 July 2010
7 // ---------------------------------------------------------------------------
8 // Converts between BAM and a number of other formats
9 // ***************************************************************************
17 #include "bamtools_convert.h"
18 #include "bamtools_options.h"
19 #include "bamtools_utilities.h"
21 #include "BamReader.h"
22 #include "BamMultiReader.h"
25 using namespace BamTools;
30 static const string FORMAT_FASTA_LOWER = "fasta";
31 static const string FORMAT_FASTA_UPPER = "FASTA";
32 static const string FORMAT_FASTQ_LOWER = "fastq";
33 static const string FORMAT_FASTQ_UPPER = "FASTQ";
34 static const string FORMAT_JSON_LOWER = "json";
35 static const string FORMAT_JSON_UPPER = "JSON";
36 static const string FORMAT_SAM_LOWER = "sam";
37 static const string FORMAT_SAM_UPPER = "SAM";
40 static const unsigned int FASTA_LINE_MAX = 50;
42 } // namespace BamTools
44 struct ConvertTool::ConvertToolPrivate {
48 ConvertToolPrivate(ConvertTool::ConvertSettings* settings);
49 ~ConvertToolPrivate(void);
57 void PrintFASTA(const BamAlignment& a);
58 void PrintFASTQ(const BamAlignment& a);
59 void PrintJSON(const BamAlignment& a);
60 void PrintSAM(const BamAlignment& a);
64 ConvertTool::ConvertSettings* m_settings;
65 RefVector m_references;
69 // ---------------------------------------------
70 // ConvertSettings implementation
72 struct ConvertTool::ConvertSettings {
75 bool HasInputFilenames;
76 bool HasOutputFilename;
81 vector<string> InputFiles;
82 string OutputFilename;
88 : HasInputFilenames(false)
89 , HasOutputFilename(false)
92 , OutputFilename(Options::StandardOut())
96 // ---------------------------------------------
97 // ConvertTool implementation
99 ConvertTool::ConvertTool(void)
101 , m_settings(new ConvertSettings)
104 // set program details
105 Options::SetProgramInfo("bamtools convert", "converts between BAM and a number of other formats", "-in <filename> -out <filename> -format <FORMAT>");
108 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
109 Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputFilenames, m_settings->InputFiles, IO_Opts, Options::StandardIn());
110 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
111 Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts);
113 OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
114 Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
117 ConvertTool::~ConvertTool(void) {
125 int ConvertTool::Help(void) {
126 Options::DisplayHelp();
130 int ConvertTool::Run(int argc, char* argv[]) {
132 // parse command line arguments
133 Options::Parse(argc, argv, 1);
135 // run internal ConvertTool implementation, return success/fail
136 m_impl = new ConvertToolPrivate(m_settings);
144 // ---------------------------------------------
145 // ConvertToolPrivate implementation
147 ConvertTool::ConvertToolPrivate::ConvertToolPrivate(ConvertTool::ConvertSettings* settings)
148 : m_settings(settings)
149 , m_out(cout.rdbuf()) // default output to cout
152 ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { }
154 bool ConvertTool::ConvertToolPrivate::Run(void) {
156 bool convertedOk = true;
158 // ------------------------------------
159 // initialize conversion input/output
161 // set to default input if none provided
162 if ( !m_settings->HasInputFilenames )
163 m_settings->InputFiles.push_back(Options::StandardIn());
166 BamMultiReader reader;
167 reader.Open(m_settings->InputFiles);
168 m_references = reader.GetReferenceData();
171 if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
172 if ( !reader.SetRegion(region) ) {
173 cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
177 // if an output filename given, open outfile
179 if ( m_settings->HasOutputFilename ) {
180 outFile.open(m_settings->OutputFilename.c_str());
181 if (!outFile) { cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; return false; }
182 m_out.rdbuf(outFile.rdbuf()); // set m_out to file's streambuf
185 // ------------------------
189 if ( m_settings->Format == FORMAT_FASTA_LOWER || m_settings->Format == FORMAT_FASTA_UPPER ) {
190 BamAlignment alignment;
191 while ( reader.GetNextAlignment(alignment) ) {
192 PrintFASTA(alignment);
197 else if ( m_settings->Format == FORMAT_FASTQ_LOWER || m_settings->Format == FORMAT_FASTQ_UPPER ) {
198 BamAlignment alignment;
199 while ( reader.GetNextAlignment(alignment) ) {
200 PrintFASTQ(alignment);
205 else if ( m_settings->Format == FORMAT_JSON_LOWER || m_settings->Format == FORMAT_JSON_UPPER ) {
206 BamAlignment alignment;
207 while ( reader.GetNextAlignment(alignment) ) {
208 PrintJSON(alignment);
213 else if ( m_settings->Format == FORMAT_SAM_LOWER || m_settings->Format == FORMAT_SAM_UPPER ) {
214 BamAlignment alignment;
215 while ( reader.GetNextAlignment(alignment) ) {
222 cerr << "Unrecognized format: " << m_settings->Format << endl;
223 cerr << "Please see help|README (?) for details on supported formats " << endl;
227 // ------------------------
230 if ( m_settings->HasOutputFilename ) outFile.close();
234 // ----------------------------------------------------------
235 // Conversion/output methods
236 // ----------------------------------------------------------
238 // print BamAlignment in FASTA format
239 // N.B. - uses QueryBases NOT AlignedBases
240 void ConvertTool::ConvertToolPrivate::PrintFASTA(const BamAlignment& a) {
242 // >BamAlignment.Name
243 // BamAlignment.QueryBases
247 m_out << "> " << a.Name << endl;
249 // if sequence fits on single line
250 if ( a.QueryBases.length() <= FASTA_LINE_MAX )
251 m_out << a.QueryBases << endl;
253 // else split over multiple lines
257 size_t seqLength = a.QueryBases.length();
259 // write subsequences to each line
260 while ( position < (seqLength - FASTA_LINE_MAX) ) {
261 m_out << a.QueryBases.substr(position, FASTA_LINE_MAX) << endl;
262 position += FASTA_LINE_MAX;
265 // write final subsequence
266 m_out << a.QueryBases.substr(position) << endl;
270 // print BamAlignment in FASTQ format
271 // N.B. - uses QueryBases NOT AlignedBases
272 void ConvertTool::ConvertToolPrivate::PrintFASTQ(const BamAlignment& a) {
274 // @BamAlignment.Name
275 // BamAlignment.QueryBases
277 // BamAlignment.Qualities
279 m_out << "@" << a.Name << endl
280 << a.QueryBases << endl
282 << a.Qualities << endl;
285 // print BamAlignment in JSON format
286 void ConvertTool::ConvertToolPrivate::PrintJSON(const BamAlignment& a) {
288 // write name & alignment flag
289 m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\",";
291 // write reference name
292 if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) )
293 m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\",";
295 // write position & map quality
296 m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ",";
299 const vector<CigarOp>& cigarData = a.CigarData;
300 if ( !cigarData.empty() ) {
301 m_out << "\"cigar\":[";
302 vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
303 vector<CigarOp>::const_iterator cigarIter = cigarBegin;
304 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
305 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
306 const CigarOp& op = (*cigarIter);
307 if (cigarIter != cigarBegin) m_out << ",";
308 m_out << "\"" << op.Length << op.Type << "\"";
313 // write mate reference name, mate position, & insert size
314 if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
315 m_out << "\"mate\":{"
316 << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\","
317 << "\"position\":" << a.MatePosition+1
318 << ",\"insertSize\":" << a.InsertSize << "},";
322 if ( !a.QueryBases.empty() )
323 m_out << "\"queryBases\":\"" << a.QueryBases << "\",";
326 if ( !a.Qualities.empty() )
327 m_out << "\"qualities\":\"" << a.Qualities << "\",";
330 const char* tagData = a.TagData.c_str();
331 const size_t tagDataLength = a.TagData.length();
333 if (index < tagDataLength) {
335 m_out << "\"tags\":{";
337 while ( index < tagDataLength ) {
343 m_out << "\"" << a.TagData.substr(index, 2) << "\":";
347 char type = a.TagData.at(index);
352 m_out << "\"" << tagData[index] << "\"";
357 m_out << (int)tagData[index];
362 m_out << (int)tagData[index];
367 m_out << BgzfData::UnpackUnsignedShort(&tagData[index]);
372 m_out << BgzfData::UnpackSignedShort(&tagData[index]);
377 m_out << BgzfData::UnpackUnsignedInt(&tagData[index]);
382 m_out << BgzfData::UnpackSignedInt(&tagData[index]);
387 m_out << BgzfData::UnpackFloat(&tagData[index]);
392 m_out << BgzfData::UnpackDouble(&tagData[index]);
399 while (tagData[index]) {
400 m_out << tagData[index];
408 if ( tagData[index] == '\0')
415 m_out << "}" << endl;
419 // print BamAlignment in SAM format
420 void ConvertTool::ConvertToolPrivate::PrintSAM(const BamAlignment& a) {
423 // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
425 // write name & alignment flag
426 m_out << a.Name << "\t" << a.AlignmentFlag << "\t";
428 // write reference name
429 if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) )
430 m_out << m_references[a.RefID].RefName << "\t";
434 // write position & map quality
435 m_out << a.Position+1 << "\t" << a.MapQuality << "\t";
438 const vector<CigarOp>& cigarData = a.CigarData;
439 if ( cigarData.empty() ) m_out << "*\t";
441 vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
442 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
443 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
444 const CigarOp& op = (*cigarIter);
445 m_out << op.Length << op.Type;
450 // write mate reference name, mate position, & insert size
451 if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
452 if ( a.MateRefID == a.RefID ) m_out << "=\t";
453 else m_out << m_references[a.MateRefID].RefName << "\t";
454 m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t";
456 else m_out << "*\t0\t0\t";
459 if ( a.QueryBases.empty() ) m_out << "*\t";
460 else m_out << a.QueryBases << "\t";
463 if ( a.Qualities.empty() ) m_out << "*";
464 else m_out << a.Qualities;
467 const char* tagData = a.TagData.c_str();
468 const size_t tagDataLength = a.TagData.length();
471 while ( index < tagDataLength ) {
474 m_out << "\t" << a.TagData.substr(index, 2) << ":";
478 char type = a.TagData.at(index);
483 m_out << "A:" << tagData[index];
488 m_out << "i:" << (int)tagData[index];
493 m_out << "i:" << (int)tagData[index];
498 m_out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]);
503 m_out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]);
508 m_out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]);
513 m_out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]);
518 m_out << "f:" << BgzfData::UnpackFloat(&tagData[index]);
523 m_out << "d:" << BgzfData::UnpackDouble(&tagData[index]);
529 m_out << type << ":";
530 while (tagData[index]) {
531 m_out << tagData[index];
538 if ( tagData[index] == '\0')