1 // ***************************************************************************
2 // bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 9 July 2010
7 // ---------------------------------------------------------------------------
8 // Converts between BAM and a number of other formats
9 // ***************************************************************************
17 #include "bamtools_convert.h"
18 #include "bamtools_options.h"
20 #include "BamReader.h"
21 #include "BamMultiReader.h"
24 using namespace BamTools;
29 static const string FORMAT_FASTA_LOWER = "fasta";
30 static const string FORMAT_FASTA_UPPER = "FASTA";
31 static const string FORMAT_FASTQ_LOWER = "fastq";
32 static const string FORMAT_FASTQ_UPPER = "FASTQ";
33 static const string FORMAT_JSON_LOWER = "json";
34 static const string FORMAT_JSON_UPPER = "JSON";
35 static const string FORMAT_SAM_LOWER = "sam";
36 static const string FORMAT_SAM_UPPER = "SAM";
39 static const unsigned int FASTA_LINE_MAX = 50;
41 } // namespace BamTools
43 struct ConvertTool::ConvertToolPrivate {
47 ConvertToolPrivate(ConvertTool::ConvertSettings* settings);
48 ~ConvertToolPrivate(void);
56 void PrintFASTA(const BamAlignment& a);
57 void PrintFASTQ(const BamAlignment& a);
58 void PrintJSON(const BamAlignment& a);
59 void PrintSAM(const BamAlignment& a);
63 ConvertTool::ConvertSettings* m_settings;
64 RefVector m_references;
68 // ---------------------------------------------
69 // ConvertSettings implementation
71 struct ConvertTool::ConvertSettings {
74 bool HasInputFilenames;
75 bool HasOutputFilename;
80 vector<string> InputFilenames;
81 string OutputFilename;
87 : HasInputFilenames(false)
88 , HasOutputFilename(false)
91 , OutputFilename(Options::StandardOut())
95 // ---------------------------------------------
96 // ConvertTool implementation
98 ConvertTool::ConvertTool(void)
100 , m_settings(new ConvertSettings)
103 // set program details
104 Options::SetProgramInfo("bamtools convert", "converts between BAM and a number of other formats", "-in <filename> -out <filename> -format <FORMAT>");
107 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
108 Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilenames, m_settings->InputFiles, IO_Opts, Options::StandardIn());
109 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
110 Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts);
112 OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
113 Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
116 ConvertTool::~ConvertTool(void) {
124 int ConvertTool::Help(void) {
125 Options::DisplayHelp();
129 int ConvertTool::Run(int argc, char* argv[]) {
131 // parse command line arguments
132 Options::Parse(argc, argv, 1);
134 // run internal ConvertTool implementation, return success/fail
135 m_impl = new ConvertToolPrivate(m_settings);
143 // ---------------------------------------------
144 // ConvertToolPrivate implementation
146 ConvertTool::ConvertToolPrivate::ConvertToolPrivate(ConvertTool::ConvertSettings* settings)
147 : m_settings(settings)
148 , m_out(cout.rdbuf()) // default output to cout
151 ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { }
153 bool ConvertTool::ConvertToolPrivate::Run(void) {
155 bool convertedOk = true;
157 // ------------------------------------
158 // initialize conversion input/output
160 // set to default input if none provided
161 if ( !m_settings->HasInputBamFilename )
162 m_settings->InputFiles.push_back(Options::StandardIn());
165 BamMultiReader reader;
166 reader.Open(m_settings->InputFiles);
167 references = reader.GetReferenceData();
170 if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
171 if ( !reader.SetRegion(region) ) {
172 cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
176 // if an output filename given, open outfile
178 if ( m_settings->HasOutputFilename ) {
179 outFile.open(m_settings->OutputFilename.c_str());
180 if (!outFile) { cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; return false; }
181 m_out.rdbuf(outFile.rdbuf()); // set m_out to file's streambuf
184 // ------------------------
188 if ( m_settings->Format == FORMAT_FASTA_LOWER || m_settings->Format == FORMAT_FASTA_UPPER ) {
189 BamAlignment alignment;
190 while ( reader.GetNextAlignment(alignment) ) {
191 PrintFASTA(alignment);
196 else if ( m_settings->Format == FORMAT_FASTQ_LOWER || m_settings->Format == FORMAT_FASTQ_UPPER ) {
197 BamAlignment alignment;
198 while ( reader.GetNextAlignment(alignment) ) {
199 PrintFASTQ(alignment);
204 else if ( m_settings->Format == FORMAT_JSON_LOWER || m_settings->Format == FORMAT_JSON_UPPER ) {
205 BamAlignment alignment;
206 while ( reader.GetNextAlignment(alignment) ) {
207 PrintJSON(alignment);
212 else if ( m_settings->Format == FORMAT_SAM_LOWER || m_settings->Format == FORMAT_SAM_UPPER ) {
213 BamAlignment alignment;
214 while ( reader.GetNextAlignment(alignment) ) {
221 cerr << "Unrecognized format: " << m_settings->Format << endl;
222 cerr << "Please see help|README (?) for details on supported formats " << endl;
226 // ------------------------
229 if ( m_settings->HasOutputFilename ) outFile.close();
233 // ----------------------------------------------------------
234 // Conversion/output methods
235 // ----------------------------------------------------------
237 // print BamAlignment in FASTA format
238 // N.B. - uses QueryBases NOT AlignedBases
239 void ConvertTool::ConvertToolPrivate::PrintFASTA(const BamAlignment& a) {
241 // >BamAlignment.Name
242 // BamAlignment.QueryBases
246 m_out << "> " << a.Name << endl;
248 // if sequence fits on single line
249 if ( a.QueryBases.length() <= FASTA_LINE_MAX )
250 m_out << a.QueryBases << endl;
252 // else split over multiple lines
256 size_t seqLength = a.QueryBases.length();
258 // write subsequences to each line
259 while ( position < (seqLength - FASTA_LINE_MAX) ) {
260 m_out << a.QueryBases.substr(position, FASTA_LINE_MAX) << endl;
261 position += FASTA_LINE_MAX;
264 // write final subsequence
265 m_out << a.QueryBases.substr(position) << endl;
269 // print BamAlignment in FASTQ format
270 // N.B. - uses QueryBases NOT AlignedBases
271 void ConvertTool::ConvertToolPrivate::PrintFASTQ(const BamAlignment& a) {
273 // @BamAlignment.Name
274 // BamAlignment.QueryBases
276 // BamAlignment.Qualities
278 m_out << "@" << a.Name << endl
279 << a.QueryBases << endl
281 << a.Qualities << endl;
284 // print BamAlignment in JSON format
285 void ConvertTool::ConvertToolPrivate::PrintJSON(const BamAlignment& a) {
287 // write name & alignment flag
288 m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\",";
290 // write reference name
291 if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) )
292 m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\",";
294 // write position & map quality
295 m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ",";
298 const vector<CigarOp>& cigarData = a.CigarData;
299 if ( !cigarData.empty() ) {
300 m_out << "\"cigar\":[";
301 vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
302 vector<CigarOp>::const_iterator cigarIter = cigarBegin;
303 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
304 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
305 const CigarOp& op = (*cigarIter);
306 if (cigarIter != cigarBegin) m_out << ",";
307 m_out << "\"" << op.Length << op.Type << "\"";
312 // write mate reference name, mate position, & insert size
313 if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
314 m_out << "\"mate\":{"
315 << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\","
316 << "\"position\":" << a.MatePosition+1
317 << ",\"insertSize\":" << a.InsertSize << "},";
321 if ( !a.QueryBases.empty() )
322 m_out << "\"queryBases\":\"" << a.QueryBases << "\",";
325 if ( !a.Qualities.empty() )
326 m_out << "\"qualities\":\"" << a.Qualities << "\",";
329 const char* tagData = a.TagData.c_str();
330 const size_t tagDataLength = a.TagData.length();
332 if (index < tagDataLength) {
334 m_out << "\"tags\":{";
336 while ( index < tagDataLength ) {
342 m_out << "\"" << a.TagData.substr(index, 2) << "\":";
346 char type = a.TagData.at(index);
351 m_out << "\"" << tagData[index] << "\"";
356 m_out << (int)tagData[index];
361 m_out << (int)tagData[index];
366 m_out << BgzfData::UnpackUnsignedShort(&tagData[index]);
371 m_out << BgzfData::UnpackSignedShort(&tagData[index]);
376 m_out << BgzfData::UnpackUnsignedInt(&tagData[index]);
381 m_out << BgzfData::UnpackSignedInt(&tagData[index]);
386 m_out << BgzfData::UnpackFloat(&tagData[index]);
391 m_out << BgzfData::UnpackDouble(&tagData[index]);
398 while (tagData[index]) {
399 m_out << tagData[index];
407 if ( tagData[index] == '\0')
414 m_out << "}" << endl;
418 // print BamAlignment in SAM format
419 void ConvertTool::ConvertToolPrivate::PrintSAM(const BamAlignment& a) {
422 // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
424 // write name & alignment flag
425 m_out << a.Name << "\t" << a.AlignmentFlag << "\t";
427 // write reference name
428 if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) )
429 m_out << m_references[a.RefID].RefName << "\t";
433 // write position & map quality
434 m_out << a.Position+1 << "\t" << a.MapQuality << "\t";
437 const vector<CigarOp>& cigarData = a.CigarData;
438 if ( cigarData.empty() ) m_out << "*\t";
440 vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
441 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
442 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
443 const CigarOp& op = (*cigarIter);
444 m_out << op.Length << op.Type;
449 // write mate reference name, mate position, & insert size
450 if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
451 if ( a.MateRefID == a.RefID ) m_out << "=\t";
452 else m_out << m_references[a.MateRefID].RefName << "\t";
453 m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t";
455 else m_out << "*\t0\t0\t";
458 if ( a.QueryBases.empty() ) m_out << "*\t";
459 else m_out << a.QueryBases << "\t";
462 if ( a.Qualities.empty() ) m_out << "*";
463 else m_out << a.Qualities;
466 const char* tagData = a.TagData.c_str();
467 const size_t tagDataLength = a.TagData.length();
470 while ( index < tagDataLength ) {
473 m_out << "\t" << a.TagData.substr(index, 2) << ":";
477 char type = a.TagData.at(index);
482 m_out << "A:" << tagData[index];
487 m_out << "i:" << (int)tagData[index];
492 m_out << "i:" << (int)tagData[index];
497 m_out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]);
502 m_out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]);
507 m_out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]);
512 m_out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]);
517 m_out << "f:" << BgzfData::UnpackFloat(&tagData[index]);
522 m_out << "d:" << BgzfData::UnpackDouble(&tagData[index]);
528 m_out << type << ":";
529 while (tagData[index]) {
530 m_out << tagData[index];
537 if ( tagData[index] == '\0')