From: Derek Date: Fri, 9 Jul 2010 16:57:55 +0000 (-0400) Subject: Reorganized convert tool code. Restored stdin by default. Implemented FASTA/FASTQ... X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=f7ce1f11dc76fe0d74e0bf44e98c06017ae25898;p=bamtools.git Reorganized convert tool code. Restored stdin by default. Implemented FASTA/FASTQ convert methods. Still need to include support for new (.bti) index file format --- diff --git a/bamtools_convert.cpp b/bamtools_convert.cpp index 633c21d..a2f16b8 100644 --- a/bamtools_convert.cpp +++ b/bamtools_convert.cpp @@ -3,20 +3,19 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 7 June 2010 +// Last modified: 9 July 2010 // --------------------------------------------------------------------------- // Converts between BAM and a number of other formats // *************************************************************************** +#include #include #include #include #include #include "bamtools_convert.h" -//#include "bamtools_format.h" #include "bamtools_options.h" -#include "bamtools_utilities.h" #include "BGZF.h" #include "BamReader.h" #include "BamMultiReader.h" @@ -24,47 +23,71 @@ using namespace std; using namespace BamTools; -static RefVector references; +namespace BamTools { -namespace BamTools { - - static const string FORMAT_FASTA = "fasta"; - static const string FORMAT_FASTQ = "fastq"; - static const string FORMAT_JSON = "json"; - static const string FORMAT_SAM = "sam"; - - void PrintFASTA(ostream& out, const BamAlignment& a); - void PrintFASTQ(ostream& out, const BamAlignment& a); - void PrintJSON(ostream& out, const BamAlignment& a); - void PrintSAM(ostream& out, const BamAlignment& a); + // format names + static const string FORMAT_FASTA_LOWER = "fasta"; + static const string FORMAT_FASTA_UPPER = "FASTA"; + static const string FORMAT_FASTQ_LOWER = "fastq"; + static const string FORMAT_FASTQ_UPPER = "FASTQ"; + static const string FORMAT_JSON_LOWER = "json"; + static const string FORMAT_JSON_UPPER = "JSON"; + static const string FORMAT_SAM_LOWER = "sam"; + static const string FORMAT_SAM_UPPER = "SAM"; + + // other constants + static const unsigned int FASTA_LINE_MAX = 50; } // namespace BamTools +struct ConvertTool::ConvertToolPrivate { + + // ctor & dtor + public: + ConvertToolPrivate(ConvertTool::ConvertSettings* settings); + ~ConvertToolPrivate(void); + + // interface + public: + bool Run(void); + + // internal methods + private: + void PrintFASTA(const BamAlignment& a); + void PrintFASTQ(const BamAlignment& a); + void PrintJSON(const BamAlignment& a); + void PrintSAM(const BamAlignment& a); + + // data members + private: + ConvertTool::ConvertSettings* m_settings; + RefVector m_references; + ostream m_out; +}; + // --------------------------------------------- // ConvertSettings implementation struct ConvertTool::ConvertSettings { // flags - bool HasInputBamFilenames; - bool HasOutputBamFilename; + bool HasInputFilenames; + bool HasOutputFilename; bool HasFormat; bool HasRegion; - // filenames - //string InputFilename; - vector InputFiles; + // options + vector InputFilenames; string OutputFilename; string Format; - - // region string Region; - + // constructor ConvertSettings(void) - : HasInputBamFilenames(false) - , HasOutputBamFilename(false) - //, InputFilename(Options::StandardIn()) + : HasInputFilenames(false) + , HasOutputFilename(false) + , HasFormat(false) + , HasRegion(false) , OutputFilename(Options::StandardOut()) { } }; @@ -75,16 +98,17 @@ struct ConvertTool::ConvertSettings { ConvertTool::ConvertTool(void) : AbstractTool() , m_settings(new ConvertSettings) + , m_impl(0) { // set program details Options::SetProgramInfo("bamtools convert", "converts between BAM and a number of other formats", "-in -out -format "); // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - //Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilenames, m_settings->InputFiles, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); + Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilenames, m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts); + OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as .bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts); } @@ -92,6 +116,9 @@ ConvertTool::ConvertTool(void) ConvertTool::~ConvertTool(void) { delete m_settings; m_settings = 0; + + delete m_impl; + m_impl = 0; } int ConvertTool::Help(void) { @@ -101,11 +128,39 @@ int ConvertTool::Help(void) { int ConvertTool::Run(int argc, char* argv[]) { - bool convertedOk = true; - // parse command line arguments Options::Parse(argc, argv, 1); + // run internal ConvertTool implementation, return success/fail + m_impl = new ConvertToolPrivate(m_settings); + + if ( m_impl->Run() ) + return 0; + else + return 1; +} + +// --------------------------------------------- +// ConvertToolPrivate implementation + +ConvertTool::ConvertToolPrivate::ConvertToolPrivate(ConvertTool::ConvertSettings* settings) + : m_settings(settings) + , m_out(cout.rdbuf()) // default output to cout +{ } + +ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { } + +bool ConvertTool::ConvertToolPrivate::Run(void) { + + bool convertedOk = true; + + // ------------------------------------ + // initialize conversion input/output + + // set to default input if none provided + if ( !m_settings->HasInputBamFilename ) + m_settings->InputFiles.push_back(Options::StandardIn()); + // open files BamMultiReader reader; reader.Open(m_settings->InputFiles); @@ -118,39 +173,51 @@ int ConvertTool::Run(int argc, char* argv[]) { } } - // ---------------------------------------- - // do conversion,depending on desired output format - + // if an output filename given, open outfile + ofstream outFile; + if ( m_settings->HasOutputFilename ) { + outFile.open(m_settings->OutputFilename.c_str()); + if (!outFile) { cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; return false; } + m_out.rdbuf(outFile.rdbuf()); // set m_out to file's streambuf + } + + // ------------------------ + // do conversion + // FASTA - if ( m_settings->Format == FORMAT_FASTA ) { - //cout << "Converting to FASTA" << endl; + if ( m_settings->Format == FORMAT_FASTA_LOWER || m_settings->Format == FORMAT_FASTA_UPPER ) { + BamAlignment alignment; + while ( reader.GetNextAlignment(alignment) ) { + PrintFASTA(alignment); + } } // FASTQ - else if ( m_settings->Format == FORMAT_FASTQ) { - //cout << "Converting to FASTQ" << endl; + else if ( m_settings->Format == FORMAT_FASTQ_LOWER || m_settings->Format == FORMAT_FASTQ_UPPER ) { + BamAlignment alignment; + while ( reader.GetNextAlignment(alignment) ) { + PrintFASTQ(alignment); + } } - // JSON - else if ( m_settings->Format == FORMAT_JSON ) { - //cout << "Converting to JSON" << endl; + // JSON + else if ( m_settings->Format == FORMAT_JSON_LOWER || m_settings->Format == FORMAT_JSON_UPPER ) { BamAlignment alignment; while ( reader.GetNextAlignment(alignment) ) { - PrintJSON(cout, alignment); + PrintJSON(alignment); } - } // SAM - else if ( m_settings->Format == FORMAT_SAM ) { + else if ( m_settings->Format == FORMAT_SAM_LOWER || m_settings->Format == FORMAT_SAM_UPPER ) { BamAlignment alignment; while ( reader.GetNextAlignment(alignment) ) { - PrintSAM(cout, alignment); + PrintSAM(alignment); } } - // uncrecognized format - else { + // error + else { cerr << "Unrecognized format: " << m_settings->Format << endl; cerr << "Please see help|README (?) for details on supported formats " << endl; convertedOk = false; @@ -159,7 +226,8 @@ int ConvertTool::Run(int argc, char* argv[]) { // ------------------------ // clean up & exit reader.Close(); - return (int)convertedOk; + if ( m_settings->HasOutputFilename ) outFile.close(); + return convertedOk; } // ---------------------------------------------------------- @@ -167,64 +235,95 @@ int ConvertTool::Run(int argc, char* argv[]) { // ---------------------------------------------------------- // print BamAlignment in FASTA format -void BamTools::PrintFASTA(ostream& out, const BamAlignment& a) { - +// N.B. - uses QueryBases NOT AlignedBases +void ConvertTool::ConvertToolPrivate::PrintFASTA(const BamAlignment& a) { + + // >BamAlignment.Name + // BamAlignment.QueryBases + // ... + + // print header + m_out << "> " << a.Name << endl; + + // if sequence fits on single line + if ( a.QueryBases.length() <= FASTA_LINE_MAX ) + m_out << a.QueryBases << endl; + + // else split over multiple lines + else { + + size_t position = 0; + size_t seqLength = a.QueryBases.length(); + + // write subsequences to each line + while ( position < (seqLength - FASTA_LINE_MAX) ) { + m_out << a.QueryBases.substr(position, FASTA_LINE_MAX) << endl; + position += FASTA_LINE_MAX; + } + + // write final subsequence + m_out << a.QueryBases.substr(position) << endl; + } } // print BamAlignment in FASTQ format -void BamTools::PrintFASTQ(ostream& out, const BamAlignment& a) { - +// N.B. - uses QueryBases NOT AlignedBases +void ConvertTool::ConvertToolPrivate::PrintFASTQ(const BamAlignment& a) { + + // @BamAlignment.Name + // BamAlignment.QueryBases + // + + // BamAlignment.Qualities + + m_out << "@" << a.Name << endl + << a.QueryBases << endl + << "+" << endl + << a.Qualities << endl; } // print BamAlignment in JSON format -void BamTools::PrintJSON(ostream& out, const BamAlignment& a) { - - // tab-delimited - // [ :: [...] ] +void ConvertTool::ConvertToolPrivate::PrintJSON(const BamAlignment& a) { // write name & alignment flag - out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" - << a.AlignmentFlag << "\","; + m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\","; // write reference name - if ( (a.RefID >= 0) && (a.RefID < (int)references.size()) ) out << "\"reference\":\"" << references[a.RefID].RefName << "\","; - //else out << "*\t"; + if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) + m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\","; // write position & map quality - out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ","; + m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ","; // write CIGAR const vector& cigarData = a.CigarData; if ( !cigarData.empty() ) { - out << "\"cigar\":["; + m_out << "\"cigar\":["; vector::const_iterator cigarBegin = cigarData.begin(); vector::const_iterator cigarIter = cigarBegin; vector::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); - if (cigarIter != cigarBegin) out << ","; - out << "\"" << op.Length << op.Type << "\""; + if (cigarIter != cigarBegin) m_out << ","; + m_out << "\"" << op.Length << op.Type << "\""; } - out << "],"; + m_out << "],"; } // write mate reference name, mate position, & insert size - if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)references.size()) ) { - out << "\"mate\":{" - << "\"reference\":\"" << references[a.MateRefID].RefName << "\"," - << "\"position\":" << a.MatePosition+1 - << ",\"insertSize\":" << a.InsertSize << "},"; + if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) { + m_out << "\"mate\":{" + << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\"," + << "\"position\":" << a.MatePosition+1 + << ",\"insertSize\":" << a.InsertSize << "},"; } // write sequence - if ( !a.QueryBases.empty() ) { - out << "\"queryBases\":\"" << a.QueryBases << "\","; - } + if ( !a.QueryBases.empty() ) + m_out << "\"queryBases\":\"" << a.QueryBases << "\","; // write qualities - if ( !a.Qualities.empty() ) { - out << "\"qualities\":\"" << a.Qualities << "\","; - } + if ( !a.Qualities.empty() ) + m_out << "\"qualities\":\"" << a.Qualities << "\","; // write tag data const char* tagData = a.TagData.c_str(); @@ -232,15 +331,15 @@ void BamTools::PrintJSON(ostream& out, const BamAlignment& a) { size_t index = 0; if (index < tagDataLength) { - out << "\"tags\":{"; + m_out << "\"tags\":{"; while ( index < tagDataLength ) { if (index > 0) - out << ","; + m_out << ","; // write tag name - out << "\"" << a.TagData.substr(index, 2) << "\":"; + m_out << "\"" << a.TagData.substr(index, 2) << "\":"; index += 2; // get data type @@ -249,116 +348,119 @@ void BamTools::PrintJSON(ostream& out, const BamAlignment& a) { switch (type) { case('A') : - out << "\"" << tagData[index] << "\""; + m_out << "\"" << tagData[index] << "\""; ++index; break; case('C') : - out << (int)tagData[index]; + m_out << (int)tagData[index]; ++index; break; case('c') : - out << (int)tagData[index]; + m_out << (int)tagData[index]; ++index; break; case('S') : - out << BgzfData::UnpackUnsignedShort(&tagData[index]); + m_out << BgzfData::UnpackUnsignedShort(&tagData[index]); index += 2; break; case('s') : - out << BgzfData::UnpackSignedShort(&tagData[index]); + m_out << BgzfData::UnpackSignedShort(&tagData[index]); index += 2; break; case('I') : - out << BgzfData::UnpackUnsignedInt(&tagData[index]); + m_out << BgzfData::UnpackUnsignedInt(&tagData[index]); index += 4; break; case('i') : - out << BgzfData::UnpackSignedInt(&tagData[index]); + m_out << BgzfData::UnpackSignedInt(&tagData[index]); index += 4; break; case('f') : - out << BgzfData::UnpackFloat(&tagData[index]); + m_out << BgzfData::UnpackFloat(&tagData[index]); index += 4; break; case('d') : - out << BgzfData::UnpackDouble(&tagData[index]); + m_out << BgzfData::UnpackDouble(&tagData[index]); index += 8; break; case('Z') : case('H') : - out << "\""; + m_out << "\""; while (tagData[index]) { - out << tagData[index]; + m_out << tagData[index]; ++index; } - out << "\""; + m_out << "\""; ++index; break; } - if ( tagData[index] == '\0') break; + if ( tagData[index] == '\0') + break; } - out << "}"; + m_out << "}"; } - out << "}" << endl; + m_out << "}" << endl; } // print BamAlignment in SAM format -void BamTools::PrintSAM(ostream& out, const BamAlignment& a) { +void ConvertTool::ConvertToolPrivate::PrintSAM(const BamAlignment& a) { // tab-delimited // [ :: [...] ] // write name & alignment flag - out << a.Name << "\t" << a.AlignmentFlag << "\t"; + m_out << a.Name << "\t" << a.AlignmentFlag << "\t"; // write reference name - if ( (a.RefID >= 0) && (a.RefID < (int)references.size()) ) out << references[a.RefID].RefName << "\t"; - else out << "*\t"; + if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) + m_out << m_references[a.RefID].RefName << "\t"; + else + m_out << "*\t"; // write position & map quality - out << a.Position+1 << "\t" << a.MapQuality << "\t"; + m_out << a.Position+1 << "\t" << a.MapQuality << "\t"; // write CIGAR const vector& cigarData = a.CigarData; - if ( cigarData.empty() ) out << "*\t"; + if ( cigarData.empty() ) m_out << "*\t"; else { vector::const_iterator cigarIter = cigarData.begin(); vector::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); - out << op.Length << op.Type; + m_out << op.Length << op.Type; } - out << "\t"; + m_out << "\t"; } // write mate reference name, mate position, & insert size - if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)references.size()) ) { - if ( a.MateRefID == a.RefID ) out << "=\t"; - else out << references[a.MateRefID].RefName << "\t"; - out << a.MatePosition+1 << "\t" << a.InsertSize << "\t"; + if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) { + if ( a.MateRefID == a.RefID ) m_out << "=\t"; + else m_out << m_references[a.MateRefID].RefName << "\t"; + m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t"; } - else out << "*\t0\t0\t"; + else m_out << "*\t0\t0\t"; // write sequence - if ( a.QueryBases.empty() ) out << "*\t"; - else out << a.QueryBases << "\t"; + if ( a.QueryBases.empty() ) m_out << "*\t"; + else m_out << a.QueryBases << "\t"; // write qualities - if ( a.Qualities.empty() ) out << "*"; - else out << a.Qualities; + if ( a.Qualities.empty() ) m_out << "*"; + else m_out << a.Qualities; // write tag data const char* tagData = a.TagData.c_str(); @@ -368,7 +470,7 @@ void BamTools::PrintSAM(ostream& out, const BamAlignment& a) { while ( index < tagDataLength ) { // write tag name - out << "\t" << a.TagData.substr(index, 2) << ":"; + m_out << "\t" << a.TagData.substr(index, 2) << ":"; index += 2; // get data type @@ -377,63 +479,64 @@ void BamTools::PrintSAM(ostream& out, const BamAlignment& a) { switch (type) { case('A') : - out << "A:" << tagData[index]; + m_out << "A:" << tagData[index]; ++index; break; case('C') : - out << "i:" << (int)tagData[index]; + m_out << "i:" << (int)tagData[index]; ++index; break; case('c') : - out << "i:" << (int)tagData[index]; + m_out << "i:" << (int)tagData[index]; ++index; break; case('S') : - out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]); + m_out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]); index += 2; break; case('s') : - out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]); + m_out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]); index += 2; break; case('I') : - out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]); + m_out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]); index += 4; break; case('i') : - out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]); + m_out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]); index += 4; break; case('f') : - out << "f:" << BgzfData::UnpackFloat(&tagData[index]); + m_out << "f:" << BgzfData::UnpackFloat(&tagData[index]); index += 4; break; case('d') : - out << "d:" << BgzfData::UnpackDouble(&tagData[index]); + m_out << "d:" << BgzfData::UnpackDouble(&tagData[index]); index += 8; break; case('Z') : case('H') : - out << type << ":"; + m_out << type << ":"; while (tagData[index]) { - out << tagData[index]; + m_out << tagData[index]; ++index; } ++index; break; } - if ( tagData[index] == '\0') break; + if ( tagData[index] == '\0') + break; } - out << endl; + m_out << endl; } diff --git a/bamtools_convert.h b/bamtools_convert.h index 8595dab..8dd6857 100644 --- a/bamtools_convert.h +++ b/bamtools_convert.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 2 June 2010 +// Last modified: 9 July 2010 // --------------------------------------------------------------------------- // Converts between BAM and a number of other formats // *************************************************************************** @@ -28,6 +28,9 @@ class ConvertTool : public AbstractTool { private: struct ConvertSettings; ConvertSettings* m_settings; + + struct ConvertToolPrivate; + ConvertToolPrivate* m_impl; }; } // namespace BamTools