From d37b45d132eb330a2c918c59e9ecb78168a4ac47 Mon Sep 17 00:00:00 2001 From: Derek Date: Wed, 26 May 2010 16:05:15 -0400 Subject: [PATCH] Reorganization of toolkit. Split subtools out to own headers. Added custom getopt functionality for subtools arguments. Provided or extended rough implementations for most subtools. --- BamReader.cpp | 8 +- Makefile | 18 +- README | 58 ++++- bamtools.cpp | 232 ++++++++----------- bamtools_coverage.h | 59 +++++ bamtools_dump.h | 73 ++++++ bamtools_getopt.h | 551 ++++++++++++++++++++++++++++++++++++++++++++ bamtools_header.h | 59 +++++ bamtools_index.h | 59 +++++ bamtools_merge.h | 80 +++++++ bamtools_sam.h | 144 ++++++++++++ bamtools_sort.h | 70 ++++++ bamtools_stats.h | 59 +++++ 13 files changed, 1309 insertions(+), 161 deletions(-) create mode 100644 bamtools_coverage.h create mode 100644 bamtools_dump.h create mode 100644 bamtools_getopt.h create mode 100644 bamtools_header.h create mode 100644 bamtools_index.h create mode 100644 bamtools_merge.h create mode 100644 bamtools_sam.h create mode 100644 bamtools_sort.h create mode 100644 bamtools_stats.h diff --git a/BamReader.cpp b/BamReader.cpp index 482fc26..7213b23 100644 --- a/BamReader.cpp +++ b/BamReader.cpp @@ -48,7 +48,7 @@ struct BamReader::BamReaderPrivate { int64_t AlignmentsBeginOffset; string Filename; string IndexFilename; - + // system data bool IsBigEndian; @@ -154,7 +154,7 @@ const string BamReader::GetHeaderText(void) const { return d->HeaderText; } int BamReader::GetReferenceCount(void) const { return d->References.size(); } const RefVector BamReader::GetReferenceData(void) const { return d->References; } int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); } -const std::string BamReader::GetFilename(void) const { return d->Filename; } +const std::string BamReader::GetFilename(void) const { return d->Filename; } // index operations bool BamReader::CreateIndex(void) { return d->CreateIndex(); } @@ -615,7 +615,7 @@ void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets const uint64_t& lastOffset) { // get converted offsets - int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; + int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; // resize vector if necessary @@ -640,7 +640,7 @@ bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { // read starts after left boundary if ( bAlignment.Position >= CurrentLeft) { return true; } - // return whether alignment end overlaps left boundary + // return whether alignment end overlaps left boundary return ( bAlignment.GetEndPosition() >= CurrentLeft ); } diff --git a/Makefile b/Makefile index fd08519..d3d8ad2 100644 --- a/Makefile +++ b/Makefile @@ -1,21 +1,13 @@ CXX= g++ CXXFLAGS= -Wall -O3 -PROG= BamConversion BamDump BamTrim bamtools +PROG= bamtools bamtools_test LIBS= -lz +OBJS= BGZF.o BamReader.o BamWriter.o BamMultiReader.o bamtools.o all: $(PROG) -bamtools: BGZF.o BamReader.o BamWriter.o BamMultiReader.o bamtools.o - $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamWriter.o BamMultiReader.o bamtools.o $(LIBS) - -BamConversion: BGZF.o BamReader.o BamWriter.o BamConversionMain.o - $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamWriter.o BamConversionMain.o $(LIBS) - -BamDump: BGZF.o BamReader.o BamDumpMain.o - $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamDumpMain.o $(LIBS) - -BamTrim: BGZF.o BamReader.o BamWriter.o BamTrimMain.o - $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamWriter.o BamTrimMain.o $(LIBS) +bamtools: $(OBJS) + $(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LIBS) clean: - rm -fr gmon.out *.o *.a a.out *~ $(PROG) + rm -fr gmon.out *.o *.a a.out *~ diff --git a/README b/README index 82c9efd..7de7fd1 100644 --- a/README +++ b/README @@ -2,33 +2,72 @@ README : BAMTOOLS ------------------------------------------------------------ -BamTools: a C++ API for reading/writing BAM files. +BamTools: a C++ API & toolkit for reading/writing/manipulating BAM files. + +I. Introduction + a. The API + b. The Toolkit + +II. Usage + a. The API + b. The Toolkit -I. Introduction -II. Usage III. Contact ------------------------------------------------------------ I. Introduction: +BamTools provides both a programmer's API and an end-user's toolkit for handling +BAM files. +Ia. The API The API consists of 2 main modules - BamReader and BamWriter. As you would expect, BamReader provides read-access to BAM files, while BamWriter does the writing of BAM files. BamReader provides an interface for random-access (jumping) in a BAM file, as well as generating BAM index files. +BamMultiReader is an extra module that allows you to manage multiple open BAM file +for reading. It provides some validation & bookkeeping under the hood to keep all +files sync'ed for + An additional file, BamAux.h, is included as well. This file contains the common data structures and typedefs used throught the API. BGZF.h & BGZF.cpp contain our implementation of the Broad Institute's BGZF compression format. -BamConversion, BamDump, and BamTrim are 3 'toy' examples on how the API can be used. + +Ib. The Toolkit +If you've been using BamTools since the early days, you'll notice that our 'toy' API +examples (BamConversion, BamDump, and BamTrim) are now gone. In their place is a set +of features we hope you find useful. + +** More explanation here ** + +usage: bamtools [--help] COMMAND [ARGS] + +Available bamtools commands: + coverage Prints coverage statistics from the input BAM file + dump Dump BAM file contents to text output + header Prints BAM header information + index Generates index for BAM file + merge Merge multiple BAM files into single file + sort Sorts the BAM file according to some criteria + stats Prints some basic statistics from the input BAM file + +See 'bamtools help COMMAND' for more information on a specific command. + +** Follow-up explanation here ** ------------------------------------------------------------ II. Usage : +** General usage information - perhaps explain common terms, point to SAM/BAM spec, etc ** + + +IIa. The API + To use this API, you simply need to do 3 things: 1 - Drop the BamTools files somewhere the compiler can find them. @@ -45,6 +84,11 @@ To use this API, you simply need to do 3 things: See any included programs and Makefile for more specific compiling/usage examples. See comments in the header files for more detailed API documentation. + +IIb. The Toolkit + +** More indepth overview for the toolkit commands ** + ------------------------------------------------------------ III. Contact : @@ -52,5 +96,9 @@ III. Contact : Feel free to contact me with any questions, comments, suggestions, bug reports, etc. - Derek Barnett +Marth Lab +Biology Dept., Boston College + Email: barnetde@bc.edu -Project Website: http://sourceforge.net/projects/bamtools +Project Websites: http://github.com/pezmaster31/bamtools (ACTIVE SUPPORT) + http://sourceforge.net/projects/bamtools (major updates only) diff --git a/bamtools.cpp b/bamtools.cpp index ae3deed..d80d960 100644 --- a/bamtools.cpp +++ b/bamtools.cpp @@ -1,162 +1,116 @@ // *************************************************************************** -// bamtools.cpp (c) 2010 Erik Garrison +// bamtools.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- +// Last modified: 26 May 2010 +// --------------------------------------------------------------------------- // Integrates a number of BamTools functionalities into a single executable. // *************************************************************************** // Std C/C++ includes -#include #include -#include -#include -using namespace std; // BamTools includes -#include "BamReader.h" -#include "BamWriter.h" -#include "BamMultiReader.h" -using namespace BamTools; - -void usageSummary(string cmdname) { - cerr << "usage: " << cmdname << " [options]" << endl - << "actions:" << endl - << " index # generates BAM index .bai" << endl - << " merge [ ...] # merges BAM files into a single file" << endl - << " dump [ ...] # dumps alignment summaries to stdout" << endl - << " header [ ...] # prints header, or unified header for BAM file or files" << endl; - //<< " trim " << endl; -} - +#include "bamtools_coverage.h" +#include "bamtools_dump.h" +#include "bamtools_header.h" +#include "bamtools_index.h" +#include "bamtools_merge.h" +#include "bamtools_sam.h" +#include "bamtools_sort.h" +#include "bamtools_stats.h" -void BamMerge(string outputFilename, vector filenames) { - - BamMultiReader reader; - - reader.Open(filenames, false); // opens the files without checking for indexes - - string mergedHeader = reader.GetHeaderText(); - - RefVector references = reader.GetReferenceData(); - - // open BamWriter - BamWriter writer; - writer.Open( outputFilename.c_str(), mergedHeader, references); +using namespace std; +using namespace BamTools; - BamAlignment bAlignment; - while (reader.GetNextAlignment(bAlignment)) { - // write to output file - writer.SaveAlignment(bAlignment); +// ------------------------------------------ +// bamtools subtool names +static const string COVERAGE = "coverage"; +static const string DUMP = "dump"; // <-- do we even want to keep this? I think 'bamtools sam' will be more useful anyway + // nobody's going to want what was essentially an early, bloated, debugging output +static const string HEADER = "header"; +static const string INDEX = "index"; +static const string MERGE = "merge"; +static const string SAM = "sam"; +static const string SORT = "sort"; +static const string STATS = "stats"; + +// ------------------------------------------ +// bamtools help/version names +static const string HELP = "help"; +static const string LONG_HELP = "--help"; +static const string SHORT_HELP = "-h"; + +static const string VERSION = "version"; +static const string LONG_VERSION = "--version"; +static const string SHORT_VERSION = "-v"; + +int Help(int argc, char* argv[]) { + + // 'bamtools help COMMAND' + if (argc > 2) { + if ( argv[2] == COVERAGE) return BamCoverageHelp(); + if ( argv[2] == DUMP ) return BamDumpHelp(); // keep? + if ( argv[2] == HEADER ) return BamHeaderHelp(); + if ( argv[2] == INDEX ) return BamIndexHelp(); + if ( argv[2] == MERGE ) return BamMergeHelp(); + if ( argv[2] == SAM ) return BamSamHelp(); + if ( argv[2] == SORT ) return BamSortHelp(); + if ( argv[2] == STATS ) return BamStatsHelp(); } - - // close output file - writer.Close(); - // close input files - reader.Close(); - -} - -void BamCreateIndex(const char* inputFilename) { - - // open our BAM reader - BamReader reader; - reader.Open(inputFilename); - - // create index file - reader.CreateIndex(); - - // close our file - reader.Close(); - -} - -// Spit out basic BamAlignment data -void PrintAlignment(const BamAlignment& alignment) { - cout << "---------------------------------" << endl; - cout << "Name: " << alignment.Name << endl; - cout << "Aligned to: " << alignment.RefID; - cout << ":" << alignment.Position << endl; - cout << endl; -} - -void BamDump(vector files) { - - BamMultiReader reader; - reader.Open(files); - - BamAlignment bAlignment; - while (reader.GetNextAlignment(bAlignment)) { - PrintAlignment(bAlignment); - } - - reader.Close(); - + + // either 'bamtools help' or unrecognized argument after 'help' + cerr << endl; + cerr << "usage: bamtools [--help] COMMAND [ARGS]" << endl; + cerr << endl; + cerr << "Available bamtools commands:" << endl; + cerr << "\tcoverage\tPrints coverage statistics from the input BAM file" << endl; + cerr << "\tdump\t\tDump BAM file contents to text output" << endl; // keep? + cerr << "\theader\t\tPrints BAM header information" << endl; + cerr << "\tindex\t\tGenerates index for BAM file" << endl; + cerr << "\tmerge\t\tMerge multiple BAM files into single file" << endl; + cerr << "\tsam\t\tPrints the BAM file in SAM (text) format" << endl; + cerr << "\tsort\t\tSorts the BAM file according to some criteria" << endl; + cerr << "\tstats\t\tPrints some basic statistics from the input BAM file" << endl; + cerr << endl; + cerr << "See 'bamtools help COMMAND' for more information on a specific command." << endl; + cerr << endl; + + return 0; } -void BamDumpHeader(vector files) { - - BamMultiReader reader; - reader.Open(files, false); - - cout << reader.GetHeaderText() << endl; - - reader.Close(); - +int Version(void) { + cout << endl; + cout << "bamtools v0.x.xx" << endl; + cout << "Part of BamTools API and toolkit" << endl; + cout << "Primary authors: Derek Barnett, Erik Garrison, Michael Stromberg" << endl; + cout << "(c) 2009-2010 Marth Lab, Biology Dept., Boston College" << endl; + cout << endl; + return 0; } - int main(int argc, char* argv[]) { - // validate argument count - if( argc < 3 ) { - usageSummary(argv[0]); - exit(1); - } - - string command = argv[1]; + // just 'bamtools' + if ( (argc == 1) ) return Help(argc, argv); - if (command == "index") { - - BamCreateIndex(argv[2]); - - } else if (command == "merge") { - - vector files; - string outputFile = argv[2]; - - // check if our output exists, and exit if so - ifstream output(outputFile.c_str()); - if (output.good()) { - cerr << "ERROR: output file " << outputFile << " exists, exiting." << endl; - exit(1); - } else { - output.close(); - } - - for (int i = 3; i files; - for (int i = 2; i files; - for (int i = 2; i +#include + +#include "BamReader.h" +#include "bamtools_getopt.h" + +namespace BamTools { + +int BamCoverageHelp(void) { + std::cerr << std::endl; + std::cerr << "usage:\tbamtools coverage [--in BAM file]" << std::endl; + std::cerr << "\t-i, --in\tInput BAM file to generate coverage stats\t[default=stdin]" << std::endl; + std::cerr << std::endl; + return 0; +} + +int RunBamCoverage(int argc, char* argv[]) { + + // else parse command line for args + GetOpt options(argc, argv, 1); + + std::string inputFilename; + options.addOption('i', "in", &inputFilename); + + if ( !options.parse() ) return BamCoverageHelp(); + if ( inputFilename.empty() ) { inputFilename = "stdin"; } + +// // open our BAM reader +// BamReader reader; +// reader.Open(inputFilename); + + // generate coverage stats + std::cerr << "Generating coverage stats for " << inputFilename << std::endl; + std::cerr << "FEATURE NOT YET IMPLEMENTED!" << std::endl; + + // clean & exit +// reader.Close(); + return 0; +} + +} // namespace BamTools + +#endif // BAMTOOLS_COVERAGE_H diff --git a/bamtools_dump.h b/bamtools_dump.h new file mode 100644 index 0000000..108caf2 --- /dev/null +++ b/bamtools_dump.h @@ -0,0 +1,73 @@ +// *************************************************************************** +// bamtools_dump.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 26 May 2010 +// --------------------------------------------------------------------------- +// Dumps alignment summaries out to stdout. +// +// ** This should probably go the way of the dodo soon? bamtools sam makes this +// obsolete and probably worthless. +// +// *************************************************************************** + +#ifndef BAMTOOLS_DUMP_H +#define BAMTOOLS_DUMP_H + +#include +#include +#include + +#include "BamMultiReader.h" +// #include "GetOpt.h" +#include "bamtools_getopt.h" + +namespace BamTools { + +int BamDumpHelp(void) { + std::cerr << std::endl; + std::cerr << "usage:\tbamtools dump [BAM file1] [BAM file2] [BAM file3]..." << std::endl; + std::cerr << "\t[BAM file]\tInput file(s) to dump alignment summaries from [default=stdin]" << std::endl; + std::cerr << std::endl; + return 0; +} + +// Spit out basic BamAlignment data +void PrintAlignment(const BamTools::BamAlignment& alignment) { + std::cout << "---------------------------------" << std::endl; + std::cout << "Name: " << alignment.Name << std::endl; + std::cout << "Aligned to: " << alignment.RefID; + std::cout << ":" << alignment.Position << std::endl; + std::cout << std::endl; +} + +int RunBamDump(int argc, char* argv[]) { + + // else parse command line for args + GetOpt options(argc, argv, 1); + + std::vector inputFilenames; + options.addVariableLengthOption("in", &inputFilenames); + + if ( !options.parse() ) return BamDumpHelp(); + if ( inputFilenames.empty() ) { inputFilenames.push_back("stdin"); } + + // open files + BamMultiReader reader; + reader.Open(inputFilenames, false); + + // dump alignment summaries to stdout + BamAlignment bAlignment; + while (reader.GetNextAlignment(bAlignment)) { + PrintAlignment(bAlignment); + } + + // clean up & exit + reader.Close(); + return 0; +} + +} // namespace BamTools + +#endif // BAMTOOLS_DUMP_H diff --git a/bamtools_getopt.h b/bamtools_getopt.h new file mode 100644 index 0000000..99ef6e2 --- /dev/null +++ b/bamtools_getopt.h @@ -0,0 +1,551 @@ +// *************************************************************************** +// bamtools_getopt.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 26 May 2010 +// --------------------------------------------------------------------------- +// Provides a configurable commandline parser used by the BamTools subtools +// *************************************************************************** + +#ifndef BAMTOOLS_GETOPT_H +#define BAMTOOLS_GETOPT_H + +// C includes +#include +#include + +// C++ includes +#include +#include +#include +#include + +namespace BamTools { + +class GetOpt { + + // ctors & dtor + public: + + // ctor: takes the 'standard' command line args (optional offset) + GetOpt(int argc, char* argv[], int offset = 0); + + // d-tor + ~GetOpt(void); + + // set rules for bare word arguments + public: + // add an optional 'bare word' argument (eg 'help') + // 'name' is not used on the command line, but for reporting + void addOptionalArgument(const std::string& name, std::string* value); + + // add a required 'bare word' argument (eg input data file) + // 'name' is not used on the command line, but for reporting + void addRequiredArgument(const std::string& name, std::string* value); + + // set rules for key=>value options + public: + // add standard option with arguments ( -Wall, -O2, --type=foo ) + void addOption(const char shortName, const std::string& longName, std::string* value); + + // add an option whose argument is optional (eg --log may default to dumping to stderr, unless a file is specified ) + // must provide a default string + void addOptionalOption(const char shortName, const std::string& longName, std::string* value, const std::string& defaultValue); + void addOptionalOption(const std::string& longName, std::string* value, const std::string& defaultValue); + + // add a repeatable option (like compiler includes -I/path/ -I/path2/ etc) + // only supporting one type of name (short/long) for this option for now + void addRepeatableOption(const char shortName, std::vector* values); // single char version + void addRepeatableOption(const std::string& longName, std::vector* values); // long name version + + // add an option that takes a variable number of arguments ( --files f1 f2 f3 f4... ) + void addVariableLengthOption(const std::string& longName, std::vector* values); + + // set rules for on/off switch + public: + // on/off switch ( --verbose --searchOnly ) only long names supported for now + void addSwitch(const std::string& longName, bool* ok); + + // parse and query methods + public: + + // get application name + const std::string& applicationName(void) const; + + // query if particular 'bare-word' argument is set + bool isSet(const std::string& name) const; + + // runs parser (does validation and assign values to arguments) + // returns success/fail + bool parse(void); + + void print(void); + + // define Option-related types & enums + private: + enum OptionType { OptUnknown = 0 + , OptEnd + , OptSwitch + , OptArg1 + , OptOptional + , OptRepeat + , OptVariable + }; + + // define Option + struct Option { + + // ctor + Option(OptionType t = OptUnknown, const char shortName = 0, const std::string& longName = "") + : Type(t) + , ShortName(shortName) + , LongName(longName) + , BoolValue(0) + { } + + // data members + OptionType Type; + char ShortName; + std::string LongName; + union { + bool* BoolValue; + std::string* StringValue; + std::vector* ListValue; + }; + std::string Default; + }; + + // internal methods + private: + void init(int argc, char* argv[], int offset); + void saveOption(const Option& opt); // const & ?? he doesnt use it - why? + void setSwitch(const Option& opt); + + // data members + private: + std::vector