From 11b57b2ad4d39c9140d35674600dc4a4ff658261 Mon Sep 17 00:00:00 2001 From: Derek Date: Sun, 19 Sep 2010 17:05:16 -0400 Subject: [PATCH] Added implementation of new SplitTool. This tool splits a single BAM file into multiple BAMs, based on a user-specified property. For now, properties supported are mapped/unmapped, paired/unpaired, split by reference, and split based on a given tag. --- src/toolkit/Makefile | 1 + src/toolkit/bamtools.cpp | 76 ++-- src/toolkit/bamtools_split.cpp | 725 +++++++++++++++++++++++++++++++++ src/toolkit/bamtools_split.h | 38 ++ 4 files changed, 813 insertions(+), 27 deletions(-) create mode 100644 src/toolkit/bamtools_split.cpp create mode 100644 src/toolkit/bamtools_split.h diff --git a/src/toolkit/Makefile b/src/toolkit/Makefile index 077c6cc..1b87288 100644 --- a/src/toolkit/Makefile +++ b/src/toolkit/Makefile @@ -25,6 +25,7 @@ SOURCES = bamtools_convert.cpp \ bamtools_merge.cpp \ bamtools_random.cpp \ bamtools_sort.cpp \ + bamtools_split.cpp \ bamtools_stats.cpp \ bamtools.cpp OBJECTS= $(SOURCES:.cpp=.o) diff --git a/src/toolkit/bamtools.cpp b/src/toolkit/bamtools.cpp index 20b8cf1..39ec842 100644 --- a/src/toolkit/bamtools.cpp +++ b/src/toolkit/bamtools.cpp @@ -3,15 +3,12 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 22 July 2010 +// Last modified: 19 September 2010 // --------------------------------------------------------------------------- // Integrates a number of BamTools functionalities into a single executable. // *************************************************************************** -// Std C/C++ includes #include - -// BamTools includes #include "bamtools_convert.h" #include "bamtools_count.h" #include "bamtools_coverage.h" @@ -21,8 +18,8 @@ #include "bamtools_merge.h" #include "bamtools_random.h" #include "bamtools_sort.h" +#include "bamtools_split.h" #include "bamtools_stats.h" - using namespace std; using namespace BamTools; @@ -37,6 +34,7 @@ static const string INDEX = "index"; static const string MERGE = "merge"; static const string RANDOM = "random"; static const string SORT = "sort"; +static const string SPLIT = "split"; static const string STATS = "stats"; // ------------------------------------------ @@ -49,6 +47,27 @@ static const string VERSION = "version"; static const string LONG_VERSION = "--version"; static const string SHORT_VERSION = "-v"; +// ------------------------------------------ +// Subtool factory method +AbstractTool* CreateTool(const string& arg) { + + // determine tool type based on arg + if ( arg == CONVERT ) return new ConvertTool; + if ( arg == COUNT ) return new CountTool; + if ( arg == COVERAGE ) return new CoverageTool; + if ( arg == FILTER ) return new FilterTool; + if ( arg == HEADER ) return new HeaderTool; + if ( arg == INDEX ) return new IndexTool; + if ( arg == MERGE ) return new MergeTool; + if ( arg == RANDOM ) return new RandomTool; + if ( arg == SORT ) return new SortTool; + if ( arg == SPLIT ) return new SplitTool; + if ( arg == STATS ) return new StatsTool; + + // unknown arg + return 0; +} + // ------------------------------------------ // Print help info int Help(int argc, char* argv[]) { @@ -56,17 +75,18 @@ int Help(int argc, char* argv[]) { // 'bamtools help COMMAND' if (argc > 2) { - AbstractTool* tool(0); - if ( argv[2] == CONVERT ) tool = new ConvertTool; - if ( argv[2] == COUNT ) tool = new CountTool; - if ( argv[2] == COVERAGE ) tool = new CoverageTool; - if ( argv[2] == FILTER ) tool = new FilterTool; - if ( argv[2] == HEADER ) tool = new HeaderTool; - if ( argv[2] == INDEX ) tool = new IndexTool; - if ( argv[2] == MERGE ) tool = new MergeTool; - if ( argv[2] == RANDOM ) tool = new RandomTool; - if ( argv[2] == SORT ) tool = new SortTool; - if ( argv[2] == STATS ) tool = new StatsTool; + AbstractTool* tool = CreateTool( argv[2] ); +// if ( argv[2] == CONVERT ) tool = new ConvertTool; +// if ( argv[2] == COUNT ) tool = new CountTool; +// if ( argv[2] == COVERAGE ) tool = new CoverageTool; +// if ( argv[2] == FILTER ) tool = new FilterTool; +// if ( argv[2] == HEADER ) tool = new HeaderTool; +// if ( argv[2] == INDEX ) tool = new IndexTool; +// if ( argv[2] == MERGE ) tool = new MergeTool; +// if ( argv[2] == RANDOM ) tool = new RandomTool; +// if ( argv[2] == SORT ) tool = new SortTool; +// if ( argv[2] == SPLIT ) tool = new SplitTool; +// if ( argv[2] == STATS ) tool = new StatsTool; // if tool known, print its help screen if ( tool ) return tool->Help(); @@ -86,6 +106,7 @@ int Help(int argc, char* argv[]) { cerr << "\tmerge Merge multiple BAM files into single file" << endl; cerr << "\trandom Select random alignments from existing BAM file(s)" << endl; cerr << "\tsort Sorts the BAM file according to some criteria" << endl; + cerr << "\tsplit Splits a BAM file on user-specified property, creating a new BAM output file for each value found" << endl; cerr << "\tstats Prints some basic statistics from input BAM file(s)" << endl; cerr << endl; cerr << "See 'bamtools help COMMAND' for more information on a specific command." << endl; @@ -119,17 +140,18 @@ int main(int argc, char* argv[]) { if ( (argv[1] == VERSION) || (argv[1] == LONG_VERSION) || (argv[1] == SHORT_VERSION) ) return Version(); // determine desired sub-tool - AbstractTool* tool(0); - if ( argv[1] == CONVERT ) tool = new ConvertTool; - if ( argv[1] == COUNT ) tool = new CountTool; - if ( argv[1] == COVERAGE ) tool = new CoverageTool; - if ( argv[1] == FILTER ) tool = new FilterTool; - if ( argv[1] == HEADER ) tool = new HeaderTool; - if ( argv[1] == INDEX ) tool = new IndexTool; - if ( argv[1] == MERGE ) tool = new MergeTool; - if ( argv[1] == RANDOM ) tool = new RandomTool; - if ( argv[1] == SORT ) tool = new SortTool; - if ( argv[1] == STATS ) tool = new StatsTool; + AbstractTool* tool = CreateTool( argv[1] ); +// if ( argv[1] == CONVERT ) tool = new ConvertTool; +// if ( argv[1] == COUNT ) tool = new CountTool; +// if ( argv[1] == COVERAGE ) tool = new CoverageTool; +// if ( argv[1] == FILTER ) tool = new FilterTool; +// if ( argv[1] == HEADER ) tool = new HeaderTool; +// if ( argv[1] == INDEX ) tool = new IndexTool; +// if ( argv[1] == MERGE ) tool = new MergeTool; +// if ( argv[1] == RANDOM ) tool = new RandomTool; +// if ( argv[1] == SORT ) tool = new SortTool; +// if ( argv[1] == SPLIT ) tool = new SplitTool; +// if ( argv[1] == STATS ) tool = new StatsTool; // if found, run tool if ( tool ) return tool->Run(argc, argv); diff --git a/src/toolkit/bamtools_split.cpp b/src/toolkit/bamtools_split.cpp new file mode 100644 index 0000000..2560b08 --- /dev/null +++ b/src/toolkit/bamtools_split.cpp @@ -0,0 +1,725 @@ +// *************************************************************************** +// bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 September 2010 (DB) +// --------------------------------------------------------------------------- +// +// *************************************************************************** + +#include +#include +#include +#include +#include +#include +#include "bamtools_split.h" +#include "bamtools_options.h" +#include "bamtools_variant.h" +#include "BamReader.h" +#include "BamWriter.h" +using namespace std; +using namespace BamTools; + +namespace BamTools { + + // string constants + static const string SPLIT_MAPPED_TOKEN = ".MAPPED"; + static const string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED"; + static const string SPLIT_PAIRED_TOKEN = ".PAIRED_END"; + static const string SPLIT_SINGLE_TOKEN = ".SINGLE_END"; + static const string SPLIT_REFERENCE_TOKEN = ".REF_"; + + string GetTimestampString(void) { + + // get human readable timestamp + time_t currentTime; + time(¤tTime); + stringstream timeStream(""); + timeStream << ctime(¤tTime); + + // convert whitespace to '_' + string timeString = timeStream.str(); + size_t found = timeString.find(" "); + while (found != string::npos) { + timeString.replace(found, 1, "_"); + found = timeString.find(" ", found+1); + } + return timeString; + } + + // remove copy of filename without extension + // (so /path/to/file.txt becomes /path/to/file ) + string RemoveFilenameExtension(const string& filename) { + size_t found = filename.rfind("."); + return filename.substr(0, found); + } + +} // namespace BamTools + +// --------------------------------------------- +// SplitSettings implementation + +struct SplitTool::SplitSettings { + + // flags + bool HasInputFilename; + bool HasCustomOutputStub; + bool IsSplittingMapped; + bool IsSplittingPaired; + bool IsSplittingReference; + bool IsSplittingTag; + + // string args + string CustomOutputStub; + string InputFilename; + string TagToSplit; + + // constructor + SplitSettings(void) + : HasInputFilename(false) + , HasCustomOutputStub(false) + , IsSplittingMapped(false) + , IsSplittingPaired(false) + , IsSplittingReference(false) + , IsSplittingTag(false) + , CustomOutputStub("") + , InputFilename(Options::StandardIn()) + , TagToSplit("") + { } +}; + +// --------------------------------------------- +// SplitToolPrivate declaration + +class SplitTool::SplitToolPrivate { + + // ctor & dtor + public: + SplitToolPrivate(SplitTool::SplitSettings* settings); + ~SplitToolPrivate(void); + + // 'public' interface + public: + bool Run(void); + + // internal methods + private: + void DetermineOutputFilenameStub(void); + bool OpenReader(void); + bool SplitMapped(void); + bool SplitPaired(void); + bool SplitReference(void); + bool SplitTag(void); + bool SplitTag_Int(BamAlignment& al); + bool SplitTag_UInt(BamAlignment& al); + bool SplitTag_Real(BamAlignment& al); + bool SplitTag_String(BamAlignment& al); + + // data members + private: + SplitTool::SplitSettings* m_settings; + string m_outputFilenameStub; + BamReader m_reader; + string m_header; + RefVector m_references; +}; + +// constructor +SplitTool::SplitToolPrivate::SplitToolPrivate(SplitTool::SplitSettings* settings) + : m_settings(settings) +{ } + +// destructor +SplitTool::SplitToolPrivate::~SplitToolPrivate(void) { + m_reader.Close(); +} + +void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) { + + // if user supplied output filename stub, use that + if ( m_settings->HasCustomOutputStub ) + m_outputFilenameStub = m_settings->CustomOutputStub; + + // else if user supplied input BAM filename, use that (minus ".bam" extension) as stub + else if ( m_settings->HasInputFilename ) + m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename); + + // otherwise, user did not specify -stub, and input is coming from STDIN + // generate stub from timestamp + else m_outputFilenameStub = GetTimestampString(); +} + +bool SplitTool::SplitToolPrivate::OpenReader(void) { + if ( !m_reader.Open(m_settings->InputFilename) ) { + cerr << "ERROR: SplitTool could not open BAM file: " << m_settings->InputFilename << endl; + return false; + } + m_header = m_reader.GetHeaderText(); + m_references = m_reader.GetReferenceData(); + return true; +} + +bool SplitTool::SplitToolPrivate::Run(void) { + + // determine output stub + DetermineOutputFilenameStub(); + + // open up BamReader + if ( !OpenReader() ) return false; + + // determine split type from settings + if ( m_settings->IsSplittingMapped ) return SplitMapped(); + if ( m_settings->IsSplittingPaired ) return SplitPaired(); + if ( m_settings->IsSplittingReference ) return SplitReference(); + if ( m_settings->IsSplittingTag ) return SplitTag(); + + // if we get here, no property was specified + cerr << "No property given to split on... Please use -mapped, -paired, -reference, or -tag TAG to specifiy split behavior." << endl; + return false; +} + +bool SplitTool::SplitToolPrivate::SplitMapped(void) { + + // set up splitting data structure + map outputFiles; + map::iterator writerIter; + + // iterate through alignments + BamAlignment al; + BamWriter* writer; + bool isCurrentAlignmentMapped; + while ( m_reader.GetNextAlignment(al) ) { + + // see if bool value exists + isCurrentAlignmentMapped = al.IsMapped(); + writerIter = outputFiles.find(isCurrentAlignmentMapped); + + // if no writer associated with this value + if ( writerIter == outputFiles.end() ) { + + // open new BamWriter + writer = new BamWriter; + const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped ? SPLIT_MAPPED_TOKEN : SPLIT_UNMAPPED_TOKEN ) + ".bam"; + writer->Open(outputFilename, m_header, m_references); + + // store in map + outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) ); + } + + // else grab corresponding writer + else writer = (*writerIter).second; + + // store alignment in proper BAM output file + if ( writer ) + writer->SaveAlignment(al); + } + + // clean up BamWriters + map::iterator writerEnd = outputFiles.end(); + for ( writerIter = outputFiles.begin() ; writerIter != writerEnd; ++writerIter ) { + BamWriter* writer = (*writerIter).second; + if ( writer == 0 ) continue; + writer->Close(); + delete writer; + writer = 0; + } + + // return success + return true; +} + +bool SplitTool::SplitToolPrivate::SplitPaired(void) { + + // set up splitting data structure + map outputFiles; + map::iterator writerIter; + + // iterate through alignments + BamAlignment al; + BamWriter* writer; + bool isCurrentAlignmentPaired; + while ( m_reader.GetNextAlignment(al) ) { + + // see if bool value exists + isCurrentAlignmentPaired = al.IsPaired(); + writerIter = outputFiles.find(isCurrentAlignmentPaired); + + // if no writer associated with this value + if ( writerIter == outputFiles.end() ) { + + // open new BamWriter + writer = new BamWriter; + const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired ? SPLIT_PAIRED_TOKEN : SPLIT_SINGLE_TOKEN ) + ".bam"; + writer->Open(outputFilename, m_header, m_references); + + // store in map + outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) ); + } + + // else grab corresponding writer + else writer = (*writerIter).second; + + // store alignment in proper BAM output file + if ( writer ) + writer->SaveAlignment(al); + } + + // clean up BamWriters + map::iterator writerEnd = outputFiles.end(); + for ( writerIter = outputFiles.begin() ; writerIter != writerEnd; ++writerIter ) { + BamWriter* writer = (*writerIter).second; + if (writer == 0 ) continue; + writer->Close(); + delete writer; + writer = 0; + } + + // return success + return true; +} + +bool SplitTool::SplitToolPrivate::SplitReference(void) { + + // set up splitting data structure + map outputFiles; + map::iterator writerIter; + + // iterate through alignments + BamAlignment al; + BamWriter* writer; + int32_t currentRefId; + while ( m_reader.GetNextAlignment(al) ) { + + // see if bool value exists + currentRefId = al.RefID; + writerIter = outputFiles.find(currentRefId); + + // if no writer associated with this value + if ( writerIter == outputFiles.end() ) { + + // open new BamWriter + writer = new BamWriter; + const string refName = m_references.at(currentRefId).RefName; + const string outputFilename = m_outputFilenameStub + SPLIT_REFERENCE_TOKEN + refName + ".bam"; + writer->Open(outputFilename, m_header, m_references); + + // store in map + outputFiles.insert( make_pair(currentRefId, writer) ); + } + + // else grab corresponding writer + else writer = (*writerIter).second; + + // store alignment in proper BAM output file + if ( writer ) + writer->SaveAlignment(al); + } + + // clean up BamWriters + map::iterator writerEnd = outputFiles.end(); + for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) { + BamWriter* writer = (*writerIter).second; + if (writer == 0 ) continue; + writer->Close(); + delete writer; + writer = 0; + } + + // return success + return true; +} + +bool SplitTool::SplitToolPrivate::SplitTag(void) { + + // iterate through alignments, until we hit TAG + BamAlignment al; + while ( m_reader.GetNextAlignment(al) ) { + + // look for tag in this alignment and get tag type + char tagType(0); + if ( !al.GetTagType(m_settings->TagToSplit, tagType) ) continue; + + // request split method based on tag type + // pass it the current alignment found + switch (tagType) { + + case 'c' : + case 's' : + case 'i' : + return SplitTag_Int(al); + + case 'C' : + case 'S' : + case 'I' : + return SplitTag_UInt(al); + + case 'f' : + return SplitTag_Real(al); + + case 'A': + case 'Z': + case 'H': + return SplitTag_String(al); + + default: + fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", tagType); + return false; + } + } + + // tag not found, but that's not an error - return success + return true; +} + +bool SplitTool::SplitToolPrivate::SplitTag_Int(BamAlignment& al) { + + // set up splitting data structure + map outputFiles; + map::iterator writerIter; + + // local variables + const string tag = m_settings->TagToSplit; + BamWriter* writer; + stringstream outputFilenameStream(""); + int32_t currentValue; + + // retrieve first alignment tag value + if ( al.GetTag(tag, currentValue) ) { + + // open new BamWriter, save first alignment + writer = new BamWriter; + outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam"; + writer->Open(outputFilenameStream.str(), m_header, m_references); + writer->SaveAlignment(al); + + // store in map + outputFiles.insert( make_pair(currentValue, writer) ); + + // reset stream + outputFilenameStream.str(""); + } + + // iterate through remaining alignments + while ( m_reader.GetNextAlignment(al) ) { + + // skip if this alignment doesn't have TAG + if ( !al.GetTag(tag, currentValue) ) continue; + + // look up tag value in map + writerIter = outputFiles.find(currentValue); + + // if no writer associated with this value + if ( writerIter == outputFiles.end() ) { + + // open new BamWriter + writer = new BamWriter; + outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam"; + writer->Open(outputFilenameStream.str(), m_header, m_references); + + // store in map + outputFiles.insert( make_pair(currentValue, writer) ); + + // reset stream + outputFilenameStream.str(""); + } + + // else grab corresponding writer + else writer = (*writerIter).second; + + // store alignment in proper BAM output file + if ( writer ) + writer->SaveAlignment(al); + } + + // clean up BamWriters + map::iterator writerEnd = outputFiles.end(); + for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) { + BamWriter* writer = (*writerIter).second; + if (writer == 0 ) continue; + writer->Close(); + delete writer; + writer = 0; + } + + // return success + return true; +} + +bool SplitTool::SplitToolPrivate::SplitTag_UInt(BamAlignment& al) { + + // set up splitting data structure + map outputFiles; + map::iterator writerIter; + + // local variables + const string tag = m_settings->TagToSplit; + BamWriter* writer; + stringstream outputFilenameStream(""); + uint32_t currentValue; + + int alignmentsIgnored = 0; + + // retrieve first alignment tag value + if ( al.GetTag(tag, currentValue) ) { + + // open new BamWriter, save first alignment + writer = new BamWriter; + outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam"; + writer->Open(outputFilenameStream.str(), m_header, m_references); + writer->SaveAlignment(al); + + // store in map + outputFiles.insert( make_pair(currentValue, writer) ); + + // reset stream + outputFilenameStream.str(""); + } else ++alignmentsIgnored; + + // iterate through remaining alignments + while ( m_reader.GetNextAlignment(al) ) { + + // skip if this alignment doesn't have TAG + if ( !al.GetTag(tag, currentValue) ) { ++alignmentsIgnored; continue; } + + // look up tag value in map + writerIter = outputFiles.find(currentValue); + + // if no writer associated with this value + if ( writerIter == outputFiles.end() ) { + + // open new BamWriter + writer = new BamWriter; + outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam"; + writer->Open(outputFilenameStream.str(), m_header, m_references); + + // store in map + outputFiles.insert( make_pair(currentValue, writer) ); + + // reset stream + outputFilenameStream.str(""); + } + + // else grab corresponding writer + else writer = (*writerIter).second; + + // store alignment in proper BAM output file + if ( writer ) + writer->SaveAlignment(al); + } + + // clean up BamWriters + map::iterator writerEnd = outputFiles.end(); + for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) { + BamWriter* writer = (*writerIter).second; + if (writer == 0 ) continue; + writer->Close(); + delete writer; + writer = 0; + } + + // return success + return true; +} + +bool SplitTool::SplitToolPrivate::SplitTag_Real(BamAlignment& al) { + + // set up splitting data structure + map outputFiles; + map::iterator writerIter; + + // local variables + const string tag = m_settings->TagToSplit; + BamWriter* writer; + stringstream outputFilenameStream(""); + float currentValue; + + // retrieve first alignment tag value + if ( al.GetTag(tag, currentValue) ) { + + // open new BamWriter, save first alignment + writer = new BamWriter; + outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam"; + writer->Open(outputFilenameStream.str(), m_header, m_references); + writer->SaveAlignment(al); + + // store in map + outputFiles.insert( make_pair(currentValue, writer) ); + + // reset stream + outputFilenameStream.str(""); + } + + // iterate through remaining alignments + while ( m_reader.GetNextAlignment(al) ) { + + // skip if this alignment doesn't have TAG + if ( !al.GetTag(tag, currentValue) ) continue; + + // look up tag value in map + writerIter = outputFiles.find(currentValue); + + // if no writer associated with this value + if ( writerIter == outputFiles.end() ) { + + // open new BamWriter + writer = new BamWriter; + outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam"; + writer->Open(outputFilenameStream.str(), m_header, m_references); + + // store in map + outputFiles.insert( make_pair(currentValue, writer) ); + + // reset stream + outputFilenameStream.str(""); + } + + // else grab corresponding writer + else writer = (*writerIter).second; + + // store alignment in proper BAM output file + if ( writer ) + writer->SaveAlignment(al); + } + + // clean up BamWriters + map::iterator writerEnd = outputFiles.end(); + for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) { + BamWriter* writer = (*writerIter).second; + if (writer == 0 ) continue; + writer->Close(); + delete writer; + writer = 0; + } + + // return success + return true; +} + +bool SplitTool::SplitToolPrivate::SplitTag_String(BamAlignment& al) { + + // set up splitting data structure + map outputFiles; + map::iterator writerIter; + + // local variables + const string tag = m_settings->TagToSplit; + BamWriter* writer; + stringstream outputFilenameStream(""); + string currentValue; + + // retrieve first alignment tag value + if ( al.GetTag(tag, currentValue) ) { + + // open new BamWriter, save first alignment + writer = new BamWriter; + outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam"; + writer->Open(outputFilenameStream.str(), m_header, m_references); + writer->SaveAlignment(al); + + // store in map + outputFiles.insert( make_pair(currentValue, writer) ); + + // reset stream + outputFilenameStream.str(""); + } + + // iterate through remaining alignments + while ( m_reader.GetNextAlignment(al) ) { + + // skip if this alignment doesn't have TAG + if ( !al.GetTag(tag, currentValue) ) continue; + + // look up tag value in map + writerIter = outputFiles.find(currentValue); + + // if no writer associated with this value + if ( writerIter == outputFiles.end() ) { + + // open new BamWriter + writer = new BamWriter; + outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam"; + writer->Open(outputFilenameStream.str(), m_header, m_references); + + // store in map + outputFiles.insert( make_pair(currentValue, writer) ); + + // reset stream + outputFilenameStream.str(""); + } + + // else grab corresponding writer + else writer = (*writerIter).second; + + // store alignment in proper BAM output file + if ( writer ) + writer->SaveAlignment(al); + } + + // clean up BamWriters + map::iterator writerEnd = outputFiles.end(); + for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) { + BamWriter* writer = (*writerIter).second; + if (writer == 0 ) continue; + writer->Close(); + delete writer; + writer = 0; + } + + // return success + return true; +} + +// --------------------------------------------- +// SplitTool implementation + +SplitTool::SplitTool(void) + : AbstractTool() + , m_settings(new SplitSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo("bamtools split", "splits a BAM file on user-specified property, creating a new BAM output file for each value found", "[-in ] [-stub ] < -mapped | -paired | -reference | -tag > "); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts); + + OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options"); + Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped, SplitOpts); + Options::AddOption("-paired", "split single-end/paired-end alignments", m_settings->IsSplittingPaired, SplitOpts); + Options::AddOption("-reference", "split alignments by reference", m_settings->IsSplittingReference, SplitOpts); + Options::AddValueOption("-tag", "tag name", "splits alignments based on all values of TAG encountered (i.e. -tag RG creates a BAM file for each read group in original BAM file)", "", + m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts); +} + +SplitTool::~SplitTool(void) { + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int SplitTool::Help(void) { + Options::DisplayHelp(); + return 0; +} + +int SplitTool::Run(int argc, char* argv[]) { + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize internal implementation + m_impl = new SplitToolPrivate(m_settings); + + // run tool, return success/fail + if ( m_impl->Run() ) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_split.h b/src/toolkit/bamtools_split.h new file mode 100644 index 0000000..dd825f1 --- /dev/null +++ b/src/toolkit/bamtools_split.h @@ -0,0 +1,38 @@ +// *************************************************************************** +// bamtools_split.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 September 2010 (DB) +// --------------------------------------------------------------------------- +// +// *************************************************************************** + +#ifndef BAMTOOLS_SPLIT_H +#define BAMTOOLS_SPLIT_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class SplitTool : public AbstractTool { + + public: + SplitTool(void); + ~SplitTool(void); + + public: + int Help(void); + int Run(int argc, char* argv[]); + + private: + struct SplitSettings; + SplitSettings* m_settings; + + struct SplitToolPrivate; + SplitToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_SPLIT_H \ No newline at end of file -- 2.39.2