// ***************************************************************************
// bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison
// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 20 September 2010 (DB)
+// Last modified: 7 April 2011 (DB)
// ---------------------------------------------------------------------------
-//
+// Splits a BAM file on user-specified property, creating a new BAM output
+// file for each value found
// ***************************************************************************
+#include "bamtools_split.h"
+
+#include <api/BamConstants.h>
+#include <api/BamReader.h>
+#include <api/BamWriter.h>
+#include <utils/bamtools_options.h>
+#include <utils/bamtools_variant.h>
+using namespace BamTools;
+
#include <ctime>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
-#include "bamtools_split.h"
-#include "bamtools_options.h"
-#include "bamtools_variant.h"
-#include "BamReader.h"
-#include "BamWriter.h"
using namespace std;
-using namespace BamTools;
namespace BamTools {
- // string constants
- static const string SPLIT_MAPPED_TOKEN = ".MAPPED";
- static const string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED";
- static const string SPLIT_PAIRED_TOKEN = ".PAIRED_END";
- static const string SPLIT_SINGLE_TOKEN = ".SINGLE_END";
- static const string SPLIT_REFERENCE_TOKEN = ".REF_";
-
- string GetTimestampString(void) {
-
- // get human readable timestamp
- time_t currentTime;
- time(&currentTime);
- stringstream timeStream("");
- timeStream << ctime(&currentTime);
-
- // convert whitespace to '_'
- string timeString = timeStream.str();
- size_t found = timeString.find(" ");
- while (found != string::npos) {
- timeString.replace(found, 1, "_");
- found = timeString.find(" ", found+1);
- }
- return timeString;
- }
-
- // remove copy of filename without extension
- // (so /path/to/file.txt becomes /path/to/file )
- string RemoveFilenameExtension(const string& filename) {
- size_t found = filename.rfind(".");
- return filename.substr(0, found);
+// string constants
+static const string SPLIT_MAPPED_TOKEN = ".MAPPED";
+static const string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED";
+static const string SPLIT_PAIRED_TOKEN = ".PAIRED_END";
+static const string SPLIT_SINGLE_TOKEN = ".SINGLE_END";
+static const string SPLIT_REFERENCE_TOKEN = ".REF_";
+
+// Returns the current local time as a single-line string in which every
+// space has been replaced by '_' (e.g. "Thu_Apr__7_12:34:56_2011").
+// Returns an empty string if the C library cannot format the current time.
+string GetTimestampString(void) {
+
+    // get human readable timestamp
+    time_t currentTime;
+    time(&currentTime);
+    stringstream timeStream("");
+    timeStream << ctime(&currentTime);
+
+    // ctime() terminates its result with '\n' - drop any trailing line break
+    // so it cannot leak into whatever the caller builds from this string
+    string timeString = timeStream.str();
+    const size_t lastKept = timeString.find_last_not_of("\r\n");
+    timeString.erase( lastKept == string::npos ? 0 : lastKept + 1 );
+
+    // convert whitespace to '_'
+    size_t found = timeString.find(" ");
+    while ( found != string::npos ) {
+        timeString.replace(found, 1, "_");
+        found = timeString.find(" ", found+1);
+    }
+    return timeString;
+}
+
+// returns a copy of filename with its extension removed
+// (so /path/to/file.txt becomes /path/to/file )
+//
+// Only the final path component is examined ('/' and '\' are both treated as
+// separators), so a '.' in a directory name is never mistaken for an extension:
+// ./reads stays ./reads and /data.v2/reads stays /data.v2/reads.
+// A name that merely starts with '.' (e.g. .hidden) is returned unchanged.
+string RemoveFilenameExtension(const string& filename) {
+
+    // locate the start of the final path component
+    const size_t lastSeparator = filename.find_last_of("/\\");
+    const size_t basenameStart = ( lastSeparator == string::npos ? 0 : lastSeparator + 1 );
+
+    // nothing to strip if the last '.' is missing, lies in a directory name,
+    // or is the first character of the final component
+    const size_t found = filename.rfind(".");
+    if ( found == string::npos || found <= basenameStart )
+        return filename;
+
+    return filename.substr(0, found);
+}
} // namespace BamTools
// ctor & dtor
public:
- SplitToolPrivate(SplitTool::SplitSettings* settings);
- ~SplitToolPrivate(void);
+ SplitToolPrivate(SplitTool::SplitSettings* settings)
+ : m_settings(settings)
+ { }
+
+ ~SplitToolPrivate(void) {
+ m_reader.Close();
+ }
// 'public' interface
public:
RefVector m_references;
};
-// constructor
-SplitTool::SplitToolPrivate::SplitToolPrivate(SplitTool::SplitSettings* settings)
- : m_settings(settings)
-{ }
-
-// destructor
-SplitTool::SplitToolPrivate::~SplitToolPrivate(void) {
- m_reader.Close();
-}
-
void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) {
// if user supplied output filename stub, use that
}
bool SplitTool::SplitToolPrivate::OpenReader(void) {
+
+ // attempt to open BAM file
if ( !m_reader.Open(m_settings->InputFilename) ) {
- cerr << "ERROR: SplitTool could not open BAM file: " << m_settings->InputFilename << endl;
+ cerr << "bamtools split ERROR: could not open BAM file: " << m_settings->InputFilename << endl;
return false;
}
+
+ // save file 'metadata' & return success
m_header = m_reader.GetHeaderText();
m_references = m_reader.GetReferenceData();
return true;
DetermineOutputFilenameStub();
// open up BamReader
- if ( !OpenReader() ) return false;
+ if ( !OpenReader() )
+ return false;
// determine split type from settings
if ( m_settings->IsSplittingMapped ) return SplitMapped();
if ( m_settings->IsSplittingTag ) return SplitTag();
// if we get here, no property was specified
- cerr << "No property given to split on... Please use -mapped, -paired, -reference, or -tag TAG to specifiy split behavior." << endl;
+ cerr << "bamtools split ERROR: no property given to split on... " << endl
+ << "Please use -mapped, -paired, -reference, or -tag TAG to specify desired split behavior." << endl;
return false;
}
if ( writerIter == outputFiles.end() ) {
// open new BamWriter
+ const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped
+ ? SPLIT_MAPPED_TOKEN
+ : SPLIT_UNMAPPED_TOKEN ) + ".bam";
writer = new BamWriter;
- const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped ? SPLIT_MAPPED_TOKEN : SPLIT_UNMAPPED_TOKEN ) + ".bam";
- writer->Open(outputFilename, m_header, m_references);
+ if ( !writer->Open(outputFilename, m_header, m_references) ) {
+ cerr << "bamtools split ERROR: could not open " << outputFilename
+ << " for writing." << endl;
+ return false;
+ }
// store in map
outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) );
else writer = (*writerIter).second;
// store alignment in proper BAM output file
- if ( writer )
+ if ( writer )
writer->SaveAlignment(al);
}
if ( writerIter == outputFiles.end() ) {
// open new BamWriter
+ const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired
+ ? SPLIT_PAIRED_TOKEN
+ : SPLIT_SINGLE_TOKEN ) + ".bam";
writer = new BamWriter;
- const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired ? SPLIT_PAIRED_TOKEN : SPLIT_SINGLE_TOKEN ) + ".bam";
- writer->Open(outputFilename, m_header, m_references);
+ if ( !writer->Open(outputFilename, m_header, m_references) ) {
+ cerr << "bamtools split ERROR: could not open " << outputFilename
+ << " for writing." << endl;
+ return false;
+ }
// store in map
outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) );
if ( writerIter == outputFiles.end() ) {
// open new BamWriter
- writer = new BamWriter;
const string refName = m_references.at(currentRefId).RefName;
const string outputFilename = m_outputFilenameStub + SPLIT_REFERENCE_TOKEN + refName + ".bam";
- writer->Open(outputFilename, m_header, m_references);
-
+ writer = new BamWriter;
+ if ( !writer->Open(outputFilename, m_header, m_references) ) {
+ cerr << "bamtools split ERROR: could not open " << outputFilename
+ << " for writing." << endl;
+ return false;
+ }
+
// store in map
outputFiles.insert( make_pair(currentRefId, writer) );
}
// look for tag in this alignment and get tag type
char tagType(0);
- if ( !al.GetTagType(m_settings->TagToSplit, tagType) ) continue;
+ if ( !al.GetTagType(m_settings->TagToSplit, tagType) )
+ continue;
// request split method based on tag type
// pass it the current alignment found
- switch (tagType) {
+ switch ( tagType ) {
- case 'c' :
- case 's' :
- case 'i' :
+ case (Constants::BAM_TAG_TYPE_INT8) :
+ case (Constants::BAM_TAG_TYPE_INT16) :
+ case (Constants::BAM_TAG_TYPE_INT32) :
return SplitTagImpl<int32_t>(al);
- case 'C' :
- case 'S' :
- case 'I' :
+ case (Constants::BAM_TAG_TYPE_UINT8) :
+ case (Constants::BAM_TAG_TYPE_UINT16) :
+ case (Constants::BAM_TAG_TYPE_UINT32) :
return SplitTagImpl<uint32_t>(al);
- case 'f' :
+ case (Constants::BAM_TAG_TYPE_FLOAT) :
return SplitTagImpl<float>(al);
- case 'A':
- case 'Z':
- case 'H':
+ case (Constants::BAM_TAG_TYPE_ASCII) :
+ case (Constants::BAM_TAG_TYPE_STRING) :
+ case (Constants::BAM_TAG_TYPE_HEX) :
return SplitTagImpl<string>(al);
default:
- fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", tagType);
+ fprintf(stderr, "bamtools split ERROR: unknown tag type encountered: [%c]\n", tagType);
return false;
}
}
// --------------------------------------------------------------------------------
// template method implementation
-// N.B. - *technical note* - use of template methods defined in ".cpp" goes against normal practices
-// but works here because these are purely internal (no one can call from outside this file)
+// *Technical Note* - use of template methods declared & defined in ".cpp" file
+// goes against normal practices, but works here because these
+// are purely internal (no one can call from outside this file)
// close BamWriters & delete pointers
template<typename T>
typedef map<T, BamWriter*> WriterMap;
typedef typename WriterMap::iterator WriterMapIterator;
+ // iterate over writers
WriterMapIterator writerIter = writers.begin();
WriterMapIterator writerEnd = writers.end();
for ( ; writerIter != writerEnd; ++writerIter ) {
BamWriter* writer = (*writerIter).second;
- if (writer == 0 ) continue;
+ if ( writer == 0 ) continue;
+
+ // close BamWriter
writer->Close();
+
+ // destroy BamWriter
delete writer;
writer = 0;
}
+
+ // clear the container (destroying the items doesn't remove them)
writers.clear();
}
if ( al.GetTag(tag, currentValue) ) {
// open new BamWriter, save first alignment
- writer = new BamWriter;
outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
- writer->Open(outputFilenameStream.str(), m_header, m_references);
+ writer = new BamWriter;
+ if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
+ cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str()
+ << " for writing." << endl;
+ return false;
+ }
writer->SaveAlignment(al);
// store in map
if ( writerIter == outputFiles.end() ) {
// open new BamWriter
- writer = new BamWriter;
outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
- writer->Open(outputFilenameStream.str(), m_header, m_references);
-
+ writer = new BamWriter;
+ if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
+ cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str()
+ << " for writing." << endl;
+ return false;
+ }
+
// store in map
outputFiles.insert( make_pair(currentValue, writer) );
, m_impl(0)
{
// set program details
- Options::SetProgramInfo("bamtools split", "splits a BAM file on user-specified property, creating a new BAM output file for each value found", "[-in <filename>] [-stub <filename>] < -mapped | -paired | -reference | -tag <TAG> > ");
+ Options::SetProgramInfo("bamtools split", "splits a BAM file on user-specified property, creating a new BAM output file for each value found", "[-in <filename>] [-stub <filename stub>] < -mapped | -paired | -reference | -tag <TAG> > ");
// set up options
OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
- Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
+ Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts);
OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options");
// parse command line arguments
Options::Parse(argc, argv, 1);
- // initialize internal implementation
+ // initialize SplitTool with settings
m_impl = new SplitToolPrivate(m_settings);
- // run tool, return success/fail
+ // run SplitTool, return success/fail
if ( m_impl->Run() )
return 0;
else