1 // ***************************************************************************
2 // bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 20 September 2010 (DB)
7 // ---------------------------------------------------------------------------
9 // ***************************************************************************
17 #include "bamtools_split.h"
18 #include "bamtools_options.h"
19 #include "bamtools_variant.h"
20 #include "BamReader.h"
21 #include "BamWriter.h"
23 using namespace BamTools;
28 static const string SPLIT_MAPPED_TOKEN = ".MAPPED";
29 static const string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED";
30 static const string SPLIT_PAIRED_TOKEN = ".PAIRED_END";
31 static const string SPLIT_SINGLE_TOKEN = ".SINGLE_END";
32 static const string SPLIT_REFERENCE_TOKEN = ".REF_";
34 string GetTimestampString(void) {
36 // get human readable timestamp
39 stringstream timeStream("");
40 timeStream << ctime(¤tTime);
42 // convert whitespace to '_'
43 string timeString = timeStream.str();
44 size_t found = timeString.find(" ");
45 while (found != string::npos) {
46 timeString.replace(found, 1, "_");
47 found = timeString.find(" ", found+1);
52 // remove copy of filename without extension
53 // (so /path/to/file.txt becomes /path/to/file )
54 string RemoveFilenameExtension(const string& filename) {
55 size_t found = filename.rfind(".");
56 return filename.substr(0, found);
59 } // namespace BamTools
61 // ---------------------------------------------
62 // SplitSettings implementation
64 struct SplitTool::SplitSettings {
67 bool HasInputFilename;
68 bool HasCustomOutputStub;
69 bool IsSplittingMapped;
70 bool IsSplittingPaired;
71 bool IsSplittingReference;
75 string CustomOutputStub;
81 : HasInputFilename(false)
82 , HasCustomOutputStub(false)
83 , IsSplittingMapped(false)
84 , IsSplittingPaired(false)
85 , IsSplittingReference(false)
86 , IsSplittingTag(false)
87 , CustomOutputStub("")
88 , InputFilename(Options::StandardIn())
93 // ---------------------------------------------
94 // SplitToolPrivate declaration
96 class SplitTool::SplitToolPrivate {
100 SplitToolPrivate(SplitTool::SplitSettings* settings);
101 ~SplitToolPrivate(void);
103 // 'public' interface
109 // close & delete BamWriters in map
111 void CloseWriters(map<T, BamWriter*>& writers);
112 // calculate output stub based on IO args given
113 void DetermineOutputFilenameStub(void);
114 // open our BamReader
115 bool OpenReader(void);
116 // split alignments in BAM file based on isMapped property
117 bool SplitMapped(void);
118 // split alignments in BAM file based on isPaired property
119 bool SplitPaired(void);
120 // split alignments in BAM file based on refID property
121 bool SplitReference(void);
122 // finds first alignment and calls corresponding SplitTagImpl<>
123 // depending on tag type
125 // templated split tag implementation
126 // handle the various types that are possible for tags
128 bool SplitTagImpl(BamAlignment& al);
132 SplitTool::SplitSettings* m_settings;
133 string m_outputFilenameStub;
136 RefVector m_references;
140 SplitTool::SplitToolPrivate::SplitToolPrivate(SplitTool::SplitSettings* settings)
141 : m_settings(settings)
145 SplitTool::SplitToolPrivate::~SplitToolPrivate(void) {
149 void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) {
151 // if user supplied output filename stub, use that
152 if ( m_settings->HasCustomOutputStub )
153 m_outputFilenameStub = m_settings->CustomOutputStub;
155 // else if user supplied input BAM filename, use that (minus ".bam" extension) as stub
156 else if ( m_settings->HasInputFilename )
157 m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename);
159 // otherwise, user did not specify -stub, and input is coming from STDIN
160 // generate stub from timestamp
161 else m_outputFilenameStub = GetTimestampString();
164 bool SplitTool::SplitToolPrivate::OpenReader(void) {
165 if ( !m_reader.Open(m_settings->InputFilename) ) {
166 cerr << "ERROR: SplitTool could not open BAM file: " << m_settings->InputFilename << endl;
169 m_header = m_reader.GetHeaderText();
170 m_references = m_reader.GetReferenceData();
174 bool SplitTool::SplitToolPrivate::Run(void) {
176 // determine output stub
177 DetermineOutputFilenameStub();
180 if ( !OpenReader() ) return false;
182 // determine split type from settings
183 if ( m_settings->IsSplittingMapped ) return SplitMapped();
184 if ( m_settings->IsSplittingPaired ) return SplitPaired();
185 if ( m_settings->IsSplittingReference ) return SplitReference();
186 if ( m_settings->IsSplittingTag ) return SplitTag();
188 // if we get here, no property was specified
189 cerr << "No property given to split on... Please use -mapped, -paired, -reference, or -tag TAG to specifiy split behavior." << endl;
193 bool SplitTool::SplitToolPrivate::SplitMapped(void) {
195 // set up splitting data structure
196 map<bool, BamWriter*> outputFiles;
197 map<bool, BamWriter*>::iterator writerIter;
199 // iterate through alignments
202 bool isCurrentAlignmentMapped;
203 while ( m_reader.GetNextAlignment(al) ) {
205 // see if bool value exists
206 isCurrentAlignmentMapped = al.IsMapped();
207 writerIter = outputFiles.find(isCurrentAlignmentMapped);
209 // if no writer associated with this value
210 if ( writerIter == outputFiles.end() ) {
212 // open new BamWriter
213 writer = new BamWriter;
214 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped ? SPLIT_MAPPED_TOKEN : SPLIT_UNMAPPED_TOKEN ) + ".bam";
215 writer->Open(outputFilename, m_header, m_references);
218 outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) );
221 // else grab corresponding writer
222 else writer = (*writerIter).second;
224 // store alignment in proper BAM output file
226 writer->SaveAlignment(al);
229 // clean up BamWriters
230 CloseWriters(outputFiles);
236 bool SplitTool::SplitToolPrivate::SplitPaired(void) {
238 // set up splitting data structure
239 map<bool, BamWriter*> outputFiles;
240 map<bool, BamWriter*>::iterator writerIter;
242 // iterate through alignments
245 bool isCurrentAlignmentPaired;
246 while ( m_reader.GetNextAlignment(al) ) {
248 // see if bool value exists
249 isCurrentAlignmentPaired = al.IsPaired();
250 writerIter = outputFiles.find(isCurrentAlignmentPaired);
252 // if no writer associated with this value
253 if ( writerIter == outputFiles.end() ) {
255 // open new BamWriter
256 writer = new BamWriter;
257 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired ? SPLIT_PAIRED_TOKEN : SPLIT_SINGLE_TOKEN ) + ".bam";
258 writer->Open(outputFilename, m_header, m_references);
261 outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) );
264 // else grab corresponding writer
265 else writer = (*writerIter).second;
267 // store alignment in proper BAM output file
269 writer->SaveAlignment(al);
272 // clean up BamWriters
273 CloseWriters(outputFiles);
279 bool SplitTool::SplitToolPrivate::SplitReference(void) {
281 // set up splitting data structure
282 map<int32_t, BamWriter*> outputFiles;
283 map<int32_t, BamWriter*>::iterator writerIter;
285 // iterate through alignments
288 int32_t currentRefId;
289 while ( m_reader.GetNextAlignment(al) ) {
291 // see if bool value exists
292 currentRefId = al.RefID;
293 writerIter = outputFiles.find(currentRefId);
295 // if no writer associated with this value
296 if ( writerIter == outputFiles.end() ) {
298 // open new BamWriter
299 writer = new BamWriter;
300 const string refName = m_references.at(currentRefId).RefName;
301 const string outputFilename = m_outputFilenameStub + SPLIT_REFERENCE_TOKEN + refName + ".bam";
302 writer->Open(outputFilename, m_header, m_references);
305 outputFiles.insert( make_pair(currentRefId, writer) );
308 // else grab corresponding writer
309 else writer = (*writerIter).second;
311 // store alignment in proper BAM output file
313 writer->SaveAlignment(al);
316 // clean up BamWriters
317 CloseWriters(outputFiles);
323 // finds first alignment and calls corresponding SplitTagImpl<>() depending on tag type
324 bool SplitTool::SplitToolPrivate::SplitTag(void) {
326 // iterate through alignments, until we hit TAG
328 while ( m_reader.GetNextAlignment(al) ) {
330 // look for tag in this alignment and get tag type
332 if ( !al.GetTagType(m_settings->TagToSplit, tagType) ) continue;
334 // request split method based on tag type
335 // pass it the current alignment found
341 return SplitTagImpl<int32_t>(al);
346 return SplitTagImpl<uint32_t>(al);
349 return SplitTagImpl<float>(al);
354 return SplitTagImpl<string>(al);
357 fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", tagType);
362 // tag not found, but that's not an error - return success
366 // --------------------------------------------------------------------------------
367 // template method implementation
368 // N.B. - *technical note* - use of template methods defined in ".cpp" goes against normal practices
369 // but works here because these are purely internal (no one can call from outside this file)
371 // close BamWriters & delete pointers
373 void SplitTool::SplitToolPrivate::CloseWriters(map<T, BamWriter*>& writers) {
375 typedef map<T, BamWriter*> WriterMap;
376 typedef typename WriterMap::iterator WriterMapIterator;
378 WriterMapIterator writerIter = writers.begin();
379 WriterMapIterator writerEnd = writers.end();
380 for ( ; writerIter != writerEnd; ++writerIter ) {
381 BamWriter* writer = (*writerIter).second;
382 if (writer == 0 ) continue;
390 // handle the various types that are possible for tags
392 bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) {
394 typedef T TagValueType;
395 typedef map<TagValueType, BamWriter*> WriterMap;
396 typedef typename WriterMap::iterator WriterMapIterator;
398 // set up splitting data structure
399 WriterMap outputFiles;
400 WriterMapIterator writerIter;
403 const string tag = m_settings->TagToSplit;
405 stringstream outputFilenameStream("");
406 TagValueType currentValue;
408 // retrieve first alignment tag value
409 if ( al.GetTag(tag, currentValue) ) {
411 // open new BamWriter, save first alignment
412 writer = new BamWriter;
413 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
414 writer->Open(outputFilenameStream.str(), m_header, m_references);
415 writer->SaveAlignment(al);
418 outputFiles.insert( make_pair(currentValue, writer) );
421 outputFilenameStream.str("");
424 // iterate through remaining alignments
425 while ( m_reader.GetNextAlignment(al) ) {
427 // skip if this alignment doesn't have TAG
428 if ( !al.GetTag(tag, currentValue) ) continue;
430 // look up tag value in map
431 writerIter = outputFiles.find(currentValue);
433 // if no writer associated with this value
434 if ( writerIter == outputFiles.end() ) {
436 // open new BamWriter
437 writer = new BamWriter;
438 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
439 writer->Open(outputFilenameStream.str(), m_header, m_references);
442 outputFiles.insert( make_pair(currentValue, writer) );
445 outputFilenameStream.str("");
448 // else grab corresponding writer
449 else writer = (*writerIter).second;
451 // store alignment in proper BAM output file
453 writer->SaveAlignment(al);
456 // clean up BamWriters
457 CloseWriters(outputFiles);
463 // ---------------------------------------------
464 // SplitTool implementation
466 SplitTool::SplitTool(void)
468 , m_settings(new SplitSettings)
471 // set program details
472 Options::SetProgramInfo("bamtools split", "splits a BAM file on user-specified property, creating a new BAM output file for each value found", "[-in <filename>] [-stub <filename stub>] < -mapped | -paired | -reference | -tag <TAG> > ");
475 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
476 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
477 Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts);
479 OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options");
480 Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped, SplitOpts);
481 Options::AddOption("-paired", "split single-end/paired-end alignments", m_settings->IsSplittingPaired, SplitOpts);
482 Options::AddOption("-reference", "split alignments by reference", m_settings->IsSplittingReference, SplitOpts);
483 Options::AddValueOption("-tag", "tag name", "splits alignments based on all values of TAG encountered (i.e. -tag RG creates a BAM file for each read group in original BAM file)", "",
484 m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts);
487 SplitTool::~SplitTool(void) {
496 int SplitTool::Help(void) {
497 Options::DisplayHelp();
501 int SplitTool::Run(int argc, char* argv[]) {
503 // parse command line arguments
504 Options::Parse(argc, argv, 1);
506 // initialize internal implementation
507 m_impl = new SplitToolPrivate(m_settings);
509 // run tool, return success/fail