1 // ***************************************************************************
2 // bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 19 September 2010 (DB)
7 // ---------------------------------------------------------------------------
9 // ***************************************************************************
17 #include "bamtools_split.h"
18 #include "bamtools_options.h"
19 #include "bamtools_variant.h"
20 #include "BamReader.h"
21 #include "BamWriter.h"
23 using namespace BamTools;
28 static const string SPLIT_MAPPED_TOKEN = ".MAPPED";
29 static const string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED";
30 static const string SPLIT_PAIRED_TOKEN = ".PAIRED_END";
31 static const string SPLIT_SINGLE_TOKEN = ".SINGLE_END";
32 static const string SPLIT_REFERENCE_TOKEN = ".REF_";
34 string GetTimestampString(void) {
36 // get human readable timestamp
39 stringstream timeStream("");
40 timeStream << ctime(¤tTime);
42 // convert whitespace to '_'
43 string timeString = timeStream.str();
44 size_t found = timeString.find(" ");
45 while (found != string::npos) {
46 timeString.replace(found, 1, "_");
47 found = timeString.find(" ", found+1);
52 // remove copy of filename without extension
53 // (so /path/to/file.txt becomes /path/to/file )
54 string RemoveFilenameExtension(const string& filename) {
55 size_t found = filename.rfind(".");
56 return filename.substr(0, found);
59 } // namespace BamTools
61 // ---------------------------------------------
62 // SplitSettings implementation
64 struct SplitTool::SplitSettings {
67 bool HasInputFilename;
68 bool HasCustomOutputStub;
69 bool IsSplittingMapped;
70 bool IsSplittingPaired;
71 bool IsSplittingReference;
75 string CustomOutputStub;
81 : HasInputFilename(false)
82 , HasCustomOutputStub(false)
83 , IsSplittingMapped(false)
84 , IsSplittingPaired(false)
85 , IsSplittingReference(false)
86 , IsSplittingTag(false)
87 , CustomOutputStub("")
88 , InputFilename(Options::StandardIn())
93 // ---------------------------------------------
94 // SplitToolPrivate declaration
96 class SplitTool::SplitToolPrivate {
100 SplitToolPrivate(SplitTool::SplitSettings* settings);
101 ~SplitToolPrivate(void);
103 // 'public' interface
109 void DetermineOutputFilenameStub(void);
110 bool OpenReader(void);
111 bool SplitMapped(void);
112 bool SplitPaired(void);
113 bool SplitReference(void);
115 bool SplitTag_Int(BamAlignment& al);
116 bool SplitTag_UInt(BamAlignment& al);
117 bool SplitTag_Real(BamAlignment& al);
118 bool SplitTag_String(BamAlignment& al);
122 SplitTool::SplitSettings* m_settings;
123 string m_outputFilenameStub;
126 RefVector m_references;
130 SplitTool::SplitToolPrivate::SplitToolPrivate(SplitTool::SplitSettings* settings)
131 : m_settings(settings)
135 SplitTool::SplitToolPrivate::~SplitToolPrivate(void) {
139 void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) {
141 // if user supplied output filename stub, use that
142 if ( m_settings->HasCustomOutputStub )
143 m_outputFilenameStub = m_settings->CustomOutputStub;
145 // else if user supplied input BAM filename, use that (minus ".bam" extension) as stub
146 else if ( m_settings->HasInputFilename )
147 m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename);
149 // otherwise, user did not specify -stub, and input is coming from STDIN
150 // generate stub from timestamp
151 else m_outputFilenameStub = GetTimestampString();
154 bool SplitTool::SplitToolPrivate::OpenReader(void) {
155 if ( !m_reader.Open(m_settings->InputFilename) ) {
156 cerr << "ERROR: SplitTool could not open BAM file: " << m_settings->InputFilename << endl;
159 m_header = m_reader.GetHeaderText();
160 m_references = m_reader.GetReferenceData();
164 bool SplitTool::SplitToolPrivate::Run(void) {
166 // determine output stub
167 DetermineOutputFilenameStub();
170 if ( !OpenReader() ) return false;
172 // determine split type from settings
173 if ( m_settings->IsSplittingMapped ) return SplitMapped();
174 if ( m_settings->IsSplittingPaired ) return SplitPaired();
175 if ( m_settings->IsSplittingReference ) return SplitReference();
176 if ( m_settings->IsSplittingTag ) return SplitTag();
178 // if we get here, no property was specified
179 cerr << "No property given to split on... Please use -mapped, -paired, -reference, or -tag TAG to specifiy split behavior." << endl;
183 bool SplitTool::SplitToolPrivate::SplitMapped(void) {
185 // set up splitting data structure
186 map<bool, BamWriter*> outputFiles;
187 map<bool, BamWriter*>::iterator writerIter;
189 // iterate through alignments
192 bool isCurrentAlignmentMapped;
193 while ( m_reader.GetNextAlignment(al) ) {
195 // see if bool value exists
196 isCurrentAlignmentMapped = al.IsMapped();
197 writerIter = outputFiles.find(isCurrentAlignmentMapped);
199 // if no writer associated with this value
200 if ( writerIter == outputFiles.end() ) {
202 // open new BamWriter
203 writer = new BamWriter;
204 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped ? SPLIT_MAPPED_TOKEN : SPLIT_UNMAPPED_TOKEN ) + ".bam";
205 writer->Open(outputFilename, m_header, m_references);
208 outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) );
211 // else grab corresponding writer
212 else writer = (*writerIter).second;
214 // store alignment in proper BAM output file
216 writer->SaveAlignment(al);
219 // clean up BamWriters
220 map<bool, BamWriter*>::iterator writerEnd = outputFiles.end();
221 for ( writerIter = outputFiles.begin() ; writerIter != writerEnd; ++writerIter ) {
222 BamWriter* writer = (*writerIter).second;
223 if ( writer == 0 ) continue;
233 bool SplitTool::SplitToolPrivate::SplitPaired(void) {
235 // set up splitting data structure
236 map<bool, BamWriter*> outputFiles;
237 map<bool, BamWriter*>::iterator writerIter;
239 // iterate through alignments
242 bool isCurrentAlignmentPaired;
243 while ( m_reader.GetNextAlignment(al) ) {
245 // see if bool value exists
246 isCurrentAlignmentPaired = al.IsPaired();
247 writerIter = outputFiles.find(isCurrentAlignmentPaired);
249 // if no writer associated with this value
250 if ( writerIter == outputFiles.end() ) {
252 // open new BamWriter
253 writer = new BamWriter;
254 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired ? SPLIT_PAIRED_TOKEN : SPLIT_SINGLE_TOKEN ) + ".bam";
255 writer->Open(outputFilename, m_header, m_references);
258 outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) );
261 // else grab corresponding writer
262 else writer = (*writerIter).second;
264 // store alignment in proper BAM output file
266 writer->SaveAlignment(al);
269 // clean up BamWriters
270 map<bool, BamWriter*>::iterator writerEnd = outputFiles.end();
271 for ( writerIter = outputFiles.begin() ; writerIter != writerEnd; ++writerIter ) {
272 BamWriter* writer = (*writerIter).second;
273 if (writer == 0 ) continue;
283 bool SplitTool::SplitToolPrivate::SplitReference(void) {
285 // set up splitting data structure
286 map<int32_t, BamWriter*> outputFiles;
287 map<int32_t, BamWriter*>::iterator writerIter;
289 // iterate through alignments
292 int32_t currentRefId;
293 while ( m_reader.GetNextAlignment(al) ) {
295 // see if bool value exists
296 currentRefId = al.RefID;
297 writerIter = outputFiles.find(currentRefId);
299 // if no writer associated with this value
300 if ( writerIter == outputFiles.end() ) {
302 // open new BamWriter
303 writer = new BamWriter;
304 const string refName = m_references.at(currentRefId).RefName;
305 const string outputFilename = m_outputFilenameStub + SPLIT_REFERENCE_TOKEN + refName + ".bam";
306 writer->Open(outputFilename, m_header, m_references);
309 outputFiles.insert( make_pair(currentRefId, writer) );
312 // else grab corresponding writer
313 else writer = (*writerIter).second;
315 // store alignment in proper BAM output file
317 writer->SaveAlignment(al);
320 // clean up BamWriters
321 map<int32_t, BamWriter*>::iterator writerEnd = outputFiles.end();
322 for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
323 BamWriter* writer = (*writerIter).second;
324 if (writer == 0 ) continue;
334 bool SplitTool::SplitToolPrivate::SplitTag(void) {
336 // iterate through alignments, until we hit TAG
338 while ( m_reader.GetNextAlignment(al) ) {
340 // look for tag in this alignment and get tag type
342 if ( !al.GetTagType(m_settings->TagToSplit, tagType) ) continue;
344 // request split method based on tag type
345 // pass it the current alignment found
351 return SplitTag_Int(al);
356 return SplitTag_UInt(al);
359 return SplitTag_Real(al);
364 return SplitTag_String(al);
367 fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", tagType);
372 // tag not found, but that's not an error - return success
376 bool SplitTool::SplitToolPrivate::SplitTag_Int(BamAlignment& al) {
378 // set up splitting data structure
379 map<int32_t, BamWriter*> outputFiles;
380 map<int32_t, BamWriter*>::iterator writerIter;
383 const string tag = m_settings->TagToSplit;
385 stringstream outputFilenameStream("");
386 int32_t currentValue;
388 // retrieve first alignment tag value
389 if ( al.GetTag(tag, currentValue) ) {
391 // open new BamWriter, save first alignment
392 writer = new BamWriter;
393 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
394 writer->Open(outputFilenameStream.str(), m_header, m_references);
395 writer->SaveAlignment(al);
398 outputFiles.insert( make_pair(currentValue, writer) );
401 outputFilenameStream.str("");
404 // iterate through remaining alignments
405 while ( m_reader.GetNextAlignment(al) ) {
407 // skip if this alignment doesn't have TAG
408 if ( !al.GetTag(tag, currentValue) ) continue;
410 // look up tag value in map
411 writerIter = outputFiles.find(currentValue);
413 // if no writer associated with this value
414 if ( writerIter == outputFiles.end() ) {
416 // open new BamWriter
417 writer = new BamWriter;
418 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
419 writer->Open(outputFilenameStream.str(), m_header, m_references);
422 outputFiles.insert( make_pair(currentValue, writer) );
425 outputFilenameStream.str("");
428 // else grab corresponding writer
429 else writer = (*writerIter).second;
431 // store alignment in proper BAM output file
433 writer->SaveAlignment(al);
436 // clean up BamWriters
437 map<int32_t, BamWriter*>::iterator writerEnd = outputFiles.end();
438 for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
439 BamWriter* writer = (*writerIter).second;
440 if (writer == 0 ) continue;
450 bool SplitTool::SplitToolPrivate::SplitTag_UInt(BamAlignment& al) {
452 // set up splitting data structure
453 map<uint32_t, BamWriter*> outputFiles;
454 map<uint32_t, BamWriter*>::iterator writerIter;
457 const string tag = m_settings->TagToSplit;
459 stringstream outputFilenameStream("");
460 uint32_t currentValue;
462 int alignmentsIgnored = 0;
464 // retrieve first alignment tag value
465 if ( al.GetTag(tag, currentValue) ) {
467 // open new BamWriter, save first alignment
468 writer = new BamWriter;
469 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
470 writer->Open(outputFilenameStream.str(), m_header, m_references);
471 writer->SaveAlignment(al);
474 outputFiles.insert( make_pair(currentValue, writer) );
477 outputFilenameStream.str("");
478 } else ++alignmentsIgnored;
480 // iterate through remaining alignments
481 while ( m_reader.GetNextAlignment(al) ) {
483 // skip if this alignment doesn't have TAG
484 if ( !al.GetTag(tag, currentValue) ) { ++alignmentsIgnored; continue; }
486 // look up tag value in map
487 writerIter = outputFiles.find(currentValue);
489 // if no writer associated with this value
490 if ( writerIter == outputFiles.end() ) {
492 // open new BamWriter
493 writer = new BamWriter;
494 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
495 writer->Open(outputFilenameStream.str(), m_header, m_references);
498 outputFiles.insert( make_pair(currentValue, writer) );
501 outputFilenameStream.str("");
504 // else grab corresponding writer
505 else writer = (*writerIter).second;
507 // store alignment in proper BAM output file
509 writer->SaveAlignment(al);
512 // clean up BamWriters
513 map<uint32_t, BamWriter*>::iterator writerEnd = outputFiles.end();
514 for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
515 BamWriter* writer = (*writerIter).second;
516 if (writer == 0 ) continue;
526 bool SplitTool::SplitToolPrivate::SplitTag_Real(BamAlignment& al) {
528 // set up splitting data structure
529 map<float, BamWriter*> outputFiles;
530 map<float, BamWriter*>::iterator writerIter;
533 const string tag = m_settings->TagToSplit;
535 stringstream outputFilenameStream("");
538 // retrieve first alignment tag value
539 if ( al.GetTag(tag, currentValue) ) {
541 // open new BamWriter, save first alignment
542 writer = new BamWriter;
543 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
544 writer->Open(outputFilenameStream.str(), m_header, m_references);
545 writer->SaveAlignment(al);
548 outputFiles.insert( make_pair(currentValue, writer) );
551 outputFilenameStream.str("");
554 // iterate through remaining alignments
555 while ( m_reader.GetNextAlignment(al) ) {
557 // skip if this alignment doesn't have TAG
558 if ( !al.GetTag(tag, currentValue) ) continue;
560 // look up tag value in map
561 writerIter = outputFiles.find(currentValue);
563 // if no writer associated with this value
564 if ( writerIter == outputFiles.end() ) {
566 // open new BamWriter
567 writer = new BamWriter;
568 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
569 writer->Open(outputFilenameStream.str(), m_header, m_references);
572 outputFiles.insert( make_pair(currentValue, writer) );
575 outputFilenameStream.str("");
578 // else grab corresponding writer
579 else writer = (*writerIter).second;
581 // store alignment in proper BAM output file
583 writer->SaveAlignment(al);
586 // clean up BamWriters
587 map<float, BamWriter*>::iterator writerEnd = outputFiles.end();
588 for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
589 BamWriter* writer = (*writerIter).second;
590 if (writer == 0 ) continue;
600 bool SplitTool::SplitToolPrivate::SplitTag_String(BamAlignment& al) {
602 // set up splitting data structure
603 map<string, BamWriter*> outputFiles;
604 map<string, BamWriter*>::iterator writerIter;
607 const string tag = m_settings->TagToSplit;
609 stringstream outputFilenameStream("");
612 // retrieve first alignment tag value
613 if ( al.GetTag(tag, currentValue) ) {
615 // open new BamWriter, save first alignment
616 writer = new BamWriter;
617 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
618 writer->Open(outputFilenameStream.str(), m_header, m_references);
619 writer->SaveAlignment(al);
622 outputFiles.insert( make_pair(currentValue, writer) );
625 outputFilenameStream.str("");
628 // iterate through remaining alignments
629 while ( m_reader.GetNextAlignment(al) ) {
631 // skip if this alignment doesn't have TAG
632 if ( !al.GetTag(tag, currentValue) ) continue;
634 // look up tag value in map
635 writerIter = outputFiles.find(currentValue);
637 // if no writer associated with this value
638 if ( writerIter == outputFiles.end() ) {
640 // open new BamWriter
641 writer = new BamWriter;
642 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
643 writer->Open(outputFilenameStream.str(), m_header, m_references);
646 outputFiles.insert( make_pair(currentValue, writer) );
649 outputFilenameStream.str("");
652 // else grab corresponding writer
653 else writer = (*writerIter).second;
655 // store alignment in proper BAM output file
657 writer->SaveAlignment(al);
660 // clean up BamWriters
661 map<string, BamWriter*>::iterator writerEnd = outputFiles.end();
662 for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
663 BamWriter* writer = (*writerIter).second;
664 if (writer == 0 ) continue;
674 // ---------------------------------------------
675 // SplitTool implementation
677 SplitTool::SplitTool(void)
679 , m_settings(new SplitSettings)
682 // set program details
683 Options::SetProgramInfo("bamtools split", "splits a BAM file on user-specified property, creating a new BAM output file for each value found", "[-in <filename>] [-stub <filename>] < -mapped | -paired | -reference | -tag <TAG> > ");
686 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
687 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
688 Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts);
690 OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options");
691 Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped, SplitOpts);
692 Options::AddOption("-paired", "split single-end/paired-end alignments", m_settings->IsSplittingPaired, SplitOpts);
693 Options::AddOption("-reference", "split alignments by reference", m_settings->IsSplittingReference, SplitOpts);
694 Options::AddValueOption("-tag", "tag name", "splits alignments based on all values of TAG encountered (i.e. -tag RG creates a BAM file for each read group in original BAM file)", "",
695 m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts);
698 SplitTool::~SplitTool(void) {
707 int SplitTool::Help(void) {
708 Options::DisplayHelp();
712 int SplitTool::Run(int argc, char* argv[]) {
714 // parse command line arguments
715 Options::Parse(argc, argv, 1);
717 // initialize internal implementation
718 m_impl = new SplitToolPrivate(m_settings);
720 // run tool, return success/fail