]> git.donarmstrong.com Git - bamtools.git/blob - src/toolkit/bamtools_split.cpp
Added implementation of new SplitTool. This tool splits a single BAM file into multi...
[bamtools.git] / src / toolkit / bamtools_split.cpp
1 // ***************************************************************************
2 // bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 19 September 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // 
9 // ***************************************************************************
10
11 #include <ctime>
12 #include <iostream>
13 #include <map>
14 #include <sstream>
15 #include <string>
16 #include <vector>
17 #include "bamtools_split.h"
18 #include "bamtools_options.h"
19 #include "bamtools_variant.h"
20 #include "BamReader.h"
21 #include "BamWriter.h"
22 using namespace std;
23 using namespace BamTools;
24
25 namespace BamTools {
26   
27     // string constants
28     static const string SPLIT_MAPPED_TOKEN    = ".MAPPED";
29     static const string SPLIT_UNMAPPED_TOKEN  = ".UNMAPPED";
30     static const string SPLIT_PAIRED_TOKEN    = ".PAIRED_END";
31     static const string SPLIT_SINGLE_TOKEN    = ".SINGLE_END";
32     static const string SPLIT_REFERENCE_TOKEN = ".REF_";
33
34     string GetTimestampString(void) {
35       
36         // get human readable timestamp
37         time_t currentTime;
38         time(&currentTime);
39         stringstream timeStream("");
40         timeStream << ctime(&currentTime);
41         
42         // convert whitespace to '_'
43         string timeString = timeStream.str();
44         size_t found = timeString.find(" ");
45         while (found != string::npos) {
46             timeString.replace(found, 1, "_");
47             found = timeString.find(" ", found+1);
48         }
49         return timeString;
50     }
51     
52     // remove copy of filename without extension 
53     // (so /path/to/file.txt becomes /path/to/file )
54     string RemoveFilenameExtension(const string& filename) {
55         size_t found = filename.rfind(".");
56         return filename.substr(0, found);
57     }
58     
59 } // namespace BamTools
60
61 // ---------------------------------------------
62 // SplitSettings implementation
63
64 struct SplitTool::SplitSettings {
65
66     // flags
67     bool HasInputFilename;
68     bool HasCustomOutputStub;
69     bool IsSplittingMapped;
70     bool IsSplittingPaired;
71     bool IsSplittingReference;
72     bool IsSplittingTag;
73     
74     // string args
75     string CustomOutputStub;
76     string InputFilename;
77     string TagToSplit;
78     
79     // constructor
80     SplitSettings(void)
81         : HasInputFilename(false)
82         , HasCustomOutputStub(false)
83         , IsSplittingMapped(false)
84         , IsSplittingPaired(false)
85         , IsSplittingReference(false)
86         , IsSplittingTag(false)
87         , CustomOutputStub("")
88         , InputFilename(Options::StandardIn())
89         , TagToSplit("")
90     { } 
91 };  
92
93 // ---------------------------------------------
94 // SplitToolPrivate declaration
95
96 class SplitTool::SplitToolPrivate {
97       
98     // ctor & dtor
99     public:
100         SplitToolPrivate(SplitTool::SplitSettings* settings);
101         ~SplitToolPrivate(void);
102         
103     // 'public' interface
104     public:
105         bool Run(void);
106         
107     // internal methods
108     private:
109         void DetermineOutputFilenameStub(void);
110         bool OpenReader(void);
111         bool SplitMapped(void);
112         bool SplitPaired(void);
113         bool SplitReference(void);
114         bool SplitTag(void);
115         bool SplitTag_Int(BamAlignment& al);
116         bool SplitTag_UInt(BamAlignment& al);
117         bool SplitTag_Real(BamAlignment& al);
118         bool SplitTag_String(BamAlignment& al);
119         
120     // data members
121     private:
122         SplitTool::SplitSettings* m_settings;
123         string m_outputFilenameStub;
124         BamReader m_reader;
125         string m_header;
126         RefVector m_references;
127 };
128
129 // constructor
130 SplitTool::SplitToolPrivate::SplitToolPrivate(SplitTool::SplitSettings* settings) 
131     : m_settings(settings)
132 { }
133
134 // destructor
135 SplitTool::SplitToolPrivate::~SplitToolPrivate(void) { 
136     m_reader.Close();
137 }
138
139 void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) {
140   
141     // if user supplied output filename stub, use that
142     if ( m_settings->HasCustomOutputStub ) 
143         m_outputFilenameStub = m_settings->CustomOutputStub;
144     
145     // else if user supplied input BAM filename, use that (minus ".bam" extension) as stub
146     else if ( m_settings->HasInputFilename )
147         m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename);
148         
149     // otherwise, user did not specify -stub, and input is coming from STDIN
150     // generate stub from timestamp
151     else m_outputFilenameStub = GetTimestampString();      
152 }
153
154 bool SplitTool::SplitToolPrivate::OpenReader(void) {
155     if ( !m_reader.Open(m_settings->InputFilename) ) {
156         cerr << "ERROR: SplitTool could not open BAM file: " << m_settings->InputFilename << endl;
157         return false;
158     }
159     m_header     = m_reader.GetHeaderText();
160     m_references = m_reader.GetReferenceData();
161     return true;
162 }
163
164 bool SplitTool::SplitToolPrivate::Run(void) {
165   
166     // determine output stub
167     DetermineOutputFilenameStub();
168
169     // open up BamReader
170     if ( !OpenReader() ) return false;
171     
172     // determine split type from settings
173     if ( m_settings->IsSplittingMapped )    return SplitMapped();
174     if ( m_settings->IsSplittingPaired )    return SplitPaired();
175     if ( m_settings->IsSplittingReference ) return SplitReference();
176     if ( m_settings->IsSplittingTag )       return SplitTag();
177
178     // if we get here, no property was specified 
179     cerr << "No property given to split on... Please use -mapped, -paired, -reference, or -tag TAG to specifiy split behavior." << endl;
180     return false;
181 }    
182
183 bool SplitTool::SplitToolPrivate::SplitMapped(void) {
184     
185     // set up splitting data structure
186     map<bool, BamWriter*> outputFiles;
187     map<bool, BamWriter*>::iterator writerIter;
188     
189     // iterate through alignments
190     BamAlignment al;
191     BamWriter* writer;
192     bool isCurrentAlignmentMapped;
193     while ( m_reader.GetNextAlignment(al) ) {
194       
195         // see if bool value exists
196         isCurrentAlignmentMapped = al.IsMapped();
197         writerIter = outputFiles.find(isCurrentAlignmentMapped);
198           
199         // if no writer associated with this value
200         if ( writerIter == outputFiles.end() ) {
201         
202             // open new BamWriter
203             writer = new BamWriter;
204             const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped ? SPLIT_MAPPED_TOKEN : SPLIT_UNMAPPED_TOKEN ) + ".bam";
205             writer->Open(outputFilename, m_header, m_references);
206           
207             // store in map
208             outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) );
209         } 
210         
211         // else grab corresponding writer
212         else writer = (*writerIter).second;
213         
214         // store alignment in proper BAM output file 
215         if ( writer ) 
216             writer->SaveAlignment(al);
217     }
218     
219     // clean up BamWriters 
220     map<bool, BamWriter*>::iterator writerEnd  = outputFiles.end();
221     for ( writerIter = outputFiles.begin() ; writerIter != writerEnd; ++writerIter ) {
222         BamWriter* writer = (*writerIter).second;
223         if ( writer == 0 ) continue;
224         writer->Close();
225         delete writer;
226         writer = 0;
227     }
228     
229     // return success
230     return true;
231 }
232
233 bool SplitTool::SplitToolPrivate::SplitPaired(void) {
234   
235     // set up splitting data structure
236     map<bool, BamWriter*> outputFiles;
237     map<bool, BamWriter*>::iterator writerIter;
238     
239     // iterate through alignments
240     BamAlignment al;
241     BamWriter* writer;
242     bool isCurrentAlignmentPaired;
243     while ( m_reader.GetNextAlignment(al) ) {
244       
245         // see if bool value exists
246         isCurrentAlignmentPaired = al.IsPaired();
247         writerIter = outputFiles.find(isCurrentAlignmentPaired);
248           
249         // if no writer associated with this value
250         if ( writerIter == outputFiles.end() ) {
251         
252             // open new BamWriter
253             writer = new BamWriter;
254             const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired ? SPLIT_PAIRED_TOKEN : SPLIT_SINGLE_TOKEN ) + ".bam";
255             writer->Open(outputFilename, m_header, m_references);
256           
257             // store in map
258             outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) );
259         } 
260         
261         // else grab corresponding writer
262         else writer = (*writerIter).second;
263         
264         // store alignment in proper BAM output file 
265         if ( writer ) 
266             writer->SaveAlignment(al);
267     }
268     
269     // clean up BamWriters 
270     map<bool, BamWriter*>::iterator writerEnd  = outputFiles.end();
271     for ( writerIter = outputFiles.begin() ; writerIter != writerEnd; ++writerIter ) {
272         BamWriter* writer = (*writerIter).second;
273         if (writer == 0 ) continue;
274         writer->Close();
275         delete writer;
276         writer = 0;
277     }
278     
279     // return success
280     return true;  
281 }
282
283 bool SplitTool::SplitToolPrivate::SplitReference(void) {
284   
285     // set up splitting data structure
286     map<int32_t, BamWriter*> outputFiles;
287     map<int32_t, BamWriter*>::iterator writerIter;
288     
289     // iterate through alignments
290     BamAlignment al;
291     BamWriter* writer;
292     int32_t currentRefId;
293     while ( m_reader.GetNextAlignment(al) ) {
294       
295         // see if bool value exists
296         currentRefId = al.RefID;
297         writerIter = outputFiles.find(currentRefId);
298           
299         // if no writer associated with this value
300         if ( writerIter == outputFiles.end() ) {
301         
302             // open new BamWriter
303             writer = new BamWriter;
304             const string refName = m_references.at(currentRefId).RefName;
305             const string outputFilename = m_outputFilenameStub + SPLIT_REFERENCE_TOKEN + refName + ".bam";
306             writer->Open(outputFilename, m_header, m_references);
307           
308             // store in map
309             outputFiles.insert( make_pair(currentRefId, writer) );
310         } 
311         
312         // else grab corresponding writer
313         else writer = (*writerIter).second;
314         
315         // store alignment in proper BAM output file 
316         if ( writer ) 
317             writer->SaveAlignment(al);
318     }
319     
320     // clean up BamWriters 
321     map<int32_t, BamWriter*>::iterator writerEnd  = outputFiles.end();
322     for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
323         BamWriter* writer = (*writerIter).second;
324         if (writer == 0 ) continue;
325         writer->Close();
326         delete writer;
327         writer = 0;
328     }
329     
330     // return success
331     return true;
332 }
333
334 bool SplitTool::SplitToolPrivate::SplitTag(void) {  
335   
336     // iterate through alignments, until we hit TAG
337     BamAlignment al;
338     while ( m_reader.GetNextAlignment(al) ) {
339       
340         // look for tag in this alignment and get tag type
341         char tagType(0);
342         if ( !al.GetTagType(m_settings->TagToSplit, tagType) ) continue;
343         
344         // request split method based on tag type
345         // pass it the current alignment found
346         switch (tagType) {
347           
348             case 'c' :
349             case 's' : 
350             case 'i' :
351                 return SplitTag_Int(al);
352                 
353             case 'C' :
354             case 'S' :
355             case 'I' : 
356                 return SplitTag_UInt(al);
357               
358             case 'f' :
359                 return SplitTag_Real(al);
360             
361             case 'A':
362             case 'Z':
363             case 'H':
364                 return SplitTag_String(al);
365           
366             default:
367                 fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", tagType);
368                 return false;
369         }
370     }
371     
372     // tag not found, but that's not an error - return success
373     return true;
374 }
375
376 bool SplitTool::SplitToolPrivate::SplitTag_Int(BamAlignment& al) {
377   
378     // set up splitting data structure
379     map<int32_t, BamWriter*> outputFiles;
380     map<int32_t, BamWriter*>::iterator writerIter;
381
382     // local variables
383     const string tag = m_settings->TagToSplit;
384     BamWriter* writer;
385     stringstream outputFilenameStream("");
386     int32_t currentValue;
387     
388     // retrieve first alignment tag value
389     if ( al.GetTag(tag, currentValue) ) {
390       
391         // open new BamWriter, save first alignment
392         writer = new BamWriter;
393         outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
394         writer->Open(outputFilenameStream.str(), m_header, m_references);
395         writer->SaveAlignment(al);
396         
397         // store in map
398         outputFiles.insert( make_pair(currentValue, writer) );
399         
400         // reset stream
401         outputFilenameStream.str("");
402     }
403     
404     // iterate through remaining alignments
405     while ( m_reader.GetNextAlignment(al) ) {
406       
407         // skip if this alignment doesn't have TAG 
408         if ( !al.GetTag(tag, currentValue) ) continue;
409         
410         // look up tag value in map
411         writerIter = outputFiles.find(currentValue);
412           
413         // if no writer associated with this value
414         if ( writerIter == outputFiles.end() ) {
415         
416             // open new BamWriter
417             writer = new BamWriter;
418             outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
419             writer->Open(outputFilenameStream.str(), m_header, m_references);
420             
421             // store in map
422             outputFiles.insert( make_pair(currentValue, writer) );
423             
424             // reset stream
425             outputFilenameStream.str("");
426         } 
427         
428         // else grab corresponding writer
429         else writer = (*writerIter).second;
430         
431         // store alignment in proper BAM output file 
432         if ( writer ) 
433             writer->SaveAlignment(al);
434     }
435     
436     // clean up BamWriters 
437     map<int32_t, BamWriter*>::iterator writerEnd  = outputFiles.end();
438     for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
439         BamWriter* writer = (*writerIter).second;
440         if (writer == 0 ) continue;
441         writer->Close();
442         delete writer;
443         writer = 0;
444     }
445     
446     // return success
447     return true;
448 }
449
450 bool SplitTool::SplitToolPrivate::SplitTag_UInt(BamAlignment& al) {
451   
452     // set up splitting data structure
453     map<uint32_t, BamWriter*> outputFiles;
454     map<uint32_t, BamWriter*>::iterator writerIter;
455
456     // local variables
457     const string tag = m_settings->TagToSplit;
458     BamWriter* writer;
459     stringstream outputFilenameStream("");
460     uint32_t currentValue;
461     
462     int alignmentsIgnored = 0;
463     
464     // retrieve first alignment tag value
465     if ( al.GetTag(tag, currentValue) ) {
466       
467         // open new BamWriter, save first alignment
468         writer = new BamWriter;
469         outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
470         writer->Open(outputFilenameStream.str(), m_header, m_references);
471         writer->SaveAlignment(al);
472         
473         // store in map
474         outputFiles.insert( make_pair(currentValue, writer) );
475         
476         // reset stream
477         outputFilenameStream.str("");
478     } else ++alignmentsIgnored;
479     
480     // iterate through remaining alignments
481     while ( m_reader.GetNextAlignment(al) ) {
482       
483         // skip if this alignment doesn't have TAG 
484         if ( !al.GetTag(tag, currentValue) ) { ++alignmentsIgnored; continue; }
485         
486         // look up tag value in map
487         writerIter = outputFiles.find(currentValue);
488           
489         // if no writer associated with this value
490         if ( writerIter == outputFiles.end() ) {
491         
492             // open new BamWriter
493             writer = new BamWriter;
494             outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
495             writer->Open(outputFilenameStream.str(), m_header, m_references);
496             
497             // store in map
498             outputFiles.insert( make_pair(currentValue, writer) );
499             
500             // reset stream
501             outputFilenameStream.str("");
502         } 
503         
504         // else grab corresponding writer
505         else writer = (*writerIter).second;
506         
507         // store alignment in proper BAM output file 
508         if ( writer ) 
509             writer->SaveAlignment(al);
510     }
511     
512     // clean up BamWriters 
513     map<uint32_t, BamWriter*>::iterator writerEnd  = outputFiles.end();
514     for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
515         BamWriter* writer = (*writerIter).second;
516         if (writer == 0 ) continue;
517         writer->Close();
518         delete writer;
519         writer = 0;
520     }
521     
522     // return success
523     return true;
524 }
525
526 bool SplitTool::SplitToolPrivate::SplitTag_Real(BamAlignment& al) {
527
528      // set up splitting data structure
529     map<float, BamWriter*> outputFiles;
530     map<float, BamWriter*>::iterator writerIter;
531
532     // local variables
533     const string tag = m_settings->TagToSplit;
534     BamWriter* writer;
535     stringstream outputFilenameStream("");
536     float currentValue;
537     
538     // retrieve first alignment tag value
539     if ( al.GetTag(tag, currentValue) ) {
540       
541         // open new BamWriter, save first alignment
542         writer = new BamWriter;
543         outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
544         writer->Open(outputFilenameStream.str(), m_header, m_references);
545         writer->SaveAlignment(al);
546         
547         // store in map
548         outputFiles.insert( make_pair(currentValue, writer) );
549         
550         // reset stream
551         outputFilenameStream.str("");
552     }
553     
554     // iterate through remaining alignments
555     while ( m_reader.GetNextAlignment(al) ) {
556       
557         // skip if this alignment doesn't have TAG 
558         if ( !al.GetTag(tag, currentValue) ) continue;
559         
560         // look up tag value in map
561         writerIter = outputFiles.find(currentValue);
562           
563         // if no writer associated with this value
564         if ( writerIter == outputFiles.end() ) {
565         
566             // open new BamWriter
567             writer = new BamWriter;
568             outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
569             writer->Open(outputFilenameStream.str(), m_header, m_references);
570             
571             // store in map
572             outputFiles.insert( make_pair(currentValue, writer) );
573             
574             // reset stream
575             outputFilenameStream.str("");
576         } 
577         
578         // else grab corresponding writer
579         else writer = (*writerIter).second;
580         
581         // store alignment in proper BAM output file 
582         if ( writer ) 
583             writer->SaveAlignment(al);
584     }
585     
586     // clean up BamWriters 
587     map<float, BamWriter*>::iterator writerEnd  = outputFiles.end();
588     for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
589         BamWriter* writer = (*writerIter).second;
590         if (writer == 0 ) continue;
591         writer->Close();
592         delete writer;
593         writer = 0;
594     }
595     
596     // return success
597     return true;
598 }
599
600 bool SplitTool::SplitToolPrivate::SplitTag_String(BamAlignment& al) {
601   
602      // set up splitting data structure
603     map<string, BamWriter*> outputFiles;
604     map<string, BamWriter*>::iterator writerIter;
605
606     // local variables
607     const string tag = m_settings->TagToSplit;
608     BamWriter* writer;
609     stringstream outputFilenameStream("");
610     string currentValue;
611     
612     // retrieve first alignment tag value
613     if ( al.GetTag(tag, currentValue) ) {
614       
615         // open new BamWriter, save first alignment
616         writer = new BamWriter;
617         outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
618         writer->Open(outputFilenameStream.str(), m_header, m_references);
619         writer->SaveAlignment(al);
620         
621         // store in map
622         outputFiles.insert( make_pair(currentValue, writer) );
623         
624         // reset stream
625         outputFilenameStream.str("");
626     }
627     
628     // iterate through remaining alignments
629     while ( m_reader.GetNextAlignment(al) ) {
630       
631         // skip if this alignment doesn't have TAG 
632         if ( !al.GetTag(tag, currentValue) ) continue;
633         
634         // look up tag value in map
635         writerIter = outputFiles.find(currentValue);
636           
637         // if no writer associated with this value
638         if ( writerIter == outputFiles.end() ) {
639         
640             // open new BamWriter
641             writer = new BamWriter;
642             outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
643             writer->Open(outputFilenameStream.str(), m_header, m_references);
644             
645             // store in map
646             outputFiles.insert( make_pair(currentValue, writer) );
647             
648             // reset stream
649             outputFilenameStream.str("");
650         } 
651         
652         // else grab corresponding writer
653         else writer = (*writerIter).second;
654         
655         // store alignment in proper BAM output file 
656         if ( writer ) 
657             writer->SaveAlignment(al);
658     }
659     
660     // clean up BamWriters 
661     map<string, BamWriter*>::iterator writerEnd  = outputFiles.end();
662     for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
663         BamWriter* writer = (*writerIter).second;
664         if (writer == 0 ) continue;
665         writer->Close();
666         delete writer;
667         writer = 0;
668     }
669     
670     // return success
671     return true;
672 }
673
674 // ---------------------------------------------
675 // SplitTool implementation
676
677 SplitTool::SplitTool(void)
678     : AbstractTool()
679     , m_settings(new SplitSettings)
680     , m_impl(0)
681 {
682     // set program details
683     Options::SetProgramInfo("bamtools split", "splits a BAM file on user-specified property, creating a new BAM output file for each value found", "[-in <filename>] [-stub <filename>] < -mapped | -paired | -reference | -tag <TAG> > ");
684     
685     // set up options 
686     OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
687     Options::AddValueOption("-in",  "BAM filename", "the input BAM file",  "", m_settings->HasInputFilename,  m_settings->InputFilename,  IO_Opts, Options::StandardIn());
688     Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts);
689     
690     OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options");
691     Options::AddOption("-mapped",    "split mapped/unmapped alignments",       m_settings->IsSplittingMapped,    SplitOpts);
692     Options::AddOption("-paired",    "split single-end/paired-end alignments", m_settings->IsSplittingPaired,    SplitOpts);
693     Options::AddOption("-reference", "split alignments by reference",          m_settings->IsSplittingReference, SplitOpts);
694     Options::AddValueOption("-tag", "tag name", "splits alignments based on all values of TAG encountered (i.e. -tag RG creates a BAM file for each read group in original BAM file)", "", 
695                             m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts);
696 }
697
698 SplitTool::~SplitTool(void) {
699     
700     delete m_settings;
701     m_settings = 0;
702     
703     delete m_impl;
704     m_impl = 0;
705 }
706
707 int SplitTool::Help(void) {
708     Options::DisplayHelp();
709     return 0;
710 }
711
712 int SplitTool::Run(int argc, char* argv[]) {
713   
714     // parse command line arguments
715     Options::Parse(argc, argv, 1);
716     
717     // initialize internal implementation
718     m_impl = new SplitToolPrivate(m_settings);
719     
720     // run tool, return success/fail
721     if ( m_impl->Run() ) 
722         return 0;
723     else 
724         return 1;
725 }