]> git.donarmstrong.com Git - bamtools.git/blobdiff - src/toolkit/bamtools_convert.cpp
Reorganized source tree & build system
[bamtools.git] / src / toolkit / bamtools_convert.cpp
diff --git a/src/toolkit/bamtools_convert.cpp b/src/toolkit/bamtools_convert.cpp
new file mode 100644 (file)
index 0000000..1c5ba48
--- /dev/null
@@ -0,0 +1,619 @@
+// ***************************************************************************
+// bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 July 2010
+// ---------------------------------------------------------------------------
+// Converts between BAM and a number of other formats
+// ***************************************************************************
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "bamtools_convert.h"
+#include "bamtools_options.h"
+#include "bamtools_pileup.h"
+#include "bamtools_utilities.h"
+#include "BGZF.h"
+#include "BamReader.h"
+#include "BamMultiReader.h"
+
+using namespace std;
+using namespace BamTools;
+  
+namespace BamTools { 
+    // format names
+    static const string FORMAT_BED      = "bed";
+    static const string FORMAT_BEDGRAPH = "bedgraph";
+    static const string FORMAT_FASTA    = "fasta";
+    static const string FORMAT_FASTQ    = "fastq";
+    static const string FORMAT_JSON     = "json";
+    static const string FORMAT_SAM      = "sam";
+    static const string FORMAT_PILEUP   = "pileup";
+    static const string FORMAT_WIGGLE   = "wig";
+
+    // other constants
+    static const unsigned int FASTA_LINE_MAX = 50;
+    
+} // namespace BamTools
+  
+struct ConvertTool::ConvertToolPrivate {
+  
+    // ctor & dtor
+    public:
+        ConvertToolPrivate(ConvertTool::ConvertSettings* settings);
+        ~ConvertToolPrivate(void);
+    
+    // interface
+    public:
+        bool Run(void);
+    
+    // internal methods
+    private:
+        void PrintBed(const BamAlignment& a);
+        void PrintBedGraph(const BamAlignment& a);
+        void PrintFasta(const BamAlignment& a);
+        void PrintFastq(const BamAlignment& a);
+        void PrintJson(const BamAlignment& a);
+        void PrintSam(const BamAlignment& a);
+        void PrintWiggle(const BamAlignment& a);
+        
+    // data members
+    private: 
+        ConvertTool::ConvertSettings* m_settings;
+        RefVector m_references;
+        ostream m_out;
+};
+
+// ---------------------------------------------
+// ConvertSettings implementation
+
+struct ConvertTool::ConvertSettings {
+
+    // flags
+    bool HasInput;
+    bool HasOutput;
+    bool HasFormat;
+    bool HasRegion;
+
+    // pileup flags
+    bool HasFastaFilename;
+    bool IsOmittingSamHeader;
+    bool IsPrintingPileupMapQualities;
+    
+    // options
+    vector<string> InputFiles;
+    string OutputFilename;
+    string Format;
+    string Region;
+    
+    // pileup options
+    string FastaFilename;
+
+    // constructor
+    ConvertSettings(void)
+        : HasInput(false)
+        , HasOutput(false)
+        , HasFormat(false)
+        , HasRegion(false)
+        , HasFastaFilename(false)
+        , IsOmittingSamHeader(false)
+        , IsPrintingPileupMapQualities(false)
+        , OutputFilename(Options::StandardOut())
+    { } 
+};  
+
+// ---------------------------------------------
+// ConvertTool implementation
+
+ConvertTool::ConvertTool(void)
+    : AbstractTool()
+    , m_settings(new ConvertSettings)
+    , m_impl(0)
+{
+    // set program details
+    Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats", "-format <FORMAT> [-in <filename> -in <filename> ...] [-out <filename>] [other options]");
+    
+    // set up options 
+    OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption("-in",     "BAM filename", "the input BAM file(s)", "", m_settings->HasInput,   m_settings->InputFiles,     IO_Opts, Options::StandardIn());
+    Options::AddValueOption("-out",    "BAM filename", "the output BAM file",   "", m_settings->HasOutput,  m_settings->OutputFilename, IO_Opts, Options::StandardOut());
+    Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts);
+   
+    OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
+    Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai. See \'bamtools help index\' for more  details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
+    
+    OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options");
+    Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "", m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts, "");
+    Options::AddOption("-mapqual", "print the mapping qualities", m_settings->IsPrintingPileupMapQualities, PileupOpts);
+    
+    OptionGroup* SamOpts = Options::CreateOptionGroup("SAM Options");
+    Options::AddOption("-noheader", "omit the SAM header from output", m_settings->IsOmittingSamHeader, SamOpts);
+}
+
+ConvertTool::~ConvertTool(void) {
+    delete m_settings;
+    m_settings = 0;
+    
+    delete m_impl;
+    m_impl = 0;
+}
+
+int ConvertTool::Help(void) {
+    Options::DisplayHelp();
+    return 0;
+}
+
+int ConvertTool::Run(int argc, char* argv[]) {
+  
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+    
+    // run internal ConvertTool implementation, return success/fail
+    m_impl = new ConvertToolPrivate(m_settings);
+    
+    if ( m_impl->Run() ) 
+        return 0;
+    else 
+        return 1;
+}
+
+// ---------------------------------------------
+// ConvertToolPrivate implementation
+
+ConvertTool::ConvertToolPrivate::ConvertToolPrivate(ConvertTool::ConvertSettings* settings)
+    : m_settings(settings)
+    , m_out(cout.rdbuf()) // default output to cout
+{ }
+
+ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { }
+
+bool ConvertTool::ConvertToolPrivate::Run(void) {
+    bool convertedOk = true;
+  
+    // ------------------------------------
+    // initialize conversion input/output
+        
+    // set to default input if none provided
+    if ( !m_settings->HasInput ) 
+        m_settings->InputFiles.push_back(Options::StandardIn());
+    
+    // open input files
+    BamMultiReader reader;
+    reader.Open(m_settings->InputFiles, false);
+    m_references = reader.GetReferenceData();
+
+    // set region if specified
+    BamRegion region;
+    if ( m_settings->HasRegion ) {
+        if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
+            if ( !reader.SetRegion(region) )
+              cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
+        }
+    }
+        
+    // if output file given
+    ofstream outFile;
+    if ( m_settings->HasOutput ) {
+      
+        // open output file stream
+        outFile.open(m_settings->OutputFilename.c_str());
+        if ( !outFile ) {
+            cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; 
+            return false; 
+        }
+        
+        // set m_out to file's streambuf
+        m_out.rdbuf(outFile.rdbuf()); 
+    }
+    
+    // ------------------------
+    // pileup is special case
+    if ( m_settings->Format == FORMAT_PILEUP ) {
+        
+        // initialize pileup input/output
+        Pileup pileup(&reader, &m_out);
+        
+        // ---------------------------
+        // configure pileup settings
+        
+        if ( m_settings->HasRegion ) 
+            pileup.SetRegion(region);
+        
+        if ( m_settings->HasFastaFilename ) 
+            pileup.SetFastaFilename(m_settings->FastaFilename);
+        
+        pileup.SetIsPrintingMapQualities( m_settings->IsPrintingPileupMapQualities );
+        
+        // run pileup
+        convertedOk = pileup.Run();
+    }
+    
+    // -------------------------------------
+    // else determine 'simpler' format type
+    else {
+    
+        bool formatError = false;
+        void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0;
+        if      ( m_settings->Format == FORMAT_BED )      pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed;
+        else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph;
+        else if ( m_settings->Format == FORMAT_FASTA )    pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta;
+        else if ( m_settings->Format == FORMAT_FASTQ )    pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq;
+        else if ( m_settings->Format == FORMAT_JSON )     pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson;
+        else if ( m_settings->Format == FORMAT_SAM )      pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam;
+        else if ( m_settings->Format == FORMAT_WIGGLE )   pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle;
+        else { 
+            cerr << "Unrecognized format: " << m_settings->Format << endl;
+            cerr << "Please see help|README (?) for details on supported formats " << endl;
+            formatError = true;
+            convertedOk = false;
+        }
+        
+        // if SAM format & not omitting header, print SAM header
+        if ( (m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader ) {
+            string headerText = reader.GetHeaderText();
+            m_out << headerText;
+        }
+        
+        // ------------------------
+        // do conversion
+        if ( !formatError ) {
+            BamAlignment a;
+            while ( reader.GetNextAlignment(a) ) {
+                (this->*pFunction)(a);
+            }
+        }
+    }
+    
+    // ------------------------
+    // clean up & exit
+    reader.Close();
+    if ( m_settings->HasOutput ) outFile.close();
+    return convertedOk;   
+}
+
+// ----------------------------------------------------------
+// Conversion/output methods
+// ----------------------------------------------------------
+
+void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a)      { 
+  
+    // tab-delimited, 0-based half-open 
+    // (e.g. a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) )
+    // <chromName> <chromStart> <chromEnd> <readName> <score> <strand>
+
+    m_out << m_references.at(a.RefID).RefName << "\t"
+          << a.Position << "\t"
+          << a.GetEndPosition() + 1 << "\t"
+          << a.Name << "\t"
+          << a.MapQuality << "\t"
+          << (a.IsReverseStrand() ? "-" : "+") << endl;
+}
+
+void ConvertTool::ConvertToolPrivate::PrintBedGraph(const BamAlignment& a) { 
+    ; 
+}
+
+// print BamAlignment in FASTA format
+// N.B. - uses QueryBases NOT AlignedBases
+void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) { 
+    
+    // >BamAlignment.Name
+    // BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line)
+    // ...
+  
+    // print header
+    m_out << "> " << a.Name << endl;
+    
+    // if sequence fits on single line
+    if ( a.QueryBases.length() <= FASTA_LINE_MAX )
+        m_out << a.QueryBases << endl;
+    
+    // else split over multiple lines
+    else {
+      
+        size_t position = 0;
+        size_t seqLength = a.QueryBases.length();
+        
+        // write subsequences to each line
+        while ( position < (seqLength - FASTA_LINE_MAX) ) {
+            m_out << a.QueryBases.substr(position, FASTA_LINE_MAX) << endl;
+            position += FASTA_LINE_MAX;
+        }
+        
+        // write final subsequence
+        m_out << a.QueryBases.substr(position) << endl;
+    }
+}
+
+// print BamAlignment in FASTQ format
+// N.B. - uses QueryBases NOT AlignedBases
+void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) { 
+  
+    // @BamAlignment.Name
+    // BamAlignment.QueryBases
+    // +
+    // BamAlignment.Qualities
+  
+    m_out << "@" << a.Name << endl
+          << a.QueryBases  << endl
+          << "+"           << endl
+          << a.Qualities   << endl;
+}
+
+// print BamAlignment in JSON format
+void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) {
+  
+    // write name & alignment flag
+    m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\",";
+    
+    // write reference name
+    if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) 
+        m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\",";
+    
+    // write position & map quality
+    m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ",";
+    
+    // write CIGAR
+    const vector<CigarOp>& cigarData = a.CigarData;
+    if ( !cigarData.empty() ) {
+        m_out << "\"cigar\":[";
+        vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
+        vector<CigarOp>::const_iterator cigarIter = cigarBegin;
+        vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
+        for ( ; cigarIter != cigarEnd; ++cigarIter ) {
+            const CigarOp& op = (*cigarIter);
+            if (cigarIter != cigarBegin) m_out << ",";
+            m_out << "\"" << op.Length << op.Type << "\"";
+        }
+        m_out << "],";
+    }
+    
+    // write mate reference name, mate position, & insert size
+    if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
+        m_out << "\"mate\":{"
+              << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\","
+              << "\"position\":" << a.MatePosition+1
+              << ",\"insertSize\":" << a.InsertSize << "},";
+    }
+    
+    // write sequence
+    if ( !a.QueryBases.empty() ) 
+        m_out << "\"queryBases\":\"" << a.QueryBases << "\",";
+    
+    // write qualities
+    if ( !a.Qualities.empty() ) {
+        string::const_iterator s = a.Qualities.begin();
+        m_out << "\"qualities\":[" << static_cast<short>(*s) - 33;
+        ++s;
+        for (; s != a.Qualities.end(); ++s) {
+            m_out << "," << static_cast<short>(*s) - 33;
+        }
+        m_out << "],";
+    }
+    
+    // write tag data
+    const char* tagData = a.TagData.c_str();
+    const size_t tagDataLength = a.TagData.length();
+    size_t index = 0;
+    if (index < tagDataLength) {
+
+        m_out << "\"tags\":{";
+        
+        while ( index < tagDataLength ) {
+
+            if (index > 0)
+                m_out << ",";
+            
+            // write tag name
+            m_out << "\"" << a.TagData.substr(index, 2) << "\":";
+            index += 2;
+            
+            // get data type
+            char type = a.TagData.at(index);
+            ++index;
+            
+            switch (type) {
+                case('A') : 
+                    m_out << "\"" << tagData[index] << "\"";
+                    ++index; 
+                    break;
+                
+                case('C') : 
+                    m_out << (int)tagData[index]; 
+                    ++index; 
+                    break;
+                
+                case('c') : 
+                    m_out << (int)tagData[index];
+                    ++index; 
+                    break;
+                
+                case('S') : 
+                    m_out << BgzfData::UnpackUnsignedShort(&tagData[index]); 
+                    index += 2; 
+                    break;
+                    
+                case('s') : 
+                    m_out << BgzfData::UnpackSignedShort(&tagData[index]);
+                    index += 2; 
+                    break;
+                
+                case('I') : 
+                    m_out << BgzfData::UnpackUnsignedInt(&tagData[index]);
+                    index += 4; 
+                    break;
+                
+                case('i') : 
+                    m_out << BgzfData::UnpackSignedInt(&tagData[index]);
+                    index += 4; 
+                    break;
+                
+                case('f') : 
+                    m_out << BgzfData::UnpackFloat(&tagData[index]);
+                    index += 4; 
+                    break;
+                
+                case('d') : 
+                    m_out << BgzfData::UnpackDouble(&tagData[index]);
+                    index += 8; 
+                    break;
+                
+                case('Z') :
+                case('H') : 
+                    m_out << "\""; 
+                    while (tagData[index]) {
+                        m_out << tagData[index];
+                        ++index;
+                    }
+                    m_out << "\""; 
+                    ++index; 
+                    break;      
+            }
+            
+            if ( tagData[index] == '\0') 
+                break;
+        }
+
+        m_out << "}";
+    }
+
+    m_out << "}" << endl;
+    
+}
+
+// print BamAlignment in SAM format
+void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) {
+  
+    // tab-delimited
+    // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
+  
+    // write name & alignment flag
+    m_out << a.Name << "\t" << a.AlignmentFlag << "\t";
+    
+    // write reference name
+    if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) 
+        m_out << m_references[a.RefID].RefName << "\t";
+    else 
+        m_out << "*\t";
+    
+    // write position & map quality
+    m_out << a.Position+1 << "\t" << a.MapQuality << "\t";
+    
+    // write CIGAR
+    const vector<CigarOp>& cigarData = a.CigarData;
+    if ( cigarData.empty() ) m_out << "*\t";
+    else {
+        vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
+        vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
+        for ( ; cigarIter != cigarEnd; ++cigarIter ) {
+            const CigarOp& op = (*cigarIter);
+            m_out << op.Length << op.Type;
+        }
+        m_out << "\t";
+    }
+    
+    // write mate reference name, mate position, & insert size
+    if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
+        if ( a.MateRefID == a.RefID ) m_out << "=\t";
+        else m_out << m_references[a.MateRefID].RefName << "\t";
+        m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t";
+    } 
+    else m_out << "*\t0\t0\t";
+    
+    // write sequence
+    if ( a.QueryBases.empty() ) m_out << "*\t";
+    else m_out << a.QueryBases << "\t";
+    
+    // write qualities
+    if ( a.Qualities.empty() ) m_out << "*";
+    else m_out << a.Qualities;
+    
+    // write tag data
+    const char* tagData = a.TagData.c_str();
+    const size_t tagDataLength = a.TagData.length();
+    
+    size_t index = 0;
+    while ( index < tagDataLength ) {
+
+        // write tag name   
+        string tagName = a.TagData.substr(index, 2);
+        m_out << "\t" << tagName << ":";
+        index += 2;
+        
+        // get data type
+        char type = a.TagData.at(index);
+        ++index;
+        switch (type) {
+            case('A') : 
+                m_out << "A:" << tagData[index]; 
+                ++index; 
+                break;
+            
+            case('C') : 
+                m_out << "i:" << (int)tagData[index]; 
+                ++index; 
+                break;
+            
+            case('c') : 
+                m_out << "i:" << (int)tagData[index];
+                ++index; 
+                break;
+            
+            case('S') : 
+                m_out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]);
+                index += 2; 
+                break;
+                
+            case('s') : 
+                m_out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]);
+                index += 2; 
+                break;
+            
+            case('I') :
+                m_out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]);
+                index += 4; 
+                break;
+            
+            case('i') : 
+                m_out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]);
+                index += 4; 
+                break;
+            
+            case('f') : 
+                m_out << "f:" << BgzfData::UnpackFloat(&tagData[index]);
+                index += 4; 
+                break;
+            
+            case('d') : 
+                m_out << "d:" << BgzfData::UnpackDouble(&tagData[index]);
+                index += 8; 
+                break;
+            
+            case('Z') :
+            case('H') : 
+                m_out << type << ":";
+                while (tagData[index]) {
+                    m_out << tagData[index];
+                    ++index;
+                }
+                ++index; 
+                break;      
+        }
+
+        if ( tagData[index] == '\0') 
+            break;
+    }
+
+    m_out << endl;
+}
+
+void ConvertTool::ConvertToolPrivate::PrintWiggle(const BamAlignment& a) { 
+    ; 
+}