]> git.donarmstrong.com Git - bamtools.git/commitdiff
Begun basic pileup conversion
authorDerek <derekwbarnett@gmail.com>
Tue, 13 Jul 2010 01:53:57 +0000 (21:53 -0400)
committerDerek <derekwbarnett@gmail.com>
Tue, 13 Jul 2010 01:53:57 +0000 (21:53 -0400)
bamtools_convert.cpp
bamtools_pileup.cpp [new file with mode: 0644]
bamtools_pileup.h [new file with mode: 0644]

index d3955636e332c1578bea8f95133e5ba385499a27..771feb2e839503777b9bf73572ee8e0a8a473225 100644 (file)
@@ -16,6 +16,7 @@
 
 #include "bamtools_convert.h"
 #include "bamtools_options.h"
+#include "bamtools_pileup.h"
 #include "bamtools_utilities.h"
 #include "BGZF.h"
 #include "BamReader.h"
@@ -33,6 +34,7 @@ namespace BamTools {
     static const string FORMAT_FASTQ    = "fastq";
     static const string FORMAT_JSON     = "json";
     static const string FORMAT_SAM      = "sam";
+    static const string FORMAT_PILEUP   = "pileup";
     static const string FORMAT_WIGGLE   = "wig";
 
     // other constants
@@ -79,11 +81,18 @@ struct ConvertTool::ConvertSettings {
     bool HasFormat;
     bool HasRegion;
 
+    // pileup flags
+    bool HasFastaFilename;
+    bool IsPrintingPileupMapQualities;
+    
     // options
     vector<string> InputFiles;
     string OutputFilename;
     string Format;
     string Region;
+    
+    // pileup options
+    string FastaFilename;
 
     // constructor
     ConvertSettings(void)
@@ -91,6 +100,8 @@ struct ConvertTool::ConvertSettings {
         , HasOutputFilename(false)
         , HasFormat(false)
         , HasRegion(false)
+        , HasFastaFilename(false)
+        , IsPrintingPileupMapQualities(false)
         , OutputFilename(Options::StandardOut())
     { } 
 };  
@@ -114,6 +125,10 @@ ConvertTool::ConvertTool(void)
    
     OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
     Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai. See \'bamtools help index\' for more  details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
+    
+    OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options");
+    Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "", m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts, "");
+    Options::AddOption("-mapqual", "print the mapping qualities", m_settings->IsPrintingPileupMapQualities, PileupOpts);
 }
 
 ConvertTool::~ConvertTool(void) {
@@ -155,6 +170,8 @@ ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { }
 
 bool ConvertTool::ConvertToolPrivate::Run(void) {
  
+    bool convertedOk = true;
+  
     // ------------------------------------
     // initialize conversion input/output
         
@@ -162,51 +179,84 @@ bool ConvertTool::ConvertToolPrivate::Run(void) {
     if ( !m_settings->HasInputFilenames ) 
         m_settings->InputFiles.push_back(Options::StandardIn());
     
-    // open files
+    // open input files
     BamMultiReader reader;
     reader.Open(m_settings->InputFiles);
     m_references = reader.GetReferenceData();
 
+    // set region if specified
     BamRegion region;
-    if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
-        if ( !reader.SetRegion(region) ) {
-           cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
+    if ( m_settings->HasRegion ) {
+        if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
+            if ( !reader.SetRegion(region) )
+              cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
         }
     }
         
-    // if an output filename given, open outfile 
+    // if output file given
     ofstream outFile;
-    if ( m_settings->HasOutputFilename ) { 
+    if ( m_settings->HasOutputFilename ) {
+      
+        // open output file stream
         outFile.open(m_settings->OutputFilename.c_str());
-        if (!outFile) { cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; return false; }
-        m_out.rdbuf(outFile.rdbuf()); // set m_out to file's streambuf
+        if ( !outFile ) {
+            cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; 
+            return false; 
+        }
+        
+        // set m_out to file's streambuf
+        m_out.rdbuf(outFile.rdbuf()); 
     }
     
     // ------------------------
-    // determine format type
-    bool formatError = false;
-    bool convertedOk = true;
-    void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0;
-    if      ( m_settings->Format == FORMAT_BED )      pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed;
-    else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph;
-    else if ( m_settings->Format == FORMAT_FASTA )    pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta;
-    else if ( m_settings->Format == FORMAT_FASTQ )    pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq;
-    else if ( m_settings->Format == FORMAT_JSON )     pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson;
-    else if ( m_settings->Format == FORMAT_SAM )      pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam;
-    else if ( m_settings->Format == FORMAT_WIGGLE )   pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle;
-    else { 
-        cerr << "Unrecognized format: " << m_settings->Format << endl;
-        cerr << "Please see help|README (?) for details on supported formats " << endl;
-        formatError = true;
-        convertedOk = false;
+    // pileup is special case
+    if ( m_settings->Format == FORMAT_PILEUP ) {
+        
+        // initialize pileup input/output
+        Pileup pileup(&reader, &m_out);
+        
+        // ---------------------------
+        // configure pileup settings
+        
+        if ( m_settings->HasRegion ) 
+            pileup.SetRegion(region);
+        
+        if ( m_settings->HasFastaFilename ) 
+            pileup.SetFastaFilename(m_settings->FastaFilename);
+        
+        pileup.SetIsPrintingMapQualities( m_settings->IsPrintingPileupMapQualities );
+        
+        // run pileup
+        convertedOk = pileup.Run();
     }
     
-    // ------------------------
-    // do conversion
-    if ( !formatError ) {
-        BamAlignment a;
-        while ( reader.GetNextAlignment(a) ) {
-            (this->*pFunction)(a);
+    // -------------------------------------
+    // else determine 'simpler' format type
+    else {
+    
+        bool formatError = false;
+        void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0;
+        if      ( m_settings->Format == FORMAT_BED )      pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed;
+        else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph;
+        else if ( m_settings->Format == FORMAT_FASTA )    pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta;
+        else if ( m_settings->Format == FORMAT_FASTQ )    pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq;
+        else if ( m_settings->Format == FORMAT_JSON )     pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson;
+        else if ( m_settings->Format == FORMAT_SAM )      pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam;
+        else if ( m_settings->Format == FORMAT_WIGGLE )   pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle;
+        else { 
+            cerr << "Unrecognized format: " << m_settings->Format << endl;
+            cerr << "Please see help|README (?) for details on supported formats " << endl;
+            formatError = true;
+            convertedOk = false;
+        }
+        
+        // ------------------------
+        // do conversion
+        if ( !formatError ) {
+            BamAlignment a;
+            while ( reader.GetNextAlignment(a) ) {
+                (this->*pFunction)(a);
+            }
         }
     }
     
diff --git a/bamtools_pileup.cpp b/bamtools_pileup.cpp
new file mode 100644 (file)
index 0000000..5b84ead
--- /dev/null
@@ -0,0 +1,218 @@
+#include <vector>
+#include "BamMultiReader.h"
+#include "bamtools_pileup.h"
+using namespace std;
+using namespace BamTools;
+
+struct Pileup::PileupPrivate {
+  
+    // ---------------------
+    // data members
+  
+    // IO & settings
+    BamMultiReader* Reader;
+    ostream* OutStream;
+    string FastaFilename;
+    bool IsPrintingMapQualities;
+    BamRegion Region;
+    
+    // parsing data
+    int CurrentId;
+    int CurrentPosition;
+    vector<BamAlignment> CurrentData;
+    RefVector References;
+    
+    // ----------------------
+    // ctor
+    
+    PileupPrivate(BamMultiReader* reader, ostream* outStream)
+        : Reader(reader)
+        , OutStream(outStream)
+        , FastaFilename("")
+        , IsPrintingMapQualities(false)
+    { }
+    
+    // ----------------------
+    // internal methods
+    
+    void PrintCurrentData(void);
+    bool Run(void);
+};
+
+void Pileup::PileupPrivate::PrintCurrentData(void) {
+  
+    // remove any data that ends before CurrentPosition
+    size_t i = 0;
+    while ( i < CurrentData.size() ) {
+        if ( CurrentData[i].GetEndPosition() < CurrentPosition )
+            CurrentData.erase(CurrentData.begin() + i);
+        else
+            ++i;
+    }
+  
+    // if not data remains, return
+    if ( CurrentData.empty() ) return;
+    
+    // initialize empty strings
+    string bases     = "";
+    string baseQuals = "";
+    string mapQuals  = "";
+    
+    // iterate over alignments
+    vector<BamAlignment>::const_iterator dataIter = CurrentData.begin();
+    vector<BamAlignment>::const_iterator dataEnd  = CurrentData.end();
+    for ( ; dataIter != dataEnd; ++dataIter ) {
+        
+        // retrieve alignment
+        const BamAlignment& al = (*dataIter);
+        
+        // determine current base character & store
+        const char base = al.AlignedBases[CurrentPosition -al.Position];
+        if ( al.IsReverseStrand() ) 
+            bases.push_back( tolower(base) );
+        else 
+            bases.push_back( toupper(base) );
+        
+        // determine current base quality & store
+        baseQuals.push_back( al.Qualities[CurrentPosition - al.Position] );
+        
+        // if using mapQuals, determine current mapQual & store
+        if ( IsPrintingMapQualities ) {
+            int mapQuality = (int)(al.MapQuality + 33);
+            if ( mapQuality > 126 ) mapQuality = 126; 
+            mapQuals.push_back((char)mapQuality);
+        }
+    }
+    
+    // print results to OutStream
+    const string& refName = References[CurrentId].RefName;
+    const char refBase = 'N';
+    
+    *OutStream << refName << "\t" << CurrentPosition << "\t" << refBase << "\t" << CurrentData.size() << "\t" << bases << "\t" << baseQuals;
+    if ( IsPrintingMapQualities ) *OutStream << "\t" << mapQuals;
+    *OutStream << endl;
+}
+
+bool Pileup::PileupPrivate::Run(void) {
+  
+    // -----------------------------
+    // validate input & output 
+    
+    if ( !Reader ) {
+        cerr << "Pileup::Run() : Invalid multireader" << endl;
+        return false;
+    }
+    
+    if ( !OutStream) { 
+        cerr << "Pileup::Run() : Invalid output stream" << endl;
+        return false;
+    }
+    
+    References = Reader->GetReferenceData();
+    
+    // -----------------------------
+    // process input data
+    
+    // get first entry    
+    BamAlignment al;
+    if ( !Reader->GetNextAlignment(al) ) {
+        cerr << "Pileup::Run() : Could not read from multireader" << endl;
+        return false;
+    }
+    
+    // set initial markers & store first entry
+    CurrentId = al.RefID;
+    CurrentPosition = al.Position;
+    CurrentData.clear();
+    CurrentData.push_back(al);
+    
+    // iterate over remaining data
+    while ( Reader->GetNextAlignment(al) ) {
+        
+        // if same reference
+        if ( al.RefID == CurrentId ) {
+          
+            // if same position, store and move on
+            if ( al.Position == CurrentPosition )
+                CurrentData.push_back(al);
+            
+            // if less than CurrentPosition - sorting error => ABORT
+            else if ( al.Position < CurrentPosition ) {
+                cerr << "Pileup::Run() : Data not sorted correctly!" << endl;
+                return false;
+            }
+            
+            // else print pileup data until 'catching up' to CurrentPosition
+            else {
+                while ( al.Position > CurrentPosition ) {
+                    PrintCurrentData();
+                    ++CurrentPosition;
+                }
+                CurrentData.push_back(al);
+            }
+        } 
+        
+        // if reference ID less than CurrentID - sorting error => ABORT
+        else if ( al.RefID < CurrentId ) {
+            cerr << "Pileup::Run() : Data not sorted correctly!" << endl;
+            return false;
+        }
+        
+        // else moved forward onto next reference
+        else {
+            
+            // print any remaining pileup data from previous reference
+            while ( !CurrentData.empty() ) {
+                PrintCurrentData();
+                ++CurrentPosition;
+            }
+            
+            // store first entry on this new reference, update markers
+            CurrentData.clear();
+            CurrentData.push_back(al);
+            CurrentId = al.RefID;
+            CurrentPosition = al.Position;
+        }
+    }
+    
+    // ------------------------------------
+    // handle  any remaining data entries
+    
+    while ( !CurrentData.empty() ) {
+        PrintCurrentData();
+        ++CurrentPosition;
+    }
+  
+    // -------------------------
+    // return success
+    
+    return true;
+}
+
+// ----------------------------------------------------------
+// Pileup implementation
+
+Pileup::Pileup(BamMultiReader* reader, ostream* outStream) {
+    d = new PileupPrivate(reader, outStream);
+}
+
+Pileup::~Pileup(void) {
+    delete d;
+    d = 0;
+}
+
+bool Pileup::Run(void) {
+    return d->Run();
+}
+
+void Pileup::SetFastaFilename(const string& filename) {
+    d->FastaFilename = filename;
+}
+
+void Pileup::SetIsPrintingMapQualities(bool ok) {
+    d->IsPrintingMapQualities = ok;
+}
+
+void Pileup::SetRegion(const BamRegion& region) {
+    d->Region = region;
+}
diff --git a/bamtools_pileup.h b/bamtools_pileup.h
new file mode 100644 (file)
index 0000000..b28059c
--- /dev/null
@@ -0,0 +1,31 @@
+#ifndef BAMTOOLS_PILEUP_H
+#define BAMTOOLS_PILEUP_H
+
+#include <iostream>
+#include <string>
+
+namespace BamTools {
+
+class BamMultiReader;
+class BamRegion;
+
+class Pileup {
+  
+    public:
+        Pileup(BamMultiReader* reader, std::ostream* outStream);
+        ~Pileup(void);
+        
+    public:
+        bool Run(void);
+        void SetFastaFilename(const std::string& filename);
+        void SetIsPrintingMapQualities(bool ok);  
+        void SetRegion(const BamRegion& region);
+  
+    private:
+        struct PileupPrivate;
+        PileupPrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_PILEUP_H
\ No newline at end of file