]> git.donarmstrong.com Git - bamtools.git/commitdiff
Implemented CountTool, cleaned up MergeTool.
authorDerek <derekwbarnett@gmail.com>
Wed, 2 Jun 2010 18:13:24 +0000 (14:13 -0400)
committerDerek <derekwbarnett@gmail.com>
Wed, 2 Jun 2010 18:13:24 +0000 (14:13 -0400)
bamtools_count.cpp
bamtools_merge.cpp
bamtools_utilities.cpp
bamtools_utilities.h

index 1560e1c5d4bdc5c79b973b9f5fef6c72e145cfab..47e9965fc4f4c251b82b902bbca907ef71f6bfc8 100644 (file)
@@ -12,6 +12,7 @@
 // ***************************************************************************
 
 #include <iostream>
+#include <sstream>
 #include <string>
 #include <vector>
 
@@ -29,16 +30,19 @@ using namespace BamTools;
 struct CountTool::CountSettings {
 
     // flags
+    bool HasBamIndexFilename;
     bool HasInputBamFilename;
     bool HasRegion;
 
     // filenames
-    std::string InputBamFilename;
-    std::string Region;
+    string BamIndexFilename;
+    string InputBamFilename;
+    string Region;
     
     // constructor
     CountSettings(void)
-        : HasInputBamFilename(false)
+        : HasBamIndexFilename(false)
+        , HasInputBamFilename(false)
         , HasRegion(false)
         , InputBamFilename(Options::StandardIn())
     { }  
@@ -52,14 +56,15 @@ CountTool::CountTool(void)
     , m_settings(new CountSettings)
 { 
     // set program details
-    Options::SetProgramInfo("bamtools count", "prints alignment counts for a BAM file", "-in <filename> ");
+    Options::SetProgramInfo("bamtools count", "prints alignment counts for a BAM file", "-in <filename> [-region REGION -index <filename>]");
     
     // set up options 
     OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
     Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn());
+    Options::AddValueOption("-index", "BAM index filename", "the BAM index file", "", m_settings->HasBamIndexFilename, m_settings->BamIndexFilename, IO_Opts);
     
     OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
-    Options::AddValueOption("-region", "REGION", "genomic region. See README for more details", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
+    Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for optimal performance.", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
 }
 
 CountTool::~CountTool(void) { 
@@ -78,31 +83,151 @@ int CountTool::Run(int argc, char* argv[]) {
     Options::Parse(argc, argv, 1);
 
     //open our BAM reader
-//     BamReader reader;
-//     reader.Open(m_settings.InputBamFilename);
-
-    // count alignments
-    string startChrom;
-    string stopChrom;
-    int startPos;
-    int stopPos;
+    BamReader reader;
+    reader.Open(m_settings->InputBamFilename);
+
+    // alignment counter
+    int alignmentCount(0);
+    
+    // set up error handling
+    ostringstream errorStream("");
+    bool foundError(false);
     
+    // if no region specified, count entire file 
     if ( !m_settings->HasRegion ) {
-        cerr << "Counting all alignments " << endl;
+        BamAlignment al;
+        while ( reader.GetNextAlignment(al) ) 
+            ++alignmentCount;
     } else {
+        
+        // parse region string for desired region
+        string startChrom;
+        string stopChrom;
+        int startPos;
+        int stopPos;
         if ( Utilities::ParseRegionString(m_settings->Region, startChrom, startPos, stopChrom, stopPos) ) {
-            cerr << "Counting only alignments in region " << m_settings->Region << endl;
-            cerr << "StartChrom: " << startChrom << endl;
-            cerr << "StartPos:   " << startPos << endl;
-            cerr << "StopChrom:  " << stopChrom << endl;
-            cerr << "StopPos:    " << stopPos << endl;
+
+            const RefVector references = reader.GetReferenceData();
+
+            // -------------------------------
+            // validate start ref & position
+            
+            int startRefID = reader.GetReferenceID(startChrom);
+            cout << "startRefID: " << startRefID << endl;
+            
+            // startRefID not found
+            if ( startRefID == (int)references.size() ) {
+                foundError = true;
+                errorStream << "Start chrom: " << startChrom << " not found in BAM file." << endl;
+            } 
+            
+            // valid startRefID, check startPos
+            else {
+                const RefData& reference = references.at(startRefID);
+                
+                // startPos too large
+                if ( startPos > reference.RefLength ) {
+                    foundError = true;
+                    errorStream << "Start pos: " << startPos << " is larger than expected." << endl;
+                }
+            }
+            
+            // -------------------------------
+            // validate stop ref & position
+            
+            int stopRefID = reader.GetReferenceID(stopChrom);
+
+            // skip if error already found
+            if ( !foundError ) { 
+              
+                // stopRefID not found
+                if ( stopRefID == (int)references.size() ) {
+                    foundError = true;
+                    errorStream << "Stop chrom: " << stopChrom << " not found in BAM file." << endl;
+                } 
+                
+                // valid stopRefID, check stopPos
+                else {
+                    const RefData& reference = references.at(stopRefID);
+                    
+                    // stopPos too large
+                    if ( stopPos > reference.RefLength ) {
+                        foundError = true;
+                        errorStream << "Stop pos: " << stopPos << " is larger than expected." << endl;
+                    } 
+                    
+                    // no stopPos specified, set to reference end
+                    else if ( stopPos == -1 ) {
+                        stopPos = reference.RefLength;
+                    } 
+                }
+            }
+
+            // -------------------------------
+            // if refs & positions valid, retrieve data
+            
+            if ( !foundError ) {
+
+                // has index, we can jump directly to 
+                if ( m_settings->HasBamIndexFilename ) {
+                  
+                    // this is kind of a hack...?
+                    // re-open BamReader, this time with the index file
+                    reader.Close();
+                    reader.Open(m_settings->InputBamFilename, m_settings->BamIndexFilename);
+                  
+                    // attempt Jump(), catch error
+                    if ( !reader.Jump(startRefID, startPos) ) {
+                        foundError = true;
+                        errorStream << "Could not jump to desired REGION: " << m_settings->Region << endl;
+                    }
+                }
+                
+                else {
+                  
+                    // read through sequentially, until first overlapping read is found
+                    BamAlignment al;
+                    bool alignmentFound(false);
+                    while( reader.GetNextAlignment(al) ) {
+                        if ( (al.RefID == startRefID ) && ( (al.Position + al.Length) >= startPos) ) {
+                            alignmentFound = true;
+                            break;
+                        }
+                    }
+                    
+                    // if overlapping alignment found (not EOF), increment counter
+                    if ( alignmentFound) ++ alignmentCount; 
+                }
+                
+                // -----------------------------
+                // count alignments until 
+                
+                if ( !foundError ) {
+                    // while valid alignment AND
+                    // ( either current ref is before stopRefID OR
+                    //   current ref stopRefID but we're before stopPos )
+                    BamAlignment al;
+                    while ( reader.GetNextAlignment(al) && ((al.RefID < stopRefID ) || (al.RefID == stopRefID && al.Position <= stopPos)) )
+                        ++alignmentCount;
+                }
+            }
+        } 
+        
+        // could not parse REGION string, set error
+        else {
+            foundError = true;
+            errorStream << "Could not parse desired REGION: " << m_settings->Region << endl;
+            errorStream << "Please see README for details on accepted REGION formats" << endl;
         }
     }
      
-    cerr << " from " << m_settings->InputBamFilename << endl;
-    cerr << "FEATURE NOT YET IMPLEMENTED!" << endl;
-
+    // print errors OR results 
+    if ( foundError )
+        cerr << errorStream.str() << endl;
+    else
+        cout << alignmentCount << endl;
+    
     // clean & exit
-//     reader.Close();
-    return 0;
+    reader.Close();
+    return (int)foundError;
 }
\ No newline at end of file
index a6e79c825de21cbd7560670a14eec41cfdddbc7c..402d3773d85696a376d449612dc948cb31663911 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 26 May 2010
+// Last modified: 2 June 2010
 // ---------------------------------------------------------------------------
 // Merges multiple BAM files into one.
 //
@@ -32,20 +32,20 @@ struct MergeTool::MergeSettings {
     // flags
     bool HasInputBamFilename;
     bool HasOutputBamFilename;
-    bool HasRegion;
+//     bool HasRegion;
     
     // filenames
     vector<string> InputFiles;
     
     // other parameters
     string OutputFilename;
-    string Region;
+//     string Region;
     
     // constructor
     MergeSettings(void)
         : HasInputBamFilename(false)
         , HasOutputBamFilename(false)
-        , HasRegion(false)
+//         , HasRegion(false)
         , OutputFilename(Options::StandardOut())
     { }
 };  
@@ -58,15 +58,15 @@ MergeTool::MergeTool(void)
     , m_settings(new MergeSettings)
 {
     // set program details
-    Options::SetProgramInfo("bamtools merge", "merges multiple BAM files into one", "[-in <filename> ...] [-region REGION] [-out <filename>]");
+    Options::SetProgramInfo("bamtools merge", "merges multiple BAM files into one", "[-in <filename> -in <filename> ...] [-out <filename>]");
     
     // set up options 
     OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
-    Options::AddValueOption("-in",  "BAM filename", "the input BAM file",  "", m_settings->HasInputBamFilename,  m_settings->InputFiles,     IO_Opts);
-    Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts);
+    Options::AddValueOption("-in",  "BAM filename", "the input BAM file(s)",  "", m_settings->HasInputBamFilename,  m_settings->InputFiles,     IO_Opts);
+    Options::AddValueOption("-out", "BAM filename", "the output BAM file",    "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts);
     
-    OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
-    Options::AddValueOption("-region", "REGION", "genomic region. See README for more details", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
+//     OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
+//     Options::AddValueOption("-region", "REGION", "genomic region. See README for more details", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
 }
 
 MergeTool::~MergeTool(void) {
@@ -87,35 +87,26 @@ int MergeTool::Run(int argc, char* argv[]) {
      // set to default input if none provided
     if ( !m_settings->HasInputBamFilename ) m_settings->InputFiles.push_back(Options::StandardIn());
     
-//     // opens the BAM files without checking for indexes
-//     BamMultiReader reader;
-//     reader.Open(m_settings->InputFiles, false); 
-// 
-//     // retrieve header & reference dictionary info
-//     std::string mergedHeader = reader.GetHeaderText();
-//     RefVector references = reader.GetReferenceData();
-// 
-//     // open BamWriter
-//     BamWriter writer;
-//     writer.Open(m_settings->OutputFilename, mergedHeader, references);
-// 
-//     // if desired region provided
-//     if ( m_settings->HasRegion ) {
-//         // parse region string
-//         // only get alignments from this region
-//     } 
-//     
-//     // else get all alignments
-//     else {
-//         // store alignments to output file
-//         BamAlignment bAlignment;
-//         while (reader.GetNextAlignment(bAlignment)) {
-//             writer.SaveAlignment(bAlignment);
-//         }
-//     }
-//
-//     // clean & exit
-//     reader.Close();
-//     writer.Close();
+    // opens the BAM files without checking for indexes
+    BamMultiReader reader;
+    reader.Open(m_settings->InputFiles, false); 
+
+    // retrieve header & reference dictionary info
+    std::string mergedHeader = reader.GetHeaderText();
+    RefVector references = reader.GetReferenceData();
+
+    // open BamWriter
+    BamWriter writer;
+    writer.Open(m_settings->OutputFilename, mergedHeader, references);
+
+    // store alignments to output file
+    BamAlignment bAlignment;
+    while (reader.GetNextAlignment(bAlignment)) {
+        writer.SaveAlignment(bAlignment);
+    }
+
+    // clean & exit
+    reader.Close();
+    writer.Close();
     return 0;  
 }
index 8eea9a6ee6ffdd1b98e50c32737312a2f49a38e6..b5440df12aa4164169f6f7b942894ca6f2fe6ac2 100644 (file)
@@ -35,8 +35,8 @@ bool Utilities::ParseRegionString(const string& regionString, string& startChrom
     // use BamReader methods to check if its valid for current BAM file
     if ( foundFirstColon == string::npos ) {
         startChrom = regionString;
-        startPos   = -1;                 // ** not sure about these defaults (should stopChrom == startChrom if same?)
-        stopChrom  = "";
+        startPos   = 0;
+        stopChrom  = regionString;
         stopPos    = -1;
         return true;
     }
@@ -55,7 +55,7 @@ bool Utilities::ParseRegionString(const string& regionString, string& startChrom
         // store contents before colon as startChrom, after as startPos
         if ( foundRangeDots == string::npos ) {
             startPos   = atoi( regionString.substr(foundFirstColon+1).c_str() ); 
-            stopChrom  = "";
+            stopChrom  = startChrom;
             stopPos    = -1;
             return true;
         } 
@@ -72,7 +72,7 @@ bool Utilities::ParseRegionString(const string& regionString, string& startChrom
             // no second colon found
             // so we have a "standard" chrom:start..stop input format (on single chrom)
             if ( foundSecondColon == string::npos ) {
-                stopChrom  = "";
+                stopChrom  = startChrom;
                 stopPos    = atoi( regionString.substr(foundRangeDots+2).c_str() );
                 return true;
             }
@@ -80,7 +80,7 @@ bool Utilities::ParseRegionString(const string& regionString, string& startChrom
             // second colon found
             // so we have a range requested across 2 chrom's
             else {
-                stopChrom  = regionString.substr(foundRangeDots+2, regionString.length()-foundSecondColon-1);
+                stopChrom  = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
                 stopPos    = atoi( regionString.substr(foundSecondColon+1).c_str() );
                 return true;
             }
index 79462c7be3ac428c15b51761f1c06d7bc7910c5f..8a5d36c6c1e1e5b275055939406ed7e8412ea5c9 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 27 May 2010
+// Last modified: 2 June 2010
 // ---------------------------------------------------------------------------
 // Provides general utilities used by BamTools sub-tools.
 // ***************************************************************************
 #ifndef BAMTOOLS_UTILITIES_H
 #define BAMTOOLS_UTILITIES_H
 
-#include <cstdlib>
-#include <iostream>
 #include <string>
 
 namespace BamTools {
 
-// Parses a REGION string, stores in (startChrom, startPos, stopChrom, stopPos) variables
-// Returns successful parse (true/false)
-static inline
-bool ParseRegionString(const std::string& regionString, std::string& startChrom, int& startPos, std::string& stopChrom, int& stopPos) {
-    
-    // shouldn't call this function with empty string but worth checking 
-    // checked first for clarity purposes later on, since we can assume at least some content in the string
-    if ( regionString.empty() ) { 
-        std::cerr << "Empty REGION. Usual format (e.g. chr2:1000..2000). See README for more detailed uses." << std::endl;
-        return false; 
-    }
+class BamReader;  
   
-    // non-empty string, look for a colom
-    size_t foundFirstColon = regionString.find(':');
-    
-    // no colon found
-    // going to use entire contents of requested chromosome 
-    // just store entire region string as startChrom name
-    // use BamReader methods to check if its valid for current BAM file
-    if ( foundFirstColon == std::string::npos ) {
-        startChrom = regionString;
-        startPos   = -1;                                                        // ** not sure about these defaults (should stopChrom == startChrom if same?)
-        stopChrom  = "";
-        stopPos    = -1;
-        return true;
-    }
-    
-    // colon found, so we at least have some sort of startPos requested
-    else {
-      
-        // store start chrom from beginning to first colon
-        startChrom = regionString.substr(0,foundFirstColon);
-        
-        // look for ".." after the colon
-        size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
-        
-        // no dots found
-        // so we have a startPos but no range
-        // store contents before colon as startChrom, after as startPos
-        if ( foundRangeDots == std::string::npos ) {
-            startPos   = atoi( regionString.substr(foundFirstColon+1).c_str() ); 
-            stopChrom  = "";
-            stopPos    = -1;
-            return true;
-        } 
-        
-        // ".." found, so we have some sort of range selected
-        else {
-          
-            // store startPos between first colon and range dots ".."
-            startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
-          
-            // look for second colon
-            size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
-            
-            // no second colon found
-            // so we have a "standard" chrom:start..stop input format (on single chrom)
-            if ( foundSecondColon == std::string::npos ) {
-                stopChrom  = "";
-                stopPos    = atoi( regionString.substr(foundRangeDots+2).c_str() );
-                return true;
-            }
-            
-            // second colon found
-            // so we have a range requested across 2 chrom's
-            else {
-                stopChrom  = regionString.substr(foundRangeDots+2, regionString.length()-foundSecondColon-1);
-                stopPos    = atoi( regionString.substr(foundSecondColon+1).c_str() );
-                return true;
-            }
-        }
-    }
+class Utilities {
   
-    // shouldn't get here - any code path that does?
-    // if not, what does true/false really signify?
-    return false;
-}
+    public:
+        // Parses a REGION string, stores in (startChrom, startPos, stopChrom, stopPos) variables
+        // Returns successful parse (true/false)
+        static bool ParseRegionString(const std::string& regionString,
+                                      std::string& startChrom,
+                                      int& startPos,
+                                      std::string& stopChrom,
+                                      int& stopPos); 
+};
 
 } // namespace BamTools