Major performance boost to startup & random-access - especially for the

author derek <derekwbarnett@gmail.com>

Tue, 5 Apr 2011 16:43:31 +0000 (12:43 -0400)

committer derek <derekwbarnett@gmail.com>

Tue, 5 Apr 2011 16:43:31 +0000 (12:43 -0400)
author derek <derekwbarnett@gmail.com>
Tue, 5 Apr 2011 16:43:31 +0000 (12:43 -0400)
committer derek <derekwbarnett@gmail.com>
Tue, 5 Apr 2011 16:43:31 +0000 (12:43 -0400)
diff --git a/src/api/BamConstants.h b/src/api/BamConstants.h

index 6a97980b400298593733a370cb91239de8d2c4a6..6a1695a65be26005a89d49a814c4237e11e13f66 100644 (file)
--- a/src/api/BamConstants.h
+++ b/src/api/BamConstants.h
@@ -1,10 +1,20 @@
+// ***************************************************************************
+// BamConstants.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 5 April 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides basic constants for handling BAM files.
+// ***************************************************************************
+
  #ifndef BAM_CONSTANTS_H
  #define BAM_CONSTANTS_H
  
  #include <string>
  
  /*! \namespace BamTools::Constants
-    \brief Contains most of the constants used throughout BamTools.
+    \brief Provides basic constants for handling BAM files.
  */
  
  namespace BamTools {
@@ -43,6 +53,8 @@ const int BAM_CIGAR_SOFTCLIP = 4;
  const int BAM_CIGAR_HARDCLIP = 5;
  const int BAM_CIGAR_PAD      = 6;
  
+// TODO: Add support for 'X' and '=' ops
+
  const char BAM_CIGAR_MATCH_CHAR    = 'M';
  const char BAM_CIGAR_INS_CHAR      = 'I';
  const char BAM_CIGAR_DEL_CHAR      = 'D';
@@ -103,8 +115,8 @@ const int Z_DEFAULT_MEM_LEVEL = 8;
  // BZGF constants
  const int BGZF_BLOCK_HEADER_LENGTH = 18;
  const int BGZF_BLOCK_FOOTER_LENGTH = 8;
-const int BGZF_MAX_BLOCK_SIZE      = 65536;
-const int BGZF_DEFAULT_BLOCK_SIZE  = 65536;
+const int BGZF_MAX_BLOCK_SIZE      = 262144;
+const int BGZF_DEFAULT_BLOCK_SIZE  = 262144;
  
  } // namespace Constants
  } // namespace BamTools
diff --git a/src/api/BamIndex.cpp b/src/api/BamIndex.cpp

deleted file mode 100644 (file)

index 3e5f86e..0000000
--- a/src/api/BamIndex.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-// ***************************************************************************
-// BamIndex.cpp (c) 2009 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 23 March 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides index functionality - both for the default (standardized) BAM 
-// index format (.bai) as well as a BamTools-specific (nonstandard) index 
-// format (.bti).
-// ***************************************************************************
-
-#include <api/BamIndex.h>
-#include <api/BamReader.h>
-#include <api/internal/BamStandardIndex_p.h>
-#include <api/internal/BamToolsIndex_p.h>
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <cstdio>
-#include <cstdlib>
-#include <algorithm>
-#include <iostream>
-#include <map>
-using namespace std;
-
-/*! \class BamTools::BamIndex
-    \brief Provides methods for generating & loading BAM index files.
-
-    This class straddles the line between public API and internal
-    implementation detail. Most client code should never have to use this
-    class directly.
-
-    It is exposed to the public API to allow advanced users to implement
-    their own custom indexing schemes.
-
-    N.B. - Please note that if you wish to derive your own subclass, you are
-           entering waters that are not well-documented at the moment and are
-           likely to be changing soon anyway. Implementing a custom index is
-           technically do-able at the moment, but the learning curve is (at the
-           moment) overly steep. Changes will be coming soon to alleviate some
-           of this headache.
-*/
-
-// ctor
-BamIndex::BamIndex(void)
-    : m_indexStream(0)
-    , m_indexFilename("")
-    , m_cacheMode(BamIndex::LimitedIndexCaching)
-{ }
-
-// dtor
-BamIndex::~BamIndex(void) {
-    if ( IsOpen() ) fclose(m_indexStream);
-    m_indexFilename = "";
-}
-
-// return true if FILE* is open
-bool BamIndex::IsOpen(void) const {
-    return ( m_indexStream != 0 );
-}
-
-// loads existing data from file into memory
-bool BamIndex::Load(const string& filename)  {
-
-    // open index file, abort on error
-    if ( !OpenIndexFile(filename, "rb") ) {
-        fprintf(stderr, "BamIndex ERROR: unable to open the BAM index file %s for reading.\n", filename.c_str());
-        return false;
-    }
-
-    // check magic number
-    if ( !LoadHeader() ) {
-        fclose(m_indexStream);
-        return false;
-    }
-
-    // load reference data (but only keep in memory if full caching requested)
-    bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching );
-    if ( !LoadAllReferences(saveInitialLoad) ) {
-        fclose(m_indexStream);
-        return false;
-    }
-
-    // update index cache based on selected mode
-    UpdateCache();
-
-    // return success
-    return true;
-}
-
-// opens index file for reading/writing, return true if opened OK
-bool BamIndex::OpenIndexFile(const string& filename, const string& mode) {
-
-    // attempt to open file, return false if error
-    m_indexStream = fopen(filename.c_str(), mode.c_str());
-    if ( m_indexStream == 0 )  return false;
-
-    // otherwise save filename & return sucess
-    m_indexFilename = filename;
-    return true;
-}
-
-// rewind index file to beginning of index data, return true if rewound OK
-bool BamIndex::Rewind(void) {
-    return ( fseek64(m_indexStream, DataBeginOffset(), SEEK_SET) == 0 );
-}
-
-// change the index caching behavior
-void BamIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) {
-    if ( mode != m_cacheMode ) {
-        m_cacheMode = mode;
-        UpdateCache();
-    }
-}
-
-// updates in-memory cache of index data, depending on current cache mode
-void BamIndex::UpdateCache(void) {
-
-    // skip if file not open
-    if ( !IsOpen() ) return;
-
-    // reflect requested cache mode behavior
-    switch ( m_cacheMode ) {
-
-        case (BamIndex::FullIndexCaching) :
-            Rewind();
-            LoadAllReferences(true);
-            break;
-
-        case (BamIndex::LimitedIndexCaching) :
-            if ( HasFullDataCache() )
-                KeepOnlyFirstReferenceOffsets();
-            else {
-                ClearAllData();
-                SkipToFirstReference();
-                LoadFirstReference(true);
-            }
-            break;
-
-        case(BamIndex::NoIndexCaching) :
-            ClearAllData();
-            break;
-
-        default :
-            // unreachable
-            ;
-    }
-}
-
-// writes in-memory index data out to file
-bool BamIndex::Write(const string& bamFilename) {
-
-    // open index file for writing
-    string indexFilename = bamFilename + Extension();
-    if ( !OpenIndexFile(indexFilename, "wb") ) {
-        fprintf(stderr, "BamIndex ERROR: could not open file to save index data.\n");
-        return false;
-    }
-
-    // write index header data
-    if ( !WriteHeader() ) {
-        fprintf(stderr, "BamIndex ERROR: there was a problem writing index metadata to the new index file.\n");
-        fflush(m_indexStream);
-        fclose(m_indexStream);
-        exit(1);
-    }
-
-    // write main index data
-    if ( !WriteAllReferences() ) {
-        fprintf(stderr, "BamIndex ERROR: there was a problem writing index data to the new index file.\n");
-        fflush(m_indexStream);
-        fclose(m_indexStream);
-        exit(1);
-    }
-
-    // flush any remaining output
-    fflush(m_indexStream);
-    fclose(m_indexStream);
-
-    // re-open index file for later reading
-    if ( !OpenIndexFile(indexFilename, "rb") ) {
-        fprintf(stderr, "BamIndex ERROR: could not open newly created index file for reading.\n");
-        return false;
-    }
-
-    // save index filename & return success
-    m_indexFilename = indexFilename;
-    return true;
-}
diff --git a/src/api/BamIndex.h b/src/api/BamIndex.h

index 5ba1469382d70b8f4a050082737f977e929ffec7..00a8f0174458f542268944d30aa39997400d0338 100644 (file)
--- a/src/api/BamIndex.h
+++ b/src/api/BamIndex.h
@@ -3,7 +3,7 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 24 February 2011 (DB)
+// Last modified: 5 April 2011 (DB)
  // ---------------------------------------------------------------------------
  // Provides basic BAM index interface
  // ***************************************************************************
@@ -13,20 +13,27 @@
  
  #include <api/api_global.h>
  #include <api/BamAux.h>
-#include <iostream>
  #include <string>
-#include <vector>
  
  namespace BamTools {
  
-class BamReader;
-
  namespace Internal {
      class BamReaderPrivate;
  } // namespace Internal
  
-// --------------------------------------------------  
-// BamIndex base class
+/*! \class BamTools::BamIndex
+    \brief Provides methods for generating & loading BAM index files.
+
+    This class straddles the line between public API and internal
+    implementation detail. Most client code should never have to use this
+    class directly.
+
+    It is exposed to the public API to allow advanced users to implement
+    their own custom indexing schemes.
+
+    More documentation on methods & enums coming soon.
+*/
+
  class API_EXPORT BamIndex {
  
      // enums
@@ -44,75 +51,28 @@ class API_EXPORT BamIndex {
    
      // ctor & dtor
      public:
-        BamIndex(void);
-        virtual ~BamIndex(void);
+        BamIndex(Internal::BamReaderPrivate* reader) : m_reader(reader) { }
+        virtual ~BamIndex(void) { }
          
      // index interface
      public:
-        // creates index data (in-memory) from @reader data
-        virtual bool Build(Internal::BamReaderPrivate* reader) =0;
-        // returns supported file extension
-        virtual const std::string Extension(void) =0;
+        // builds index from associated BAM file & writes out to index file
+        virtual bool Create(void) =0; // creates index file from BAM file
          // returns whether reference has alignments or no
          virtual bool HasAlignments(const int& referenceID) const =0;
-        // attempts to use index data to jump to @region in @reader; returns success/fail
+        // attempts to use index data to jump to @region, returns success/fail
          // a "successful" jump indicates no error, but not whether this region has data
          //   * thus, the method sets a flag to indicate whether there are alignments
          //     available after the jump position
-        virtual bool Jump(Internal::BamReaderPrivate* reader,
-                          const BamTools::BamRegion& region,
-                          bool* hasAlignmentsInRegion) =0;
+        virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) =0;
          // loads existing data from file into memory
-        virtual bool Load(const std::string& filename);
+        virtual bool Load(const std::string& filename) =0;
          // change the index caching behavior
-        virtual void SetCacheMode(const BamIndex::IndexCacheMode& mode);
-        // writes in-memory index data out to file 
-        // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
-        virtual bool Write(const std::string& bamFilename);
-
-    // derived-classes MUST provide implementation
-    protected:
-        // clear all current index offset data in memory
-        virtual void ClearAllData(void) =0;
-        // return file position after header metadata
-        virtual off_t DataBeginOffset(void) const =0;
-        // return true if all index data is cached
-        virtual bool HasFullDataCache(void) const =0;
-        // clears index data from all references except the first
-        virtual void KeepOnlyFirstReferenceOffsets(void) =0;
-        // load index data for all references, return true if loaded OK
-        // @saveData - save data in memory if true, just read & discard if false
-        virtual bool LoadAllReferences(bool saveData = true) =0;
-        // load first reference from file, return true if loaded OK
-        // @saveData - save data in memory if true, just read & discard if false
-        virtual bool LoadFirstReference(bool saveData = true) =0;
-        // load header data from index file, return true if loaded OK
-        virtual bool LoadHeader(void) =0;
-        // position file pointer to first reference begin, return true if skipped OK
-        virtual bool SkipToFirstReference(void) =0;
-        // write index reference data
-        virtual bool WriteAllReferences(void) =0;
-        // write index header data
-        virtual bool WriteHeader(void) =0;
-
-    // internal methods (but available to derived classes)
-    protected:
-        // rewind index file to beginning of index data, return true if rewound OK
-        bool Rewind(void);
-
-    private:
-        // return true if FILE* is open
-        bool IsOpen(void) const;
-        // opens index file according to requested mode, return true if opened OK
-        bool OpenIndexFile(const std::string& filename, const std::string& mode);
-        // updates in-memory cache of index data, depending on current cache mode
-        void UpdateCache(void);
+        virtual void SetCacheMode(const BamIndex::IndexCacheMode& mode) =0;
  
      // data members
      protected:
-        FILE* m_indexStream;
-        std::string m_indexFilename;
-        BamIndex::IndexCacheMode  m_cacheMode;
+        Internal::BamReaderPrivate* m_reader; // copy, not ownedprivate:
  };
  
  } // namespace BamTools
diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt

index 57efba25a86228c524918cb03b48b8d6330b580b..a09d9a11ef4c39322edb8a5d7bf0cb94768cc59a 100644 (file)
--- a/src/api/CMakeLists.txt
+++ b/src/api/CMakeLists.txt
@@ -14,7 +14,6 @@ add_definitions( -DBAMTOOLS_API_LIBRARY ) # (for proper exporting of library sym
  # list of all BamTools API source (.cpp) files
  set( BamToolsAPISources
          BamAlignment.cpp
-        BamIndex.cpp
          BamMultiReader.cpp
          BamReader.cpp
          BamWriter.cpp
diff --git a/src/api/internal/BamIndexFactory_p.cpp b/src/api/internal/BamIndexFactory_p.cpp

index dcc11cebc25bcb1237d79731604857ebaf744a6e..69b372bb02a770cbaa3d6c1449c1a73d23ffc1e8 100644 (file)
--- a/src/api/internal/BamIndexFactory_p.cpp
+++ b/src/api/internal/BamIndexFactory_p.cpp
@@ -3,7 +3,7 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 21 March 2011 (DB)
+// Last modified: 5 April 2011 (DB)
  // ---------------------------------------------------------------------------
  // Provides interface for generating BamIndex implementations
  // ***************************************************************************
@@ -24,16 +24,16 @@ const string BamIndexFactory::CreateIndexFilename(const string& bamFilename,
                                                    const BamIndex::IndexType& type)
  {
      switch ( type ) {
-        case ( BamIndex::STANDARD ) : return ( bamFilename + BAI_EXTENSION );
-        case ( BamIndex::BAMTOOLS ) : return ( bamFilename + BTI_EXTENSION );
+        case ( BamIndex::STANDARD ) : return ( bamFilename + BamStandardIndex::Extension() );
+        case ( BamIndex::BAMTOOLS ) : return ( bamFilename + BamToolsIndex::Extension() );
          default :
-            fprintf(stderr, "BamIndexFactory ERROR: unknown index type %u\n", type);
+            cerr << "BamIndexFactory ERROR: unknown index type" << type << endl;
              return string();
      }
  }
  
  // creates a new BamIndex object, depending on extension of @indexFilename
-BamIndex* BamIndexFactory::CreateIndexFromFilename(const string& indexFilename) {
+BamIndex* BamIndexFactory::CreateIndexFromFilename(const string& indexFilename, BamReaderPrivate* reader) {
  
      // if file doesn't exist, return null index
      if ( !BamTools::FileExists(indexFilename) )
@@ -46,18 +46,21 @@ BamIndex* BamIndexFactory::CreateIndexFromFilename(const string& indexFilename)
          return 0;
  
      // create index based on extension
-    if      ( extension == BAI_EXTENSION ) return new BamStandardIndex;
-    else if ( extension == BTI_EXTENSION ) return new BamToolsIndex;
-    else                                   return 0;
+    if      ( extension == BamStandardIndex::Extension() ) return new BamStandardIndex(reader);
+    else if ( extension == BamToolsIndex::Extension()    ) return new BamToolsIndex(reader);
+    else
+        return 0;
  }
  
  // creates a new BamIndex, object of requested @type
-BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type) {
+BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type,
+                                             BamReaderPrivate* reader)
+{
      switch ( type ) {
-        case ( BamIndex::STANDARD ) : return new BamStandardIndex;
-        case ( BamIndex::BAMTOOLS ) : return new BamToolsIndex;
+        case ( BamIndex::STANDARD ) : return new BamStandardIndex(reader);
+        case ( BamIndex::BAMTOOLS ) : return new BamToolsIndex(reader);
          default :
-            fprintf(stderr, "BamIndexFactory ERROR: unknown index type %u\n", type);
+            cerr << "BamIndexFactory ERROR: unknown index type " << type << endl;
              return 0;
      }
  }
diff --git a/src/api/internal/BamIndexFactory_p.h b/src/api/internal/BamIndexFactory_p.h

index 4ef95851c023021d84f7ff30b740b2c9a059937b..f060d2cd4e766179cf81162897727ec06a0236a5 100644 (file)
--- a/src/api/internal/BamIndexFactory_p.h
+++ b/src/api/internal/BamIndexFactory_p.h
@@ -3,7 +3,7 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 26 January 2011 (DB)
+// Last modified: 5 April 2011 (DB)
  // ---------------------------------------------------------------------------
  // Provides interface for generating BamIndex implementations
  // ***************************************************************************
@@ -22,9 +22,11 @@ class BamIndexFactory {
      // static interface methods
      public:
          // creates a new BamIndex object, depending on extension of @indexFilename
-        static BamIndex* CreateIndexFromFilename(const std::string& indexFilename);
+        static BamIndex* CreateIndexFromFilename(const std::string& indexFilename,
+                                                 BamReaderPrivate* reader);
          // creates a new BamIndex object, of requested @type
-        static BamIndex* CreateIndexOfType(const BamIndex::IndexType& type);
+        static BamIndex* CreateIndexOfType(const BamIndex::IndexType& type,
+                                           BamReaderPrivate* reader);
          // returns name of existing index file that corresponds to @bamFilename
          // will defer to @preferredType if possible
          // if @preferredType not found, will attempt to load any supported index type
diff --git a/src/api/internal/BamMultiReader_p.cpp b/src/api/internal/BamMultiReader_p.cpp

index 583085c7cd11782b68337f483113cd8ef702110a..e789d06ee35c93d3945427ec99a4fe544af49930 100644 (file)
--- a/src/api/internal/BamMultiReader_p.cpp
+++ b/src/api/internal/BamMultiReader_p.cpp
@@ -3,7 +3,7 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 21 March 2011 (DB)
+// Last modified: 5 April 2011 (DB)
  // ---------------------------------------------------------------------------
  // Functionality for simultaneously reading multiple BAM files
  // *************************************************************************
@@ -180,6 +180,8 @@ SamHeader BamMultiReaderPrivate::GetHeader(void) const {
  // makes a virtual, unified header for all the bam files in the multireader
  string BamMultiReaderPrivate::GetHeaderText(void) const {
  
+    // TODO: merge SamHeader objects instead of parsing string data (again)
+
      // if only one reader is open
      if ( m_readers.size() == 1 ) {
  
@@ -545,7 +547,7 @@ BamReader* BamMultiReaderPrivate::OpenReader(const std::string& filename) {
  
      // reader could not open
      else {
-        cerr << "BamMultiReader WARNING: Could not open: "
+        cerr << "BamMultiReader WARNING: Could not open "
                << filename << ", ignoring file" << endl;
      }
  
@@ -646,7 +648,7 @@ bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) {
  
          // attempt to set BamReader's region of interest
          if ( !reader->SetRegion(region) ) {
-            cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename() << " to "
+            cerr << "BamMultiReader WARNING: could not jump " << reader->GetFilename() << " to "
                   << region.LeftRefID  << ":" << region.LeftPosition   << ".."
                   << region.RightRefID << ":" << region.RightPosition  << endl;
          }
diff --git a/src/api/internal/BamRandomAccessController_p.cpp b/src/api/internal/BamRandomAccessController_p.cpp

index a7856102c2e61dedcd76796e6553d31492bb3005..89636bbf0ecf46a76b4ec5a63cff085e6b899a04 100644 (file)
--- a/src/api/internal/BamRandomAccessController_p.cpp
+++ b/src/api/internal/BamRandomAccessController_p.cpp
@@ -3,7 +3,7 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 21 March 2011(DB)
+// Last modified: 5 April 2011(DB)
  // ---------------------------------------------------------------------------
  // Manages random access operations in a BAM file
  // **************************************************************************
@@ -151,27 +151,22 @@ bool BamRandomAccessController::CreateIndex(BamReaderPrivate* reader,
          return false;
  
      // create new index of requested type
-    BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type);
+    BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type, reader);
      if ( newIndex == 0 ) {
          cerr << "BamRandomAccessController ERROR: could not create index of type " << type << endl;
          return false;
      }
  
      // attempt to build index from current BamReader file
-    if ( !newIndex->Build(reader) ) {
-        cerr << "BamRandomAccessController ERROR: could not build index on BAM file: " << reader->Filename() << endl;
+    if ( !newIndex->Create() ) {
+        cerr << "BamRandomAccessController ERROR: could not create index for BAM file: "
+             << reader->Filename() << endl;
          return false;
      }
  
      // save new index
      SetIndex(newIndex);
  
-    // attempt to write new index file
-    if ( newIndex->Write(reader->Filename()) ) {
-        cerr << "BamRandomAccessController ERROR: could not save new index for BAM file: " << reader->Filename() << endl;
-        return false;
-    }
-
      // set new index's cache mode & return success
      newIndex->SetCacheMode(m_indexCacheMode);
      return true;
@@ -189,24 +184,28 @@ bool BamRandomAccessController::IndexHasAlignmentsForReference(const int& refId)
      return m_index->HasAlignments(refId);
  }
  
-bool BamRandomAccessController::LocateIndex(const string& bamFilename,
+bool BamRandomAccessController::LocateIndex(BamReaderPrivate* reader,
                                              const BamIndex::IndexType& preferredType)
  {
      // look up index filename, deferring to preferredType if possible
-    const string& indexFilename = BamIndexFactory::FindIndexFilename(bamFilename, preferredType);
+    const string& indexFilename = BamIndexFactory::FindIndexFilename(reader->Filename(), preferredType);
  
      // if no index file found (of any type)
-    if ( indexFilename.empty() )
+    if ( indexFilename.empty() ) {
+        cerr << "BamRandomAccessController WARNING: "
+             << "could not find index file for BAM: "
+             << reader->Filename() << endl;
          return false;
+    }
  
      // otherwise open & use index file that was found
-    return OpenIndex(indexFilename);
+    return OpenIndex(indexFilename, reader);
  }
  
-bool BamRandomAccessController::OpenIndex(const string& indexFilename) {
+bool BamRandomAccessController::OpenIndex(const string& indexFilename, BamReaderPrivate* reader) {
  
      // attempt create new index of type based on filename
-    BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename);
+    BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename, reader);
      if ( index == 0 ) {
          cerr << "BamRandomAccessController ERROR: could not create index for file: " << indexFilename << endl;
          return false;
@@ -270,5 +269,5 @@ bool BamRandomAccessController::SetRegion(BamReaderPrivate* reader,
      //    alignment on a reference. If this occurs, any subsequent calls to GetNextAlignment[Core]
      //    will not return data. BamMultiReader will still be able to successfully pull alignments
      //    from a region from multiple files even if one or more have no data.
-    return m_index->Jump(reader, m_region, &m_hasAlignmentsInRegion);
+    return m_index->Jump(m_region, &m_hasAlignmentsInRegion);
  }
diff --git a/src/api/internal/BamRandomAccessController_p.h b/src/api/internal/BamRandomAccessController_p.h

index e541cdf5a332a295b2b17ec74f1d4948b0504dfa..372ea4a5e9a300dfc24ff2e2f4ea7ccdac92f21a 100644 (file)
--- a/src/api/internal/BamRandomAccessController_p.h
+++ b/src/api/internal/BamRandomAccessController_p.h
@@ -56,8 +56,8 @@ class BamRandomAccessController {
          bool CreateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& type);
          bool HasIndex(void) const;
          bool IndexHasAlignmentsForReference(const int& refId);
-        bool LocateIndex(const std::string& bamFilename, const BamIndex::IndexType& preferredType);
-        bool OpenIndex(const std::string& indexFilename);
+        bool LocateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& preferredType);
+        bool OpenIndex(const std::string& indexFilename, BamReaderPrivate* reader);
          void SetIndex(BamIndex* index);
          void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode);
  
diff --git a/src/api/internal/BamReader_p.cpp b/src/api/internal/BamReader_p.cpp

index 441e8c0d7cc1f0b63c27aaaceb383044c83cfec7..c2afb4ca0b890c0429331823a8033c3aead0e351 100644 (file)
--- a/src/api/internal/BamReader_p.cpp
+++ b/src/api/internal/BamReader_p.cpp
@@ -3,7 +3,7 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 21 March 2011 (DB)
+// Last modified: 5 April 2011 (DB)
  // ---------------------------------------------------------------------------
  // Provides the basic functionality for reading BAM files
  // ***************************************************************************
@@ -295,7 +295,7 @@ bool BamReaderPrivate::LoadReferenceData(void) {
  }
  
  bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) {
-    return m_randomAccessController.LocateIndex(m_filename, preferredType);
+    return m_randomAccessController.LocateIndex(this, preferredType);
  }
  
  // opens BAM file (and index)
@@ -326,7 +326,7 @@ bool BamReaderPrivate::Open(const string& filename) {
  }
  
  bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) {
-    return m_randomAccessController.OpenIndex(indexFilename);
+    return m_randomAccessController.OpenIndex(indexFilename, this);
  }
  
  // returns BAM file pointer to beginning of alignment data
@@ -349,6 +349,10 @@ bool BamReaderPrivate::Rewind(void) {
      return m_stream.Seek(m_alignmentsBeginOffset);
  }
  
+bool BamReaderPrivate::Seek(const int64_t& position) {
+    return m_stream.Seek(position);
+}
+
  void BamReaderPrivate::SetIndex(BamIndex* index) {
      m_randomAccessController.SetIndex(index);
  }
@@ -364,7 +368,6 @@ bool BamReaderPrivate::SetRegion(const BamRegion& region) {
      return m_randomAccessController.SetRegion(this, region, m_references.size());
  }
  
-// returns handle to internal BgzfStream
-BgzfStream* BamReaderPrivate::Stream(void) {
-    return &m_stream;
+int64_t BamReaderPrivate::Tell(void) const {
+    return m_stream.Tell();
  }
diff --git a/src/api/internal/BamReader_p.h b/src/api/internal/BamReader_p.h

index 7dda67f14569a5412a0eff3df9c0867fa9515902..c0d07d88a863d84efa81c96328e70778de892cf9 100644 (file)
--- a/src/api/internal/BamReader_p.h
+++ b/src/api/internal/BamReader_p.h
@@ -3,7 +3,7 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 24 February 2011 (DB)
+// Last modified: 5 April 2011 (DB)
  // ---------------------------------------------------------------------------
  // Provides the basic functionality for reading BAM files
  // ***************************************************************************
@@ -70,11 +70,10 @@ class BamReaderPrivate {
          void SetIndex(BamIndex* index);
          void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode);
  
-    // BamReaderPrivate interface
-    public:
-        BgzfStream* Stream(void);
-
-    // 'internal' methods
+    // internal methods, but available as a BamReaderPrivate 'interface'
+    //
+    // these methods should only be used by BamTools::Internal classes
+    // (currently only used by the BamIndex subclasses)
      public:
          // retrieves header text from BAM file
          bool LoadHeaderData(void);
@@ -83,6 +82,10 @@ class BamReaderPrivate {
          bool LoadNextAlignment(BamAlignment& alignment);
          // builds reference data structure from BAM file
          bool LoadReferenceData(void);
+        // seek reader to file position
+        bool Seek(const int64_t& position);
+        // return reader's file position
+        int64_t Tell(void) const;
  
      // data members
      public:
diff --git a/src/api/internal/BamStandardIndex_p.cpp b/src/api/internal/BamStandardIndex_p.cpp

index cf0e2c1b0a2006d3ab33ebe5414fec45efab0a41..df4145db53bea11c4b2a3ae7103bf3a2d5e992cc 100644 (file)
--- a/src/api/internal/BamStandardIndex_p.cpp
+++ b/src/api/internal/BamStandardIndex_p.cpp
@@ -3,16 +3,14 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 21 March 2011 (DB)
+// Last modified: 5 April 2011 (DB)
  // ---------------------------------------------------------------------------
  // Provides index operations for the standardized BAM index format (".bai")
  // ***************************************************************************
  
  #include <api/BamAlignment.h>
-#include <api/BamReader.h>
  #include <api/internal/BamReader_p.h>
  #include <api/internal/BamStandardIndex_p.h>
-#include <api/internal/BgzfStream_p.h>
  using namespace BamTools;
  using namespace BamTools::Internal;
  
@@ -21,399 +19,464 @@ using namespace BamTools::Internal;
  #include <cstring>
  #include <algorithm>
  #include <iostream>
-#include <map>
  using namespace std;
  
-BamStandardIndex::BamStandardIndex(void)
-    : BamIndex()
-    , m_dataBeginOffset(0)
-    , m_hasFullDataCache(false)
+// static BamStandardIndex constants
+const int BamStandardIndex::MAX_BIN               = 37450;  // =(8^6-1)/7+1
+const int BamStandardIndex::BAM_LIDX_SHIFT        = 14;
+const string BamStandardIndex::BAI_EXTENSION      = ".bai";
+const char* const BamStandardIndex::BAI_MAGIC     = "BAI\1";
+const int BamStandardIndex::SIZEOF_ALIGNMENTCHUNK = sizeof(uint64_t)*2;
+const int BamStandardIndex::SIZEOF_BINCORE        = sizeof(uint32_t) + sizeof(int32_t);
+const int BamStandardIndex::SIZEOF_LINEAROFFSET   = sizeof(uint64_t);
+
+// ctor
+BamStandardIndex::BamStandardIndex(Internal::BamReaderPrivate* reader)
+    : BamIndex(reader)
+    , m_indexStream(0)
+    , m_cacheMode(BamIndex::LimitedIndexCaching)
+    , m_buffer(0)
+    , m_bufferLength(0)
  {
-    m_isBigEndian = BamTools::SystemIsBigEndian();
+     m_isBigEndian = BamTools::SystemIsBigEndian();
  }
  
+// dtor
  BamStandardIndex::~BamStandardIndex(void) {
-    ClearAllData();
+    CloseFile();
  }
  
-// calculate bins that overlap region
-int BamStandardIndex::BinsFromRegion(const BamRegion& region,
-                                     const RefVector& references,
-                                     const bool isRightBoundSpecified,
-                                     uint16_t bins[MAX_BIN])
-{
-    // get region boundaries
-    uint32_t begin = (unsigned int)region.LeftPosition;
-    uint32_t end;
+bool BamStandardIndex::AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end) {
+
+    // retrieve references from reader
+    const RefVector& references = m_reader->GetReferenceData();
+
+    // make sure left-bound position is valid
+    if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
+        return false;
+
+    // set region 'begin'
+    begin = (unsigned int)region.LeftPosition;
  
      // if right bound specified AND left&right bounds are on same reference
-    // OK to use right bound position
-    if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
+    // OK to use right bound position as region 'end'
+    if ( region.isRightBoundSpecified() && ( region.LeftRefID == region.RightRefID ) )
          end = (unsigned int)region.RightPosition;
  
-    // otherwise, use end of left bound reference as cutoff
-    else
-        end = (unsigned int)references.at(region.LeftRefID).RefLength - 1;
+    // otherwise, set region 'end' to last reference base
+    else end = (unsigned int)references.at(region.LeftRefID).RefLength - 1;
  
-    // initialize list, bin '0' always a valid bin
-    int i = 0;
-    bins[i++] = 0;
+    // return success
+    return true;
+}
+
+void BamStandardIndex::CalculateCandidateBins(const uint32_t& begin,
+                                              const uint32_t& end,
+                                              set<uint16_t>& candidateBins)
+{
+    // initialize list, bin '0' is always a valid bin
+    candidateBins.insert(0);
  
      // get rest of bins that contain this region
      unsigned int k;
-    for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { bins[i++] = k; }
-    for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { bins[i++] = k; }
-    for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { bins[i++] = k; }
-    for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { bins[i++] = k; }
-    for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
-
-    // return number of bins stored
-    return i;
+    for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { candidateBins.insert(k); }
+    for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { candidateBins.insert(k); }
+    for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { candidateBins.insert(k); }
+    for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { candidateBins.insert(k); }
+    for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { candidateBins.insert(k); }
  }
  
-// creates index data (in-memory) from @reader data
-bool BamStandardIndex::Build(Internal::BamReaderPrivate* reader) {
-
-    // skip if invalid reader
-    if ( reader == 0 )
-        return false;
-
-    // skip if reader BgzfStream is invalid or not open
-    BgzfStream* bgzfStream = reader->Stream();
-    if ( bgzfStream == 0 || !bgzfStream->IsOpen )
+bool BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refSummary,
+                                                 const uint64_t& minOffset,
+                                                 set<uint16_t>& candidateBins,
+                                                 vector<int64_t>& offsets)
+{
+    // attempt seek to first bin
+    if ( !Seek(refSummary.FirstBinFilePosition, SEEK_SET) )
          return false;
  
-    // move reader's file pointer to beginning of alignments
-    reader->Rewind();
-
-    // get reference count, reserve index space
-    const int numReferences = reader->GetReferenceCount();
-    m_indexData.clear();
-    m_hasFullDataCache = false;
-    SetReferenceCount(numReferences);
-
-    // sets default constant for bin, ID, offset, coordinate variables
-    const uint32_t defaultValue = 0xffffffffu;
-
-    // bin data
-    uint32_t saveBin(defaultValue);
-    uint32_t lastBin(defaultValue);
+    // iterate over reference bins
+    uint32_t binId;
+    int32_t numAlignmentChunks;
+    set<uint16_t>::iterator candidateBinIter;
+    for ( int i = 0; i < refSummary.NumBins; ++i ) {
  
-    // reference ID data
-    int32_t saveRefID(defaultValue);
-    int32_t lastRefID(defaultValue);
+        // read bin contents (if successful, alignment chunks are now in m_buffer)
+        if ( !ReadBinIntoBuffer(binId, numAlignmentChunks) )
+            return false;
  
-    // offset data
-    uint64_t saveOffset = bgzfStream->Tell();
-    uint64_t lastOffset = saveOffset;
+        // see if bin is a 'candidate bin'
+        candidateBinIter = candidateBins.find(binId);
  
-    // coordinate data
-    int32_t lastCoordinate = defaultValue;
+        // if not, move on to next bin
+        if ( candidateBinIter == candidateBins.end() )
+            continue;
  
-    BamAlignment bAlignment;
-    while ( reader->LoadNextAlignment(bAlignment) ) {
+        // otherwise, check bin's contents against for overlap
+        else {
  
-        // change of chromosome, save ID, reset bin
-        if ( lastRefID != bAlignment.RefID ) {
-            lastRefID = bAlignment.RefID;
-            lastBin   = defaultValue;
-        }
+            unsigned int offset = 0;
+            uint64_t chunkStart;
+            uint64_t chunkStop;
  
-        // if lastCoordinate greater than BAM position - file not sorted properly
-        else if ( lastCoordinate > bAlignment.Position ) {
-            fprintf(stderr, "BamStandardIndex ERROR: file not properly sorted:\n");
-            fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)",
-                    bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);
-            exit(1);
-        }
+            // iterate over alignment chunks
+            for (int j = 0; j < numAlignmentChunks; ++j ) {
  
-        // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
-        if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
-
-            // save linear offset entry (matched to BAM entry refID)
-            BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID);
-            if ( indexIter == m_indexData.end() ) return false; // error
-            ReferenceIndex& refIndex = (*indexIter).second;
-            LinearOffsetVector& offsets = refIndex.Offsets;
-            SaveLinearOffset(offsets, bAlignment, lastOffset);
-        }
+                // read chunk start & stop from buffer
+                memcpy((char*)&chunkStart, m_buffer+offset, sizeof(uint64_t));
+                offset += sizeof(uint64_t);
+                memcpy((char*)&chunkStop, m_buffer, sizeof(uint64_t));
+                offset += sizeof(uint64_t);
  
-        // if current BamAlignment bin != lastBin, "then possibly write the binning index"
-        if ( bAlignment.Bin != lastBin ) {
-
-            // if not first time through
-            if ( saveBin != defaultValue ) {
+                // swap endian-ness if necessary
+                if ( m_isBigEndian ) {
+                    SwapEndian_64(chunkStart);
+                    SwapEndian_64(chunkStop);
+                }
  
-                // save Bam bin entry
-                BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID);
-                if ( indexIter == m_indexData.end() ) return false; // error
-                ReferenceIndex& refIndex = (*indexIter).second;
-                BamBinMap& binMap = refIndex.Bins;
-                SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
+                // store alignment chunk's start offset
+                // if its stop offset is larger than our 'minOffset'
+                if ( chunkStop > minOffset )
+                    offsets.push_back(chunkStart);
              }
  
-            // update saveOffset
-            saveOffset = lastOffset;
-
-            // update bin values
-            saveBin = bAlignment.Bin;
-            lastBin = bAlignment.Bin;
+            // 'pop' bin ID from candidate bins set
+            candidateBins.erase(candidateBinIter);
  
-            // update saveRefID
-            saveRefID = bAlignment.RefID;
-
-            // if invalid RefID, break out
-            if ( saveRefID < 0 ) break;
+            // quit if no more candidates
+            if ( candidateBins.empty() )
+                break;
          }
-
-        // make sure that current file pointer is beyond lastOffset
-        if ( bgzfStream->Tell() <= (int64_t)lastOffset ) {
-            fprintf(stderr, "BamStandardIndex ERROR: could not build index - calculating offsets failed.\n");
-            exit(1);
-        }
-
-        // update lastOffset
-        lastOffset = bgzfStream->Tell();
-
-        // update lastCoordinate
-        lastCoordinate = bAlignment.Position;
-    }
-
-    // save any leftover BAM data (as long as refID is valid)
-    if ( saveRefID >= 0 ) {
-        // save Bam bin entry
-        BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID);
-        if ( indexIter == m_indexData.end() ) return false; // error
-        ReferenceIndex& refIndex = (*indexIter).second;
-        BamBinMap& binMap = refIndex.Bins;
-        SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
      }
  
-    // simplify index by merging chunks
-    MergeChunks();
-
-    // iterate through references in index
-    // sort offsets in linear offset vector
-    BamStandardIndexData::iterator indexIter = m_indexData.begin();
-    BamStandardIndexData::iterator indexEnd  = m_indexData.end();
-    for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
+    // return success/failure on calculating at least 1 offset
+    return ( !offsets.empty() );
+}
  
-        // get reference index data
-        ReferenceIndex& refIndex = (*indexIter).second;
-        LinearOffsetVector& offsets = refIndex.Offsets;
+uint64_t BamStandardIndex::CalculateMinOffset(const BaiReferenceSummary& refSummary,
+                                              const uint32_t& begin)
+{
+    // if no linear offsets exist, return 0
+    if ( refSummary.NumLinearOffsets == 0 )
+        return 0;
+
+    // if 'begin' starts beyond last linear offset, use the last linear offset as minimum
+    // else use the offset corresponding to the requested start position
+    const int shiftedBegin = begin>>BamStandardIndex::BAM_LIDX_SHIFT;
+    if ( shiftedBegin >= refSummary.NumLinearOffsets )
+        return LookupLinearOffset( refSummary, refSummary.NumLinearOffsets-1 );
+    else
+        return LookupLinearOffset( refSummary, shiftedBegin );
+}
  
-        // sort linear offsets
-        sort(offsets.begin(), offsets.end());
+void BamStandardIndex::CheckBufferSize(char*& buffer,
+                                       unsigned int& bufferLength,
+                                       const unsigned int& requestedBytes)
+{
+    try {
+        if ( requestedBytes > bufferLength ) {
+            bufferLength = requestedBytes + 10;
+            delete[] buffer;
+            buffer = new char[bufferLength];
+        }
+    } catch ( std::bad_alloc ) {
+        cerr << "BamStandardIndex ERROR: out of memory when allocating "
+             << requestedBytes << " byes" << endl;
+        exit(1);
      }
+}
  
-    // rewind reader's file pointer to beginning of alignments, return success/fail
-    return reader->Rewind();
+void BamStandardIndex::CheckBufferSize(unsigned char*& buffer,
+                                       unsigned int& bufferLength,
+                                       const unsigned int& requestedBytes)
+{
+    try {
+        if ( requestedBytes > bufferLength ) {
+            bufferLength = requestedBytes + 10;
+            delete[] buffer;
+            buffer = new unsigned char[bufferLength];
+        }
+    } catch ( std::bad_alloc ) {
+        cerr << "BamStandardIndex ERROR: out of memory when allocating "
+             << requestedBytes << " byes" << endl;
+        exit(1);
+    }
  }
  
-// check index file magic number, return true if OK
  bool BamStandardIndex::CheckMagicNumber(void) {
  
-    // read in magic number
+    // check 'magic number' to see if file is BAI index
      char magic[4];
      size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream);
+    if ( elementsRead != 4 ) {
+        cerr << "BamStandardIndex ERROR: could not read format 'magic number'" << endl;
+        return false;
+    }
  
      // compare to expected value
-    if ( strncmp(magic, "BAI\1", 4) != 0 ) {
-        fprintf(stderr, "BamStandardIndex ERROR: could not load index file - invalid magic number.\n");
-        fclose(m_indexStream);
+    if ( strncmp(magic, BamStandardIndex::BAI_MAGIC, 4) != 0 ) {
+        cerr << "BamStandardIndex ERROR: invalid format" << endl;
          return false;
      }
  
-    // return success/failure of load
-    return (elementsRead == 4);
+    // otherwise OK
+    return true;
  }
  
-// clear all current index offset data in memory
-void BamStandardIndex::ClearAllData(void) {
-    BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
-    BamStandardIndexData::const_iterator indexEnd  = m_indexData.end();
-    for ( ; indexIter != indexEnd; ++indexIter ) {
-        const int& refId = (*indexIter).first;
-        ClearReferenceOffsets(refId);
-    }
+void BamStandardIndex::ClearReferenceEntry(BaiReferenceEntry& refEntry) {
+    refEntry.ID = -1;
+    refEntry.Bins.clear();
+    refEntry.LinearOffsets.clear();
  }
  
-// clear all index offset data for desired reference
-void BamStandardIndex::ClearReferenceOffsets(const int& refId) {
+void BamStandardIndex::CloseFile(void) {
  
-    // look up refId, skip if not found
-    BamStandardIndexData::iterator indexIter = m_indexData.find(refId);
-    if ( indexIter == m_indexData.end() ) return ;
+    // close file stream
+    if ( IsFileOpen() )
+        fclose(m_indexStream);
  
-    // clear reference data
-    ReferenceIndex& refEntry = (*indexIter).second;
-    refEntry.Bins.clear();
-    refEntry.Offsets.clear();
+    // clear index file summary data
+    m_indexFileSummary.clear();
  
-    // set flag
-    m_hasFullDataCache = false;
+    // clean up I/O buffer
+    delete[] m_buffer;
+    m_buffer = 0;
+    m_bufferLength = 0;
  }
  
-// return file position after header metadata
-off_t BamStandardIndex::DataBeginOffset(void) const {
-    return m_dataBeginOffset;
-}
+// builds index from associated BAM file & writes out to index file
+bool BamStandardIndex::Create(void) {
  
-// calculates offset(s) for a given region
-bool BamStandardIndex::GetOffsets(const BamRegion& region,
-                                  const RefVector& references,
-                                  const bool isRightBoundSpecified,
-                                  vector<int64_t>& offsets,
-                                  bool* hasAlignmentsInRegion)
-{
-    // return false if leftBound refID is not found in index data
-    if ( m_indexData.find(region.LeftRefID) == m_indexData.end() )
+    // return false if BamReader is invalid or not open
+    if ( m_reader == 0 || !m_reader->IsOpen() ) {
+        cerr << "BamStandardIndex ERROR: BamReader is not open"
+             << ", aborting index creation" << endl;
          return false;
+    }
  
-    // load index data for region if not already cached
-    if ( !IsDataLoaded(region.LeftRefID) ) {
-        bool loadedOk = true;
-        loadedOk &= SkipToReference(region.LeftRefID);
-        loadedOk &= LoadReference(region.LeftRefID);
-        if ( !loadedOk ) return false;
+    // rewind BamReader
+    if ( !m_reader->Rewind() ) {
+        cerr << "BamStandardIndex ERROR: could not rewind BamReader to create index"
+             << ", aborting index creation" << endl;
+        return false;
+    }
+
+    // open new index file (read & write)
+    string indexFilename = m_reader->Filename() + Extension();
+    if ( !OpenFile(indexFilename, "w+b") ) {
+        cerr << "BamStandardIndex ERROR: could not open ouput index file: " << indexFilename
+             << ", aborting index creation" << endl;
+        return false;
      }
  
-    // calculate which bins overlap this region
-    uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
-    int numBins = BinsFromRegion(region, references, isRightBoundSpecified, bins);
-
-    // get bins for this reference
-    BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID);
-    if ( indexIter == m_indexData.end() ) return false; // error
-    const ReferenceIndex& refIndex = (*indexIter).second;
-    const BamBinMap& binMap        = refIndex.Bins;
-
-    // get minimum offset to consider
-    const LinearOffsetVector& linearOffsets = refIndex.Offsets;
-    const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() )
-                              ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
-
-    // store all alignment 'chunk' starts (file offsets) for bins in this region
-    for ( int i = 0; i < numBins; ++i ) {
-
-        const uint16_t binKey = bins[i];
-        map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
-        if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
-
-            // iterate over chunks
-            const ChunkVector& chunks = (*binIter).second;
-            std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
-            std::vector<Chunk>::const_iterator chunksEnd  = chunks.end();
-            for ( ; chunksIter != chunksEnd; ++chunksIter) {
-
-                // if valid chunk found, store its file offset
-                const Chunk& chunk = (*chunksIter);
-                if ( chunk.Stop > minOffset )
-                    offsets.push_back( chunk.Start );
+    // initialize BaiFileSummary with number of references
+    const int& numReferences = m_reader->GetReferenceCount();
+    ReserveForSummary(numReferences);
+
+    // initialize output file
+    bool createdOk = true;
+    createdOk &= WriteHeader();
+
+    // set up bin, ID, offset, & coordinate markers
+    const uint32_t defaultValue = 0xffffffffu;
+    uint32_t currentBin    = defaultValue;
+    uint32_t lastBin       = defaultValue;
+    int32_t  currentRefID  = defaultValue;
+    int32_t  lastRefID     = defaultValue;
+    uint64_t currentOffset = (uint64_t)m_reader->Tell();
+    uint64_t lastOffset    = currentOffset;
+    int32_t  lastPosition  = defaultValue;
+
+    // iterate through alignments in BAM file
+    BamAlignment al;
+    BaiReferenceEntry refEntry;
+    while ( m_reader->LoadNextAlignment(al) ) {
+
+        // changed to new reference
+        if ( lastRefID != al.RefID ) {
+
+            // if not first reference, save previous reference data
+            if ( lastRefID != (int32_t)defaultValue ) {
+
+                SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
+                createdOk &= WriteReferenceEntry(refEntry);
+                ClearReferenceEntry(refEntry);
+
+                // update bin markers
+                currentOffset = lastOffset;
+                currentBin    = al.Bin;
+                lastBin       = al.Bin;
+                currentRefID  = al.RefID;
              }
+
+            // update reference markers
+            refEntry.ID = al.RefID;
+            lastRefID   = al.RefID;
+            lastBin     = defaultValue;
          }
-    }
  
-    // clean up memory
-    free(bins);
+        // if lastPosition greater than current alignment position - file not sorted properly
+        else if ( lastPosition > al.Position ) {
+            cerr << "BamStandardIndex ERROR: BAM file is not properly sorted by coordinate"
+                 << ", aborting index creation"
+                 << endl
+                 << "At alignment: " << al.Name
+                 << " : previous position " << lastPosition
+                 << " > this alignment position " << al.Position
+                 << " on reference id: " << al.RefID << endl;
+            return false;
+        }
  
-    // sort the offsets before returning
-    sort(offsets.begin(), offsets.end());
+        // if alignment's ref ID is valid & its bin is not a 'leaf'
+        if ( (al.RefID >= 0) && (al.Bin < 4681) )
+            SaveLinearOffsetEntry(refEntry.LinearOffsets, al.Position, al.GetEndPosition(), lastOffset);
  
-    // set flag & return success
-    *hasAlignmentsInRegion = (offsets.size() != 0 );
+        // changed to new BAI bin
+        if ( al.Bin != lastBin ) {
  
-    // if cache mode set to none, dump the data we just loaded
-    if ( m_cacheMode == BamIndex::NoIndexCaching )
-        ClearReferenceOffsets(region.LeftRefID);
+            // if not first bin on reference, save previous bin data
+            if ( currentBin != defaultValue )
+                SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
  
-    // return succes
-    return true;
-}
+            // update markers
+            currentOffset = lastOffset;
+            currentBin    = al.Bin;
+            lastBin       = al.Bin;
+            currentRefID  = al.RefID;
  
-// returns whether reference has alignments or no
-bool BamStandardIndex::HasAlignments(const int& refId) const {
-    BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId);
-    if ( indexIter == m_indexData.end() ) return false; // error
-    const ReferenceIndex& refEntry = (*indexIter).second;
-    return refEntry.HasAlignments;
+            // if invalid RefID, break out
+            if ( currentRefID < 0 )
+                break;
+        }
+
+        // make sure that current file pointer is beyond lastOffset
+        if ( m_reader->Tell() <= (int64_t)lastOffset ) {
+            cerr << "BamStandardIndex ERROR: calculating offsets failed"
+                 << ", aborting index creation" << endl;
+            return false;
+        }
+
+        // update lastOffset & lastPosition
+        lastOffset   = m_reader->Tell();
+        lastPosition = al.Position;
+    }
+
+    // store last alignment chunk to its bin, then write last reference entry
+    if ( currentRefID >= 0 ) {
+        SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
+        createdOk &= WriteReferenceEntry(refEntry);
+    }
+
+    // rewind reader now that we're done building
+    createdOk &= m_reader->Rewind();
+
+    // return result
+    return createdOk;
  }
  
-// return true if all index data is cached
-bool BamStandardIndex::HasFullDataCache(void) const {
-    return m_hasFullDataCache;
+// returns format's file extension
+const string BamStandardIndex::Extension(void) {
+    return BamStandardIndex::BAI_EXTENSION;
  }
  
-// returns true if index cache has data for desired reference
-bool BamStandardIndex::IsDataLoaded(const int& refId) const {
+bool BamStandardIndex::GetOffsets(const BamRegion& region, vector<int64_t>& offsets) {
  
-    // look up refId, return false if not found
-    BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId);
-    if ( indexIter == m_indexData.end() ) return false;
+    // cannot calculate offsets if unknown/invalid reference ID requested
+    if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
+        return false;
  
-    // see if reference has alignments
-    // if not, it's not a problem to have no offset data
-    const ReferenceIndex& refEntry = (*indexIter).second;
-    if ( !refEntry.HasAlignments ) return true;
+    // retrieve index summary for left bound reference
+    const BaiReferenceSummary& refSummary = m_indexFileSummary.at(region.LeftRefID);
  
-    // return whether bin map contains data
-    return ( !refEntry.Bins.empty() );
-}
+    // set up region boundaries based on actual BamReader data
+    uint32_t begin;
+    uint32_t end;
+    if ( !AdjustRegion(region, begin, end) )
+        return false;
  
-// attempts to use index to jump to region; returns success/fail
-bool BamStandardIndex::Jump(Internal::BamReaderPrivate* reader,
-                            const BamTools::BamRegion& region,
-                            bool *hasAlignmentsInRegion)
-{
-    // skip if invalid reader
-    if ( reader == 0 )
+    // retrieve all candidate bin IDs for region
+    set<uint16_t> candidateBins;
+    CalculateCandidateBins(begin, end, candidateBins);
+
+    // use reference's linear offsets to calculate the minimum offset
+    // that must be considered to find overlap
+    const uint64_t& minOffset = CalculateMinOffset(refSummary, begin);
+
+    // attempt to use reference summary, minOffset, & candidateBins to calculate offsets
+    if ( !CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets) )
          return false;
  
-    // skip if reader BgzfStream is invalid or not open
-    BgzfStream* bgzfStream = reader->Stream();
-    if ( bgzfStream == 0 || !bgzfStream->IsOpen )
+    // ensure that offsets are sorted before returning
+    sort( offsets.begin(), offsets.end() );
+
+    // return succes
+    return true;
+}
+
+// returns whether reference has alignments or no
+bool BamStandardIndex::HasAlignments(const int& referenceID) const {
+    if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() )
          return false;
+    const BaiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID);
+    return ( refSummary.NumBins > 0 );
+}
  
-    // retrieve references from reader
-    const RefVector references = reader->GetReferenceData();
+bool BamStandardIndex::IsFileOpen(void) const {
+    return ( m_indexStream != 0 );
+}
  
-    // make sure left-bound position is valid
-    if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
+// attempts to use index data to jump to @region, returns success/fail
+// a "successful" jump indicates no error, but not whether this region has data
+//   * thus, the method sets a flag to indicate whether there are alignments
+//     available after the jump position
+bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
+
+    // skip if reader is not valid or is not open
+    if ( m_reader == 0 || !m_reader->IsOpen() )
          return false;
  
      // calculate offsets for this region
      // if failed, print message, set flag, and return failure
      vector<int64_t> offsets;
-    if ( !GetOffsets(region, references, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) {
-        fprintf(stderr, "BamStandardIndex ERROR: could not jump - unable to calculate offset candidates for specified region.\n");
+    if ( !GetOffsets(region, offsets) ) {
+        cerr << "BamStandardIndex ERROR: could not jump"
+             << ", unable to retrieve offsets for region" << endl;
          *hasAlignmentsInRegion = false;
          return false;
      }
  
-    // iterate through offsets
-    BamAlignment alignment;
+    // if no offsets retrieved, set flag
+    if ( offsets.empty() )
+        *hasAlignmentsInRegion = false;
+
+    // iterate through candidate offsets
+    BamAlignment al;
      bool result = true;
-    for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
+    vector<int64_t>::const_iterator offsetIter = offsets.begin();
+    vector<int64_t>::const_iterator offsetEnd  = offsets.end();
+    for ( ; offsetIter != offsetEnd; ++offsetIter) {
  
          // attempt seek & load first available alignment
          // set flag to true if data exists
-        result &= bgzfStream->Seek(*o);
-        *hasAlignmentsInRegion = reader->GetNextAlignmentCore(alignment);
+        result &= m_reader->Seek(*offsetIter);
+        *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al);
  
          // if this alignment corresponds to desired position
          // return success of seeking back to the offset before the 'current offset' (to cover overlaps)
-        if ( ((alignment.RefID == region.LeftRefID) &&
-             ((alignment.Position + alignment.Length) > region.LeftPosition)) ||
-             (alignment.RefID > region.LeftRefID) )
+        if ( ((al.RefID == region.LeftRefID) &&
+             ((al.Position + al.Length) > region.LeftPosition)) ||
+             (al.RefID > region.LeftRefID) )
          {
-            if ( o != offsets.begin() ) --o;
-                return bgzfStream->Seek(*o);
+            if ( offsetIter != offsets.begin() )
+                --offsetIter;
+            return m_reader->Seek(*offsetIter);
          }
      }
  
      // if error in jumping, print message & set flag
      if ( !result ) {
-        fprintf(stderr, "BamStandardIndex ERROR: could not jump - unable to determine correct offset for specified region.\n");
+        cerr << "BamStandardIndex ERROR: could not jump"
+             << ", there was a problem seeking in BAM file" << endl;
          *hasAlignmentsInRegion = false;
      }
  
@@ -421,313 +484,202 @@ bool BamStandardIndex::Jump(Internal::BamReaderPrivate* reader,
      return result;
  }
  
-// clears index data from all references except the first
-void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) {
-    BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
-    KeepOnlyReferenceOffsets((*indexBegin).first);
-}
+// loads existing data from file into memory
+bool BamStandardIndex::Load(const std::string& filename) {
  
-// clears index data from all references except the one specified
-void BamStandardIndex::KeepOnlyReferenceOffsets(const int& refId) {
-    BamStandardIndexData::iterator mapIter = m_indexData.begin();
-    BamStandardIndexData::iterator mapEnd  = m_indexData.end();
-    for ( ; mapIter != mapEnd; ++mapIter ) {
-        const int entryRefId = (*mapIter).first;
-        if ( entryRefId != refId )
-            ClearReferenceOffsets(entryRefId);
+    // attempt open index file (read-only)
+    if ( !OpenFile(filename, "rb") ) {
+        cerr << "BamStandardIndex ERROR: could not open input index file: " << filename
+             << ", aborting index load" << endl;
+        return false;
      }
-}
-
-bool BamStandardIndex::LoadAllReferences(bool saveData) {
-
-    // skip if data already loaded
-    if ( m_hasFullDataCache ) return true;
  
-    // get number of reference sequences
-    uint32_t numReferences;
-    if ( !LoadReferenceCount((int&)numReferences) )
+    // if invalid format 'magic number', close & return failure
+    if ( !CheckMagicNumber() ) {
+        CloseFile();
          return false;
+    }
  
-    // iterate over reference entries
-    bool loadedOk = true;
-    for ( int i = 0; i < (int)numReferences; ++i )
-        loadedOk &= LoadReference(i, saveData);
-
-    // set flag
-    if ( loadedOk && saveData )
-        m_hasFullDataCache = true;
-
-    // return success/failure of loading references
-    return loadedOk;
+    // attempt to load index file summary, return success/failure
+    return SummarizeIndexFile();
  }
  
-// load header data from index file, return true if loaded OK
-bool BamStandardIndex::LoadHeader(void) {
-
-    bool loadedOk = CheckMagicNumber();
+uint64_t BamStandardIndex::LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index) {
  
-    // store offset of beginning of data
-    m_dataBeginOffset = ftell64(m_indexStream);
+    // attempt seek to proper index file position
+    const int64_t linearOffsetFilePosition = (int64_t)refSummary.FirstLinearOffsetFilePosition +
+                                             index*BamStandardIndex::SIZEOF_LINEAROFFSET;
+    if ( !Seek(linearOffsetFilePosition, SEEK_SET) )
+        return 0;
  
-    // return success/failure of load
-    return loadedOk;
+    // read linear offset from BAI file
+    uint64_t linearOffset(0);
+    if ( !ReadLinearOffset(linearOffset) )
+        return 0;
+    return linearOffset;
  }
  
-// load a single index bin entry from file, return true if loaded OK
-// @saveData - save data in memory if true, just read & discard if false
-bool BamStandardIndex::LoadBin(ReferenceIndex& refEntry, bool saveData) {
+void BamStandardIndex::MergeAlignmentChunks(BaiAlignmentChunkVector& chunks) {
  
-    size_t elementsRead = 0;
+    // skip if chunks are empty, nothing to merge
+    if ( chunks.empty() )
+        return;
  
-    // get bin ID
-    uint32_t binId;
-    elementsRead += fread(&binId, sizeof(binId), 1, m_indexStream);
-    if ( m_isBigEndian ) SwapEndian_32(binId);
-
-    // load alignment chunks for this bin
-    ChunkVector chunks;
-    bool chunksOk = LoadChunks(chunks, saveData);
+    // set up merged alignment chunk container
+    BaiAlignmentChunkVector mergedChunks;
+    mergedChunks.push_back( chunks[0] );
  
-    // store bin entry
-    if ( chunksOk && saveData )
-        refEntry.Bins.insert(pair<uint32_t, ChunkVector>(binId, chunks));
-
-    // return success/failure of load
-    return ( (elementsRead == 1) && chunksOk );
-}
+    // iterate over chunks
+    int i = 0;
+    BaiAlignmentChunkVector::iterator chunkIter = chunks.begin();
+    BaiAlignmentChunkVector::iterator chunkEnd  = chunks.end();
+    for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
  
-bool BamStandardIndex::LoadBins(ReferenceIndex& refEntry, bool saveData) {
+        // get 'currentMergeChunk' based on numeric index
+        BaiAlignmentChunk& currentMergeChunk = mergedChunks[i];
  
-    size_t elementsRead = 0;
+        // get sourceChunk based on source vector iterator
+        BaiAlignmentChunk& sourceChunk = (*chunkIter);
  
-    // get number of bins
-    int32_t numBins;
-    elementsRead += fread(&numBins, sizeof(numBins), 1, m_indexStream);
-    if ( m_isBigEndian ) SwapEndian_32(numBins);
+        // if currentMergeChunk ends where sourceChunk starts, then merge the two
+        if ( currentMergeChunk.Stop>>16 == sourceChunk.Start>>16 )
+            currentMergeChunk.Stop = sourceChunk.Stop;
  
-    // set flag
-    refEntry.HasAlignments = ( numBins != 0 );
+        // otherwise
+        else {
+            // append sourceChunk after currentMergeChunk
+            mergedChunks.push_back(sourceChunk);
  
-    // iterate over bins
-    bool binsOk = true;
-    for ( int i = 0; i < numBins; ++i )
-        binsOk &= LoadBin(refEntry, saveData);
+            // update i, so the next iteration will consider the
+            // recently-appended sourceChunk as new mergeChunk candidate
+            ++i;
+        }
+    }
  
-    // return success/failure of load
-    return ( (elementsRead == 1) && binsOk );
+    // saved newly-merged chunks into (parameter) chunks
+    chunks = mergedChunks;
  }
  
-// load a single index bin entry from file, return true if loaded OK
-// @saveData - save data in memory if true, just read & discard if false
-bool BamStandardIndex::LoadChunk(ChunkVector& chunks, bool saveData) {
+bool BamStandardIndex::OpenFile(const std::string& filename, const char* mode) {
  
-    size_t elementsRead = 0;
-
-    // read in chunk data
-    uint64_t start;
-    uint64_t stop;
-    elementsRead += fread(&start, sizeof(start), 1, m_indexStream);
-    elementsRead += fread(&stop,  sizeof(stop),  1, m_indexStream);
+    // make sure any previous index file is closed
+    CloseFile();
  
-    // swap endian-ness if necessary
-    if ( m_isBigEndian ) {
-        SwapEndian_64(start);
-        SwapEndian_64(stop);
-    }
-
-    // save data if requested
-    if ( saveData ) chunks.push_back( Chunk(start, stop) );
-
-    // return success/failure of load
-    return ( elementsRead == 2 );
+    // attempt to open file
+    m_indexStream = fopen(filename.c_str(), mode);
+    return IsFileOpen();
  }
  
-bool BamStandardIndex::LoadChunks(ChunkVector& chunks, bool saveData) {
-
+bool BamStandardIndex::ReadBinID(uint32_t& binId) {
      size_t elementsRead = 0;
-
-    // read in number of chunks
-    uint32_t numChunks;
-    elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_indexStream);
-    if ( m_isBigEndian ) SwapEndian_32(numChunks);
-
-    // initialize space for chunks if we're storing this data
-    if ( saveData ) chunks.reserve(numChunks);
-
-    // iterate over chunks
-    bool chunksOk = true;
-    for ( int i = 0; i < (int)numChunks; ++i )
-        chunksOk &= LoadChunk(chunks, saveData);
-
-    // sort chunk vector
-    sort( chunks.begin(), chunks.end(), ChunkLessThan );
-
-    // return success/failure of load
-    return ( (elementsRead == 1) && chunksOk );
+    elementsRead += fread(&binId, sizeof(binId), 1, m_indexStream);
+    if ( m_isBigEndian ) SwapEndian_32(binId);
+    return ( elementsRead == 1 );
  }
  
-// load a single index linear offset entry from file, return true if loaded OK
-// @saveData - save data in memory if true, just read & discard if false
-bool BamStandardIndex::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) {
+bool BamStandardIndex::ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks) {
  
-    size_t elementsRead = 0;
+    bool readOk = true;
  
-    // read in number of linear offsets
-    int32_t numLinearOffsets;
-    elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream);
-    if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets);
+    // read bin header
+    readOk &= ReadBinID(binId);
+    readOk &= ReadNumAlignmentChunks(numAlignmentChunks);
  
-    // set up destination vector (if we're saving the data)
-    LinearOffsetVector linearOffsets;
-    if ( saveData ) linearOffsets.reserve(numLinearOffsets);
+    // read bin contents
+    const unsigned int bytesRequested = numAlignmentChunks*BamStandardIndex::SIZEOF_ALIGNMENTCHUNK;
+    readOk &= ReadIntoBuffer(bytesRequested);
  
-    // iterate over linear offsets
-    uint64_t linearOffset;
-    for ( int i = 0; i < numLinearOffsets; ++i ) {
-        elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream);
-        if ( m_isBigEndian ) SwapEndian_64(linearOffset);
-        if ( saveData ) linearOffsets.push_back(linearOffset);
-    }
+    // return success/failure
+    return readOk;
+}
  
-    // sort linear offsets
-    sort ( linearOffsets.begin(), linearOffsets.end() );
+bool BamStandardIndex::ReadIntoBuffer(const unsigned int& bytesRequested) {
  
-    // save in reference index entry if desired
-    if ( saveData ) refEntry.Offsets = linearOffsets;
+    // ensure that our buffer is big enough for request
+    BamStandardIndex::CheckBufferSize(m_buffer, m_bufferLength, bytesRequested);
  
-    // return success/failure of load
-    return ( elementsRead == (size_t)(numLinearOffsets + 1) );
+    // read from BAI file stream
+    size_t bytesRead = fread( m_buffer, sizeof(char), bytesRequested, m_indexStream );
+    return ( bytesRead == (size_t)bytesRequested );
  }
  
-bool BamStandardIndex::LoadFirstReference(bool saveData) {
-    BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
-    return LoadReference((*indexBegin).first, saveData);
+bool BamStandardIndex::ReadLinearOffset(uint64_t& linearOffset) {
+    size_t elementsRead = 0;
+    elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream);
+    if ( m_isBigEndian ) SwapEndian_64(linearOffset);
+    return ( elementsRead == 1 );
  }
  
-// load a single reference from file, return true if loaded OK
-// @saveData - save data in memory if true, just read & discard if false
-bool BamStandardIndex::LoadReference(const int& refId, bool saveData) {
-
-    // look up refId
-    BamStandardIndexData::iterator indexIter = m_indexData.find(refId);
-
-    // if reference not previously loaded, create new entry
-    if ( indexIter == m_indexData.end() ) {
-        ReferenceIndex newEntry;
-        newEntry.HasAlignments = false;
-        m_indexData.insert( pair<int32_t, ReferenceIndex>(refId, newEntry) );
-    }
-
-    // load reference data
-    indexIter = m_indexData.find(refId);
-    ReferenceIndex& entry = (*indexIter).second;
-    bool loadedOk = true;
-    loadedOk &= LoadBins(entry, saveData);
-    loadedOk &= LoadLinearOffsets(entry, saveData);
-    return loadedOk;
+bool BamStandardIndex::ReadNumAlignmentChunks(int& numAlignmentChunks) {
+    size_t elementsRead = 0;
+    elementsRead += fread(&numAlignmentChunks, sizeof(numAlignmentChunks), 1, m_indexStream);
+    if ( m_isBigEndian ) SwapEndian_32(numAlignmentChunks);
+    return ( elementsRead == 1 );
  }
  
-// loads number of references, return true if loaded OK
-bool BamStandardIndex::LoadReferenceCount(int& numReferences) {
+bool BamStandardIndex::ReadNumBins(int& numBins) {
+    size_t elementsRead = 0;
+    elementsRead += fread(&numBins, sizeof(numBins), 1, m_indexStream);
+    if ( m_isBigEndian ) SwapEndian_32(numBins);
+    return ( elementsRead == 1 );
+}
  
+bool BamStandardIndex::ReadNumLinearOffsets(int& numLinearOffsets) {
      size_t elementsRead = 0;
+    elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream);
+    if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets);
+    return ( elementsRead == 1 );
+}
  
-    // read reference count
+bool BamStandardIndex::ReadNumReferences(int& numReferences) {
+    size_t elementsRead = 0;
      elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
      if ( m_isBigEndian ) SwapEndian_32(numReferences);
-
-    // return success/failure of load
      return ( elementsRead == 1 );
  }
  
-// merges 'alignment chunks' in BAM bin (used for index building)
-void BamStandardIndex::MergeChunks(void) {
-
-    // iterate over reference enties
-    BamStandardIndexData::iterator indexIter = m_indexData.begin();
-    BamStandardIndexData::iterator indexEnd  = m_indexData.end();
-    for ( ; indexIter != indexEnd; ++indexIter ) {
-
-        // get BAM bin map for this reference
-        ReferenceIndex& refIndex = (*indexIter).second;
-        BamBinMap& bamBinMap = refIndex.Bins;
-
-        // iterate over BAM bins
-        BamBinMap::iterator binIter = bamBinMap.begin();
-        BamBinMap::iterator binEnd  = bamBinMap.end();
-        for ( ; binIter != binEnd; ++binIter ) {
-
-            // get chunk vector for this bin
-            ChunkVector& binChunks = (*binIter).second;
-            if ( binChunks.size() == 0 ) continue;
-
-            ChunkVector mergedChunks;
-            mergedChunks.push_back( binChunks[0] );
-
-            // iterate over chunks
-            int i = 0;
-            ChunkVector::iterator chunkIter = binChunks.begin();
-            ChunkVector::iterator chunkEnd  = binChunks.end();
-            for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
-
-                // get 'currentChunk' based on numeric index
-                Chunk& currentChunk = mergedChunks[i];
-
-                // get iteratorChunk based on vector iterator
-                Chunk& iteratorChunk = (*chunkIter);
-
-                // if chunk ends where (iterator) chunk starts, then merge
-                if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 )
-                    currentChunk.Stop = iteratorChunk.Stop;
-
-                // otherwise
-                else {
-                    // set currentChunk + 1 to iteratorChunk
-                    mergedChunks.push_back(iteratorChunk);
-                    ++i;
-                }
-            }
-
-            // saved merged chunk vector
-            (*binIter).second = mergedChunks;
-        }
-    }
+void BamStandardIndex::ReserveForSummary(const int& numReferences) {
+    m_indexFileSummary.clear();
+    m_indexFileSummary.assign( numReferences, BaiReferenceSummary() );
  }
  
-// saves BAM bin entry for index
-void BamStandardIndex::SaveBinEntry(BamBinMap& binMap,
-                                   const uint32_t& saveBin,
-                                   const uint64_t& saveOffset,
-                                   const uint64_t& lastOffset)
+void BamStandardIndex::SaveAlignmentChunkToBin(BaiBinMap& binMap,
+                                    const uint32_t& currentBin,
+                                    const uint64_t& currentOffset,
+                                    const uint64_t& lastOffset)
  {
-    // look up saveBin
-    BamBinMap::iterator binIter = binMap.find(saveBin);
-
-    // create new chunk
-    Chunk newChunk(saveOffset, lastOffset);
+    // create new alignment chunk
+    BaiAlignmentChunk newChunk(currentOffset, lastOffset);
  
-    // if entry doesn't exist
+    // if no entry exists yet for this bin, create one and store alignment chunk
+    BaiBinMap::iterator binIter = binMap.find(currentBin);
      if ( binIter == binMap.end() ) {
-        ChunkVector newChunks;
+        BaiAlignmentChunkVector newChunks;
          newChunks.push_back(newChunk);
-        binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
+        binMap.insert( pair<uint32_t, BaiAlignmentChunkVector>(currentBin, newChunks));
      }
  
-    // otherwise
+    // otherwise, just append alignment chunk
      else {
-        ChunkVector& binChunks = (*binIter).second;
+        BaiAlignmentChunkVector& binChunks = (*binIter).second;
          binChunks.push_back( newChunk );
      }
  }
  
-// saves linear offset entry for index
-void BamStandardIndex::SaveLinearOffset(LinearOffsetVector& offsets,
-                                       const BamAlignment& bAlignment,
-                                       const uint64_t&     lastOffset)
+void BamStandardIndex::SaveBinsSummary(const int& refId, const int& numBins) {
+    BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId);
+    refSummary.NumBins = numBins;
+    refSummary.FirstBinFilePosition = Tell();
+}
+
+void BamStandardIndex::SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets,
+                                             const int& alignmentStartPosition,
+                                             const int& alignmentStopPosition,
+                                             const uint64_t& lastOffset)
  {
      // get converted offsets
-    int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
-    int endOffset   = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
+    const int beginOffset = alignmentStartPosition >> BamStandardIndex::BAM_LIDX_SHIFT;
+    const int endOffset   = (alignmentStopPosition - 1) >> BamStandardIndex::BAM_LIDX_SHIFT;
  
      // resize vector if necessary
      int oldSize = offsets.size();
@@ -739,120 +691,108 @@ void BamStandardIndex::SaveLinearOffset(LinearOffsetVector& offsets,
      for( int i = beginOffset + 1; i <= endOffset; ++i ) {
          if ( offsets[i] == 0 )
              offsets[i] = lastOffset;
-        }
+    }
  }
  
-// initializes index data structure to hold @count references
-void BamStandardIndex::SetReferenceCount(const int& count) {
-    for ( int i = 0; i < count; ++i )
-        m_indexData[i].HasAlignments = false;
+void BamStandardIndex::SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets) {
+    BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId);
+    refSummary.NumLinearOffsets = numLinearOffsets;
+    refSummary.FirstLinearOffsetFilePosition = Tell();
  }
  
-bool BamStandardIndex::SkipToFirstReference(void) {
-    BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
-    return SkipToReference( (*indexBegin).first );
+// seek to position in index file stream
+bool BamStandardIndex::Seek(const int64_t& position, const int& origin) {
+    return ( fseek64(m_indexStream, position, origin) == 0 );
  }
  
-// position file pointer to desired reference begin, return true if skipped OK
-bool BamStandardIndex::SkipToReference(const int& refId) {
-
-    // attempt rewind
-    if ( !Rewind() ) return false;
-
-    // read in number of references
-    uint32_t numReferences;
-    size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
-    if ( elementsRead != 1 ) return false;
-    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+// change the index caching behavior
+void BamStandardIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) {
+    m_cacheMode = mode;
+    // do nothing else here ? cache mode will be ignored from now on, most likely
+}
  
-    // iterate over reference entries
+bool BamStandardIndex::SkipBins(const int& numBins) {
+    uint32_t binId;
+    int32_t numAlignmentChunks;
      bool skippedOk = true;
-    int currentRefId = 0;
-    while (currentRefId != refId) {
-        skippedOk &= LoadReference(currentRefId, false);
-        ++currentRefId;
-    }
-
-    // return success
+    for (int i = 0; i < numBins; ++i)
+        skippedOk &= ReadBinIntoBuffer(binId, numAlignmentChunks); // results & buffer ignored
      return skippedOk;
  }
  
-// write header to new index file
-bool BamStandardIndex::WriteHeader(void) {
-
-    size_t elementsWritten = 0;
-
-    // write magic number
-    elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_indexStream);
-
-    // store offset of beginning of data
-    m_dataBeginOffset = ftell64(m_indexStream);
-
-    // return success/failure of write
-    return (elementsWritten == 4);
+bool BamStandardIndex::SkipLinearOffsets(const int& numLinearOffsets) {
+    const unsigned int bytesRequested = numLinearOffsets*BamStandardIndex::SIZEOF_LINEAROFFSET;
+    return ReadIntoBuffer(bytesRequested);
  }
  
-// write index data for all references to new index file
-bool BamStandardIndex::WriteAllReferences(void) {
+void BamStandardIndex::SortLinearOffsets(BaiLinearOffsetVector& linearOffsets) {
+    sort( linearOffsets.begin(), linearOffsets.end() );
+}
  
-    size_t elementsWritten = 0;
+bool BamStandardIndex::SummarizeBins(BaiReferenceSummary& refSummary) {
  
-    // write number of reference sequences
-    int32_t numReferenceSeqs = m_indexData.size();
-    if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs);
-    elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_indexStream);
+    // load number of bins
+    int numBins;
+    if ( !ReadNumBins(numBins) )
+        return false;
  
-    // iterate over reference sequences
-    bool refsOk = true;
-    BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
-    BamStandardIndexData::const_iterator indexEnd  = m_indexData.end();
-    for ( ; indexIter != indexEnd; ++ indexIter )
-        refsOk &= WriteReference( (*indexIter).second );
+    // store bins summary for this reference
+    refSummary.NumBins = numBins;
+    refSummary.FirstBinFilePosition = Tell();
  
-    // return success/failure of write
-    return ( (elementsWritten == 1) && refsOk );
+    // attempt skip reference bins, return success/failure
+    return SkipBins(numBins);
  }
  
-// write index data for bin to new index file
-bool BamStandardIndex::WriteBin(const uint32_t& binId, const ChunkVector& chunks) {
+bool BamStandardIndex::SummarizeIndexFile(void) {
  
-    size_t elementsWritten = 0;
+    // load number of reference sequences
+    int numReferences;
+    if ( !ReadNumReferences(numReferences) )
+        return false;
  
-    // write BAM bin ID
-    uint32_t binKey = binId;
-    if ( m_isBigEndian ) SwapEndian_32(binKey);
-    elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_indexStream);
+    // initialize file summary data
+    ReserveForSummary(numReferences);
  
-    // write chunks
-    bool chunksOk = WriteChunks(chunks);
+    // iterate over reference entries
+    bool loadedOk = true;
+    BaiFileSummary::iterator summaryIter = m_indexFileSummary.begin();
+    BaiFileSummary::iterator summaryEnd  = m_indexFileSummary.end();
+    for ( int i = 0; summaryIter != summaryEnd; ++summaryIter, ++i )
+        loadedOk &= SummarizeReference(*summaryIter);
  
-    // return success/failure of write
-    return ( (elementsWritten == 1) && chunksOk );
+    // return result
+    return loadedOk;
  }
  
-// write index data for bins to new index file
-bool BamStandardIndex::WriteBins(const BamBinMap& bins) {
+bool BamStandardIndex::SummarizeLinearOffsets(BaiReferenceSummary& refSummary) {
  
-    size_t elementsWritten = 0;
+    // load number of linear offsets
+    int numLinearOffsets;
+    if ( !ReadNumLinearOffsets(numLinearOffsets) )
+        return false;
  
-    // write number of bins
-    int32_t binCount = bins.size();
-    if ( m_isBigEndian ) SwapEndian_32(binCount);
-    elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_indexStream);
+    // store bin summary data for this reference
+    refSummary.NumLinearOffsets = numLinearOffsets;
+    refSummary.FirstLinearOffsetFilePosition = Tell();
  
-    // iterate over bins
-    bool binsOk = true;
-    BamBinMap::const_iterator binIter = bins.begin();
-    BamBinMap::const_iterator binEnd  = bins.end();
-    for ( ; binIter != binEnd; ++binIter )
-        binsOk &= WriteBin( (*binIter).first, (*binIter).second );
+    // skip linear offsets in index file
+    return SkipLinearOffsets(numLinearOffsets);
+}
  
-    // return success/failure of write
-    return ( (elementsWritten == 1) && binsOk );
+bool BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary) {
+    bool loadedOk = true;
+    loadedOk &= SummarizeBins(refSummary);
+    loadedOk &= SummarizeLinearOffsets(refSummary);
+    return loadedOk;
  }
  
-// write index data for chunk entry to new index file
-bool BamStandardIndex::WriteChunk(const Chunk& chunk) {
+// return position of file pointer in index file stream
+int64_t BamStandardIndex::Tell(void) const {
+    return ftell64(m_indexStream);
+}
+
+bool BamStandardIndex::WriteAlignmentChunk(const BaiAlignmentChunk& chunk) {
  
      size_t elementsWritten = 0;
  
@@ -874,8 +814,10 @@ bool BamStandardIndex::WriteChunk(const Chunk& chunk) {
      return ( elementsWritten == 2 );
  }
  
-// write index data for chunk entry to new index file
-bool BamStandardIndex::WriteChunks(const ChunkVector& chunks) {
+bool BamStandardIndex::WriteAlignmentChunks(BaiAlignmentChunkVector& chunks) {
+
+    // make sure chunks are merged (simplified) before writing & saving summary
+    MergeAlignmentChunks(chunks);
  
      size_t elementsWritten = 0;
  
@@ -886,28 +828,88 @@ bool BamStandardIndex::WriteChunks(const ChunkVector& chunks) {
  
      // iterate over chunks
      bool chunksOk = true;
-    ChunkVector::const_iterator chunkIter = chunks.begin();
-    ChunkVector::const_iterator chunkEnd  = chunks.end();
+    BaiAlignmentChunkVector::const_iterator chunkIter = chunks.begin();
+    BaiAlignmentChunkVector::const_iterator chunkEnd  = chunks.end();
      for ( ; chunkIter != chunkEnd; ++chunkIter )
-        chunksOk &= WriteChunk( (*chunkIter) );
+        chunksOk &= WriteAlignmentChunk( (*chunkIter) );
+
+    // return success/failure of write
+    return ( (elementsWritten == 1) && chunksOk );
+}
+
+bool BamStandardIndex::WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks) {
+
+    size_t elementsWritten = 0;
+
+    // write BAM bin ID
+    uint32_t binKey = binId;
+    if ( m_isBigEndian ) SwapEndian_32(binKey);
+    elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_indexStream);
+
+    // write bin's alignment chunks
+    bool chunksOk = WriteAlignmentChunks(chunks);
  
      // return success/failure of write
      return ( (elementsWritten == 1) && chunksOk );
  }
  
-// write index data for linear offsets entry to new index file
-bool BamStandardIndex::WriteLinearOffsets(const LinearOffsetVector& offsets) {
+bool BamStandardIndex::WriteBins(const int& refId, BaiBinMap& bins) {
+
+    size_t elementsWritten = 0;
+
+    // write number of bins
+    int32_t binCount = bins.size();
+    if ( m_isBigEndian ) SwapEndian_32(binCount);
+    elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_indexStream);
+
+    // save summary for reference's bins
+    SaveBinsSummary(refId, bins.size());
+
+    // iterate over bins
+    bool binsOk = true;
+    BaiBinMap::iterator binIter = bins.begin();
+    BaiBinMap::iterator binEnd  = bins.end();
+    for ( ; binIter != binEnd; ++binIter )
+        binsOk &= WriteBin( (*binIter).first, (*binIter).second );
+
+    // return success/failure of write
+    return ( (elementsWritten == 1) && binsOk );
+}
+
+bool BamStandardIndex::WriteHeader(void) {
+
+    size_t elementsWritten = 0;
+
+    // write magic number
+    elementsWritten += fwrite(BamStandardIndex::BAI_MAGIC, sizeof(char), 4, m_indexStream);
+
+    // write number of reference sequences
+    int32_t numReferences = m_indexFileSummary.size();
+    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+    elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream);
+
+    // return success/failure of write
+    return (elementsWritten == 5);
+}
+
+bool BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets) {
+
+    // make sure linear offsets are sorted before writing & saving summary
+    SortLinearOffsets(linearOffsets);
  
      size_t elementsWritten = 0;
  
      // write number of linear offsets
-    int32_t offsetCount = offsets.size();
+    int32_t offsetCount = linearOffsets.size();
      if ( m_isBigEndian ) SwapEndian_32(offsetCount);
      elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_indexStream);
  
+    // save summary for reference's linear offsets
+    SaveLinearOffsetsSummary(refId, linearOffsets.size());
+
      // iterate over linear offsets
-    LinearOffsetVector::const_iterator offsetIter = offsets.begin();
-    LinearOffsetVector::const_iterator offsetEnd  = offsets.end();
+    BaiLinearOffsetVector::const_iterator offsetIter = linearOffsets.begin();
+    BaiLinearOffsetVector::const_iterator offsetEnd  = linearOffsets.end();
      for ( ; offsetIter != offsetEnd; ++offsetIter ) {
  
          // write linear offset
@@ -917,13 +919,12 @@ bool BamStandardIndex::WriteLinearOffsets(const LinearOffsetVector& offsets) {
      }
  
      // return success/failure of write
-    return ( elementsWritten == (size_t)(offsetCount + 1) );
+    return ( elementsWritten == (size_t)(linearOffsets.size() + 1) );
  }
  
-// write index data for a single reference to new index file
-bool BamStandardIndex::WriteReference(const ReferenceIndex& refEntry) {
+bool BamStandardIndex::WriteReferenceEntry(BaiReferenceEntry& refEntry) {
      bool refOk = true;
-    refOk &= WriteBins(refEntry.Bins);
-    refOk &= WriteLinearOffsets(refEntry.Offsets);
+    refOk &= WriteBins(refEntry.ID, refEntry.Bins);
+    refOk &= WriteLinearOffsets(refEntry.ID, refEntry.LinearOffsets);
      return refOk;
  }
diff --git a/src/api/internal/BamStandardIndex_p.h b/src/api/internal/BamStandardIndex_p.h

index 767606e03364b513a41365200d62b35de570f9e6..7c61a6296b1dd95b6da0b29ae27a19d943df5cd8 100644 (file)
--- a/src/api/internal/BamStandardIndex_p.h
+++ b/src/api/internal/BamStandardIndex_p.h
@@ -3,7 +3,7 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 19 January 2011 (DB)
+// Last modified: 5 April 2011 (DB)
  // ---------------------------------------------------------------------------
  // Provides index operations for the standardized BAM index format (".bai")
  // ***************************************************************************
@@ -24,193 +24,211 @@
  #include <api/BamAux.h>
  #include <api/BamIndex.h>
  #include <map>
+#include <set>
  #include <string>
  #include <vector>
  
  namespace BamTools {
-
-class BamAlignment;
-
  namespace Internal {
  
-// BAM index constants
-const int MAX_BIN        = 37450;    // =(8^6-1)/7+1
-const int BAM_LIDX_SHIFT = 14;
-const std::string BAI_EXTENSION = ".bai";
+// -----------------------------------------------------------------------------
+// BamStandardIndex data structures
  
-// --------------------------------------------------
-// BamStandardIndex data structures & typedefs
-struct Chunk {
+// defines start and end of a contiguous run of alignments
+struct BaiAlignmentChunk {
  
      // data members
      uint64_t Start;
      uint64_t Stop;
  
      // constructor
-    Chunk(const uint64_t& start = 0,
-          const uint64_t& stop = 0)
+    BaiAlignmentChunk(const uint64_t& start = 0,
+                      const uint64_t& stop = 0)
          : Start(start)
          , Stop(stop)
      { }
  };
  
+// comparison operator (for sorting)
  inline
-bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
+bool operator<(const BaiAlignmentChunk& lhs, const BaiAlignmentChunk& rhs) {
      return lhs.Start < rhs.Start;
  }
  
-typedef std::vector<Chunk> ChunkVector;
-typedef std::map<uint32_t, ChunkVector> BamBinMap;
-typedef std::vector<uint64_t> LinearOffsetVector;
+// convenience typedef for a list of all alignment 'chunks' in a BAI bin
+typedef std::vector<BaiAlignmentChunk> BaiAlignmentChunkVector;
+
+// convenience typedef for a map of all BAI bins in a reference (ID => chunks)
+typedef std::map<uint32_t, BaiAlignmentChunkVector> BaiBinMap;
+
+// convenience typedef for a list of all 'linear offsets' in a reference
+typedef std::vector<uint64_t> BaiLinearOffsetVector;
  
-struct ReferenceIndex {
+// contains all fields necessary for building, loading, & writing
+// full BAI index data for a single reference
+struct BaiReferenceEntry {
  
      // data members
-    BamBinMap Bins;
-    LinearOffsetVector Offsets;
-    bool HasAlignments;
+    int32_t ID;
+    BaiBinMap Bins;
+    BaiLinearOffsetVector LinearOffsets;
  
-    // constructor
-    ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
-                   const LinearOffsetVector& offsets = LinearOffsetVector(),
-                   const bool hasAlignments          = false)
-        : Bins(binMap)
-        , Offsets(offsets)
-        , HasAlignments(hasAlignments)
+    // ctor
+    BaiReferenceEntry(const int32_t& id = -1)
+        : ID(id)
      { }
  };
  
-typedef std::map<int32_t, ReferenceIndex> BamStandardIndexData;
+// provides (persistent) summary of BaiReferenceEntry's index data
+struct BaiReferenceSummary {
+
+    // data members
+    int NumBins;
+    int NumLinearOffsets;
+    uint64_t FirstBinFilePosition;
+    uint64_t FirstLinearOffsetFilePosition;
+
+    // ctor
+    BaiReferenceSummary(void)
+        : NumBins(0)
+        , NumLinearOffsets(0)
+        , FirstBinFilePosition(0)
+        , FirstLinearOffsetFilePosition(0)
+    { }
+};
+
+// convenience typedef for describing a full BAI index file summary
+typedef std::vector<BaiReferenceSummary> BaiFileSummary;
+
+// end BamStandardIndex data structures
+// -----------------------------------------------------------------------------
  
  class BamStandardIndex : public BamIndex {
  
      // ctor & dtor
      public:
-        BamStandardIndex(void);
+        BamStandardIndex(Internal::BamReaderPrivate* reader);
          ~BamStandardIndex(void);
  
-    // interface (implements BamIndex virtual methods)
+    // BamIndex implementation
      public:
-        // creates index data (in-memory) from @reader data
-        bool Build(Internal::BamReaderPrivate* reader);
-        // returns supported file extension
-        const std::string Extension(void) { return BAI_EXTENSION; }
+        // builds index from associated BAM file & writes out to index file
+        bool Create(void);
          // returns whether reference has alignments or no
          bool HasAlignments(const int& referenceID) const;
-        // attempts to use index to jump to @region in @reader; returns success/fail
+        // attempts to use index data to jump to @region, returns success/fail
          // a "successful" jump indicates no error, but not whether this region has data
          //   * thus, the method sets a flag to indicate whether there are alignments
          //     available after the jump position
-        bool Jump(Internal::BamReaderPrivate* reader,
-                  const BamTools::BamRegion& region,
-                  bool* hasAlignmentsInRegion);
-
-    public:
-        // clear all current index offset data in memory
-        void ClearAllData(void);
-        // return file position after header metadata
-        off_t DataBeginOffset(void) const;
-        // return true if all index data is cached
-        bool HasFullDataCache(void) const;
-        // clears index data from all references except the first
-        void KeepOnlyFirstReferenceOffsets(void);
-        // load index data for all references, return true if loaded OK
-        // @saveData - save data in memory if true, just read & discard if false
-        bool LoadAllReferences(bool saveData = true);
-        // load first reference from file, return true if loaded OK
-        // @saveData - save data in memory if true, just read & discard if false
-        bool LoadFirstReference(bool saveData = true);
-        // load header data from index file, return true if loaded OK
-        bool LoadHeader(void);
-        // position file pointer to first reference begin, return true if skipped OK
-        bool SkipToFirstReference(void);
-        // write index reference data
-        bool WriteAllReferences(void);
-        // write index header data
-        bool WriteHeader(void);
-
-    // 'internal' methods
+        bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
+        // loads existing data from file into memory
+        bool Load(const std::string& filename);
+        // change the index caching behavior
+        void SetCacheMode(const BamIndex::IndexCacheMode& mode);
      public:
+        // returns format's file extension
+        static const std::string Extension(void);
  
-        // -----------------------
-        // index file operations
-
-        // check index file magic number, return true if OK
+    // internal file ops
+    private:
          bool CheckMagicNumber(void);
-        // check index file version, return true if OK
-        bool CheckVersion(void);
-        // load a single index bin entry from file, return true if loaded OK
-        // @saveData - save data in memory if true, just read & discard if false
-        bool LoadBin(ReferenceIndex& refEntry, bool saveData = true);
-        bool LoadBins(ReferenceIndex& refEntry, bool saveData = true);
-        // load a single index bin entry from file, return true if loaded OK
-        // @saveData - save data in memory if true, just read & discard if false
-        bool LoadChunk(ChunkVector& chunks, bool saveData = true);
-        bool LoadChunks(ChunkVector& chunks, bool saveData = true);
-        // load a single index linear offset entry from file, return true if loaded OK
-        // @saveData - save data in memory if true, just read & discard if false
-        bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true);
-        // load a single reference from file, return true if loaded OK
-        // @saveData - save data in memory if true, just read & discard if false
-        bool LoadReference(const int& refId, bool saveData = true);
-        // loads number of references, return true if loaded OK
-        bool LoadReferenceCount(int& numReferences);
-        // position file pointer to desired reference begin, return true if skipped OK
-        bool SkipToReference(const int& refId);
-        // write index data for bin to new index file
-        bool WriteBin(const uint32_t& binId, const ChunkVector& chunks);
-        // write index data for bins to new index file
-        bool WriteBins(const BamBinMap& bins);
-        // write index data for chunk entry to new index file
-        bool WriteChunk(const Chunk& chunk);
-        // write index data for chunk entry to new index file
-        bool WriteChunks(const ChunkVector& chunks);
-        // write index data for linear offsets entry to new index file
-        bool WriteLinearOffsets(const LinearOffsetVector& offsets);
-        // write index data single reference to new index file
-        bool WriteReference(const ReferenceIndex& refEntry);
-
-        // -----------------------
-        // index data operations
-
-        // calculate bins that overlap region
-        int BinsFromRegion(const BamRegion& region,
-                           const RefVector& references,
-                           const bool isRightBoundSpecified,
-                           uint16_t bins[MAX_BIN]);
-        // clear all index offset data for desired reference
-        void ClearReferenceOffsets(const int& refId);
-        // calculates offset(s) for a given region
-        bool GetOffsets(const BamRegion& region,
-                        const RefVector& references,
-                        const bool isRightBoundSpecified,
-                        std::vector<int64_t>& offsets,
-                        bool* hasAlignmentsInRegion);
-        // returns true if index cache has data for desired reference
-        bool IsDataLoaded(const int& refId) const;
-        // clears index data from all references except the one specified
-        void KeepOnlyReferenceOffsets(const int& refId);
-        // simplifies index by merging 'chunks'
-        void MergeChunks(void);
-        // saves BAM bin entry for index
-        void SaveBinEntry(BamBinMap& binMap,
-                          const uint32_t& saveBin,
-                          const uint64_t& saveOffset,
-                          const uint64_t& lastOffset);
-        // saves linear offset entry for index
-        void SaveLinearOffset(LinearOffsetVector& offsets,
-                              const BamAlignment& bAlignment,
-                              const uint64_t& lastOffset);
-        // initializes index data structure to hold @count references
-        void SetReferenceCount(const int& count);
+        void CloseFile(void);
+        bool IsFileOpen(void) const;
+        bool OpenFile(const std::string& filename, const char* mode);
+        bool Seek(const int64_t& position, const int& origin);
+        int64_t Tell(void) const;
  
-    // data members
+    // internal BAI index building methods
+    private:
+        void ClearReferenceEntry(BaiReferenceEntry& refEntry);
+        void SaveAlignmentChunkToBin(BaiBinMap& binMap,
+                                     const uint32_t& currentBin,
+                                     const uint64_t& currentOffset,
+                                     const uint64_t& lastOffset);
+        void SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets,
+                                   const int& alignmentStartPosition,
+                                   const int& alignmentStopPosition,
+                                   const uint64_t& lastOffset);
+
+    // internal random-access methods
+    private:
+        bool AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end);
+        void CalculateCandidateBins(const uint32_t& begin,
+                                    const uint32_t& end,
+                                    std::set<uint16_t>& candidateBins);
+        bool CalculateCandidateOffsets(const BaiReferenceSummary& refSummary,
+                                       const uint64_t& minOffset,
+                                       std::set<uint16_t>& candidateBins,
+                                       std::vector<int64_t>& offsets);
+        uint64_t CalculateMinOffset(const BaiReferenceSummary& refSummary, const uint32_t& begin);
+        bool GetOffsets(const BamRegion& region, std::vector<int64_t>& offsets);
+        uint64_t LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index);
+
+    // internal BAI summary (create/load) methods
+    private:
+        void ReserveForSummary(const int& numReferences);
+        void SaveBinsSummary(const int& refId, const int& numBins);
+        void SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets);
+        bool SkipBins(const int& numBins);
+        bool SkipLinearOffsets(const int& numLinearOffsets);
+        bool SummarizeBins(BaiReferenceSummary& refSummary);
+        bool SummarizeIndexFile(void);
+        bool SummarizeLinearOffsets(BaiReferenceSummary& refSummary);
+        bool SummarizeReference(BaiReferenceSummary& refSummary);
+
+    // internal BAI full index input methods
+    private:
+        bool ReadBinID(uint32_t& binId);
+        bool ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks);
+        bool ReadIntoBuffer(const unsigned int& bytesRequested);
+        bool ReadLinearOffset(uint64_t& linearOffset);
+        bool ReadNumAlignmentChunks(int& numAlignmentChunks);
+        bool ReadNumBins(int& numBins);
+        bool ReadNumLinearOffsets(int& numLinearOffsets);
+        bool ReadNumReferences(int& numReferences);
+
+    // internal BAI full index output methods
      private:
+        void MergeAlignmentChunks(BaiAlignmentChunkVector& chunks);
+        void SortLinearOffsets(BaiLinearOffsetVector& linearOffsets);
+        bool WriteAlignmentChunk(const BaiAlignmentChunk& chunk);
+        bool WriteAlignmentChunks(BaiAlignmentChunkVector& chunks);
+        bool WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks);
+        bool WriteBins(const int& refId, BaiBinMap& bins);
+        bool WriteHeader(void);
+        bool WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets);
+        bool WriteReferenceEntry(BaiReferenceEntry& refEntry);
  
-        BamStandardIndexData m_indexData;
-        off_t m_dataBeginOffset;
-        bool  m_hasFullDataCache;
+    // data members
+    private:
+        FILE* m_indexStream;
          bool  m_isBigEndian;
+        BamIndex::IndexCacheMode m_cacheMode;
+        BaiFileSummary m_indexFileSummary;
+
+        // our input buffer
+        char* m_buffer;
+        unsigned int m_bufferLength;
+
+    // static methods
+    private:
+        // checks if the buffer is large enough to accomodate the requested size
+        static void CheckBufferSize(char*& buffer,
+                                    unsigned int& bufferLength,
+                                    const unsigned int& requestedBytes);
+        // checks if the buffer is large enough to accomodate the requested size
+        static void CheckBufferSize(unsigned char*& buffer,
+                                    unsigned int& bufferLength,
+                                    const unsigned int& requestedBytes);
+    // static constants
+    private:
+        static const int MAX_BIN;
+        static const int BAM_LIDX_SHIFT;
+        static const std::string BAI_EXTENSION;
+        static const char* const BAI_MAGIC;
+        static const int SIZEOF_ALIGNMENTCHUNK;
+        static const int SIZEOF_BINCORE;
+        static const int SIZEOF_LINEAROFFSET;
  };
  
  } // namespace Internal
diff --git a/src/api/internal/BamToolsIndex_p.cpp b/src/api/internal/BamToolsIndex_p.cpp

index 954400eaebe421ae6794af91777b1e4e6608bfd8..7e6e4cacbe536b51f83fdda19ec404f285c265a4 100644 (file)
--- a/src/api/internal/BamToolsIndex_p.cpp
+++ b/src/api/internal/BamToolsIndex_p.cpp
@@ -3,13 +3,12 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 19 January 2011 (DB)
+// Last modified: 5 April 2011 (DB)
  // ---------------------------------------------------------------------------
  // Provides index operations for the BamTools index format (".bti")
  // ***************************************************************************
  
  #include <api/BamAlignment.h>
-#include <api/BamReader.h>
  #include <api/internal/BamReader_p.h>
  #include <api/internal/BamToolsIndex_p.h>
  #include <api/internal/BgzfStream_p.h>
@@ -24,11 +23,18 @@ using namespace BamTools::Internal;
  #include <map>
  using namespace std;
  
-BamToolsIndex::BamToolsIndex(void)
-    : BamIndex()
-    , m_blockSize(1000)
-    , m_dataBeginOffset(0)
-    , m_hasFullDataCache(false)
+// static BamToolsIndex constants
+const int BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000;
+const string BamToolsIndex::BTI_EXTENSION     = ".bti";
+const char* const BamToolsIndex::BTI_MAGIC    = "BTI\1";
+const int BamToolsIndex::SIZEOF_BLOCK         = sizeof(int32_t)*2 + sizeof(int64_t);
+
+// ctor
+BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader)
+    : BamIndex(reader)
+    , m_indexStream(0)
+    , m_cacheMode(BamIndex::LimitedIndexCaching)
+    , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH)
      , m_inputVersion(0)
      , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
  {
@@ -37,33 +43,123 @@ BamToolsIndex::BamToolsIndex(void)
  
  // dtor
  BamToolsIndex::~BamToolsIndex(void) {
-    ClearAllData();
+    CloseFile();
  }
  
-// creates index data (in-memory) from @reader data
-bool BamToolsIndex::Build(Internal::BamReaderPrivate* reader) {
+bool BamToolsIndex::CheckMagicNumber(void) {
+
+    // check 'magic number' to see if file is BTI index
+    char magic[4];
+    size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream);
+    if ( elementsRead != 4 ) {
+        cerr << "BamToolsIndex ERROR: could not read format 'magic' number" << endl;
+        return false;
+    }
+
+    if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 ) {
+        cerr << "BamToolsIndex ERROR: invalid format" << endl;
+        return false;
+    }
+
+    // otherwise ok
+    return true;
+}
+
+// check index file version, return true if OK
+bool BamToolsIndex::CheckVersion(void) {
+
+    // read version from file
+    size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream);
+    if ( elementsRead != 1 ) return false;
+    if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
+
+    // if version is negative, or zero
+    if ( m_inputVersion <= 0 ) {
+        cerr << "BamToolsIndex ERROR: could not load index file: invalid version."
+             << endl;
+        return false;
+    }
+
+    // if version is newer than can be supported by this version of bamtools
+    else if ( m_inputVersion > m_outputVersion ) {
+        cerr << "BamToolsIndex ERROR: could not load index file. This version of BamTools does not recognize new index file version"
+             << endl
+             << "Please update BamTools to a more recent version to support this index file."
+             << endl;
+        return false;
+    }
+
+    // ------------------------------------------------------------------
+    // check for deprecated, unsupported versions
+    // (typically whose format did not accomodate a particular bug fix)
  
-    // skip if invalid reader
-    if ( reader == 0 )
+    else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_0 ) {
+        cerr << "BamToolsIndex ERROR: could not load index file. This version of the index contains a bug related to accessing data near reference ends."
+             << endl << endl
+             << "Please run 'bamtools index -bti -in yourData.bam' to generate an up-to-date, fixed BTI file."
+             << endl << endl;
          return false;
+    }
  
-    // skip if reader's BgzfStream is invalid or not open
-    BgzfStream* bgzfStream = reader->Stream();
-    if ( bgzfStream == 0 || !bgzfStream->IsOpen )
+    else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_1 ) {
+        cerr << "BamToolsIndex ERROR: could not load index file. This version of the index contains a bug related to handling empty references."
+             << endl << endl
+             << "Please run 'bamtools index -bti -in yourData.bam' to generate an up-to-date, fixed BTI file."
+             << endl << endl;
          return false;
+    }
+
+    // otherwise ok
+    else return true;
+}
+
+void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) {
+    refEntry.ID = -1;
+    refEntry.Blocks.clear();
+}
+
+void BamToolsIndex::CloseFile(void) {
+    if ( IsFileOpen() )
+        fclose(m_indexStream);
+    m_indexFileSummary.clear();
+}
+
+// builds index from associated BAM file & writes out to index file
+bool BamToolsIndex::Create(void) {
+
+    // return false if BamReader is invalid or not open
+    if ( m_reader == 0 || !m_reader->IsOpen() ) {
+        cerr << "BamToolsIndex ERROR: BamReader is not open"
+             << ", aborting index creation" << endl;
+        return false;
+    }
+
+    // rewind BamReader
+    if ( !m_reader->Rewind() ) {
+        cerr << "BamToolsIndex ERROR: could not rewind BamReader to create index"
+             << ", aborting index creation" << endl;
+        return false;
+    }
+
+    // open new index file (read & write)
+    string indexFilename = m_reader->Filename() + Extension();
+    if ( !OpenFile(indexFilename, "w+b") ) {
+        cerr << "BamStandardIndex ERROR: could not open ouput index file " << indexFilename
+             << ", aborting index creation" << endl;
+        return false;
+    }
  
-    // move reader's file pointer to beginning of alignments
-    if ( !reader->Rewind() ) return false;
+    // initialize BtiFileSummary with number of references
+    const int& numReferences = m_reader->GetReferenceCount();
+    InitializeFileSummary(numReferences);
  
-    // initialize index data structure with space for all references
-    const int numReferences = reader->GetReferenceCount();
-    m_indexData.clear();
-    m_hasFullDataCache = false;
-    SetReferenceCount(numReferences);
+    // initialize output file
+    bool createdOk = true;
+    createdOk &= WriteHeader();
  
-    // set up counters and markers
+    // index building markers
      int32_t currentBlockCount      = 0;
-    int64_t currentAlignmentOffset = bgzfStream->Tell();
+    int64_t currentAlignmentOffset = m_reader->Tell();
      int32_t blockRefId             = 0;
      int32_t blockMaxEndPosition    = 0;
      int64_t blockStartOffset       = currentAlignmentOffset;
@@ -71,16 +167,21 @@ bool BamToolsIndex::Build(Internal::BamReaderPrivate* reader) {
  
      // plow through alignments, storing index entries
      BamAlignment al;
-    while ( reader->LoadNextAlignment(al) ) {
+    BtiReferenceEntry refEntry;
+    while ( m_reader->LoadNextAlignment(al) ) {
  
          // if block contains data (not the first time through) AND alignment is on a new reference
          if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
  
-            // store previous data
-            BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
-            SaveOffsetEntry(blockRefId, entry);
+            // store previous BTI block data in reference entry
+            BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+            refEntry.Blocks.push_back(block);
  
-            // intialize new block for current alignment's reference
+            // write reference entry, then clear
+            createdOk &= WriteReferenceEntry(refEntry);
+            ClearReferenceEntry(refEntry);
+
+            // update markers
              currentBlockCount   = 0;
              blockMaxEndPosition = al.GetEndPosition();
              blockStartOffset    = currentAlignmentOffset;
@@ -102,420 +203,329 @@ bool BamToolsIndex::Build(Internal::BamReaderPrivate* reader) {
  
          // if block is full, get offset for next block, reset currentBlockCount
          if ( currentBlockCount == m_blockSize ) {
-            BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
-            SaveOffsetEntry(blockRefId, entry);
-            blockStartOffset  = bgzfStream->Tell();
+
+            // store previous block data in reference entry
+            BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+            refEntry.Blocks.push_back(block);
+
+            // update markers
+            blockStartOffset  = m_reader->Tell();
              currentBlockCount = 0;
          }
  
          // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
          // necessary because we won't know if this next alignment is on a new reference until we actually read it
-        currentAlignmentOffset = bgzfStream->Tell();
-    }
-
-    // store final block with data
-    BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
-    SaveOffsetEntry(blockRefId, entry);
-
-    // set flag
-    m_hasFullDataCache = true;
-
-    // return success/failure of rewind
-    return reader->Rewind();
-}
-
-// check index file magic number, return true if OK
-bool BamToolsIndex::CheckMagicNumber(void) {
-
-    // see if index is valid BAM index
-    char magic[4];
-    size_t elementsRead = fread(magic, 1, 4, m_indexStream);
-    if ( elementsRead != 4 ) return false;
-    if ( strncmp(magic, "BTI\1", 4) != 0 ) {
-        fprintf(stderr, "BamToolsIndex ERROR: could not load index file - invalid magic number.\n");
-        return false;
-    }
-
-    // otherwise ok
-    return true;
-}
-
-// check index file version, return true if OK
-bool BamToolsIndex::CheckVersion(void) {
-
-    // read version from file
-    size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream);
-    if ( elementsRead != 1 ) return false;
-    if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
-
-    // if version is negative, or zero
-    if ( m_inputVersion <= 0 ) {
-        fprintf(stderr, "BamToolsIndex ERROR: could not load index file - invalid version.\n");
-        return false;
+        currentAlignmentOffset = m_reader->Tell();
      }
  
-    // if version is newer than can be supported by this version of bamtools
-    else if ( m_inputVersion > m_outputVersion ) {
-        fprintf(stderr, "BamToolsIndex ERROR: could not load index file - this version of BamTools does not recognize new index file version.\n");
-        fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n");
-        return false;
-    }
+    // store last BTI block data in reference entry
+    BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+    refEntry.Blocks.push_back(block);
  
-    // ------------------------------------------------------------------
-    // check for deprecated, unsupported versions
-    // (typically whose format did not accomodate a particular bug fix)
+    // write last reference entry, then clear
+    createdOk &= WriteReferenceEntry(refEntry);
+    ClearReferenceEntry(refEntry);
  
-    else if ( (Version)m_inputVersion == BTI_1_0 ) {
-        fprintf(stderr, "BamToolsIndex ERROR: could not load index file - this version of the index contains a bug related to accessing data near reference ends.\n");
-        fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date, fixed BTI file.\n\n");
-        return false;
-    }
+    // rewind reader & return result
+    createdOk &= m_reader->Rewind();
  
-    else if ( (Version)m_inputVersion == BTI_1_1 ) {
-        fprintf(stderr, "BamToolsIndex ERROR: could not load index file - this version of the index contains a bug related to handling empty references.\n");
-        fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date, fixed BTI file.\n\n");
-        return false;
-    }
-
-    // otherwise ok
-    else return true;
+    // return result
+    return createdOk;
  }
  
-// clear all current index offset data in memory
-void BamToolsIndex::ClearAllData(void) {
-    BamToolsIndexData::const_iterator indexIter = m_indexData.begin();
-    BamToolsIndexData::const_iterator indexEnd  = m_indexData.end();
-    for ( ; indexIter != indexEnd; ++indexIter ) {
-        const int& refId = (*indexIter).first;
-        ClearReferenceOffsets(refId);
-    }
+// returns format's file extension
+const std::string BamToolsIndex::Extension(void) {
+    return BamToolsIndex::BTI_EXTENSION;
  }
  
-// clear all index offset data for desired reference
-void BamToolsIndex::ClearReferenceOffsets(const int& refId) {
-    if ( m_indexData.find(refId) == m_indexData.end() ) return;
-    vector<BamToolsIndexEntry>& offsets = m_indexData[refId].Offsets;
-    offsets.clear();
-    m_hasFullDataCache = false;
-}
-
-// return file position after header metadata
-off_t BamToolsIndex::DataBeginOffset(void) const {
-    return m_dataBeginOffset;
-}
-
-// calculate BAM file offset for desired region
-// return true if no error (*NOT* equivalent to "has alignments or valid offset")
-//   check @hasAlignmentsInRegion to determine this status
-// @region - target region
-// @offset - resulting seek target
-// @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
-// N.B. - ignores isRightBoundSpecified
  bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
  
-    // return false if leftBound refID is not found in index data
-    BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID);
-    if ( indexIter == m_indexData.end() )
-        return false;
-
-    // load index data for region if not already cached
-    if ( !IsDataLoaded(region.LeftRefID) ) {
-
-        bool loadedOk = true;
-        loadedOk &= SkipToReference(region.LeftRefID);
-        loadedOk &= LoadReference(region.LeftRefID);
-        if ( !loadedOk ) return false;
-    }
-
-    // localize index data for this reference (& sanity check that data actually exists)
-    indexIter = m_indexData.find(region.LeftRefID);
-    if ( indexIter == m_indexData.end() )
+    // return false ref ID is not a valid index in file summary data
+    if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
          return false;
  
-    const vector<BamToolsIndexEntry>& referenceOffsets = (*indexIter).second.Offsets;
-    if ( referenceOffsets.empty() )
+    // retrieve reference index data for left bound reference
+    BtiReferenceEntry refEntry(region.LeftRefID);
+    if ( !ReadReferenceEntry(refEntry) ) {
+        cerr << "BamToolsIndex ERROR: could not retrieve index data from BTI file" << endl;
          return false;
+    }
  
-    // -------------------------------------------------------
-    // calculate nearest index to jump to
-
-    // save first offset
-    offset = (*referenceOffsets.begin()).StartOffset;
+    // TODO: can this next loop be sped up?
+    // Binary search for overlap instead of linear search perhaps.
  
-    // iterate over offsets entries on this reference
-    vector<BamToolsIndexEntry>::const_iterator offsetIter = referenceOffsets.begin();
-    vector<BamToolsIndexEntry>::const_iterator offsetEnd  = referenceOffsets.end();
-    for ( ; offsetIter != offsetEnd; ++offsetIter ) {
-        const BamToolsIndexEntry& entry = (*offsetIter);
+    // iterate over blocks on this reference
+    BtiBlockVector::const_iterator blockIter = refEntry.Blocks.begin();
+    BtiBlockVector::const_iterator blockEnd  = refEntry.Blocks.end();
+    for ( ; blockIter != blockEnd; ++blockIter ) {
+        const BtiBlock& block = (*blockIter);
  
-        // break if alignment 'entry' overlaps region
-        if ( entry.MaxEndPosition >= region.LeftPosition ) break;
-        offset = (*offsetIter).StartOffset;
+        // store offset & break if block end overlaps region start
+        if ( block.MaxEndPosition >= region.LeftPosition ) {
+            offset = block.StartOffset;
+            break;
+        }
      }
  
-    // set flag based on whether an index entry was found for this region
-    *hasAlignmentsInRegion = ( offsetIter != offsetEnd );
-
-    // if cache mode set to none, dump the data we just loaded
-    if ( m_cacheMode == BamIndex::NoIndexCaching )
-        ClearReferenceOffsets(region.LeftRefID);
+    // sets to false if blocks container is empty, or if no matching block could be found
+    *hasAlignmentsInRegion = ( blockIter != blockEnd );
  
      // return success
      return true;
  }
  
  // returns whether reference has alignments or no
-bool BamToolsIndex::HasAlignments(const int& refId) const {
-    BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId);
-    if ( indexIter == m_indexData.end()) return false;
-    const BamToolsReferenceEntry& refEntry = (*indexIter).second;
-    return refEntry.HasAlignments;
+bool BamToolsIndex::HasAlignments(const int& referenceID) const {
+    if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() )
+        return false;
+    const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID);
+    return ( refSummary.NumBlocks > 0 );
  }
  
-// return true if all index data is cached
-bool BamToolsIndex::HasFullDataCache(void) const {
-    return m_hasFullDataCache;
+void BamToolsIndex::InitializeFileSummary(const int& numReferences) {
+    m_indexFileSummary.clear();
+    for ( int i = 0; i < numReferences; ++i )
+        m_indexFileSummary.push_back( BtiReferenceSummary() );
  }
  
-// returns true if index cache has data for desired reference
-bool BamToolsIndex::IsDataLoaded(const int& refId) const {
-
-    BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId);
-    if ( indexIter == m_indexData.end()) return false;
-    const BamToolsReferenceEntry& refEntry = (*indexIter).second;
-
-    if ( !refEntry.HasAlignments ) return true; // no data period
-
-    // return whether offsets list contains data
-    return !refEntry.Offsets.empty();
+bool BamToolsIndex::IsFileOpen(void) const {
+    return ( m_indexStream != 0 );
  }
  
-// attempts to use index to jump to @region in @reader; returns success/fail
-bool BamToolsIndex::Jump(Internal::BamReaderPrivate* reader,
-                         const BamTools::BamRegion& region,
-                         bool* hasAlignmentsInRegion)
-{
+// attempts to use index data to jump to @region, returns success/fail
+// a "successful" jump indicates no error, but not whether this region has data
+//   * thus, the method sets a flag to indicate whether there are alignments
+//     available after the jump position
+bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) {
+
      // clear flag
      *hasAlignmentsInRegion = false;
  
-    // skip if invalid reader
-    if ( reader == 0 ) return false;
-
-    // skip if reader's BgzfStream is invalid or not open
-    BgzfStream* bgzfStream = reader->Stream();
-    if ( bgzfStream == 0 || !bgzfStream->IsOpen )
+    // skip if invalid reader or not open
+    if ( m_reader == 0 || !m_reader->IsOpen() )
          return false;
  
      // make sure left-bound position is valid
-    const RefVector& references = reader->GetReferenceData();
+    const RefVector& references = m_reader->GetReferenceData();
      if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
          return false;
  
      // calculate nearest offset to jump to
      int64_t offset;
      if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
-        fprintf(stderr, "BamToolsIndex ERROR: could not jump - unable to calculate offset for specified region.\n");
+        cerr << "BamToolsIndex ERROR: could not jump"
+             << ", unable to calculate offset for specified region" << endl;
          return false;
      }
  
      // return success/failure of seek
-    return bgzfStream->Seek(offset);
+    return m_reader->Seek(offset);
  }
  
-// clears index data from all references except the first
-void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) {
-    BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
-    KeepOnlyReferenceOffsets( (*indexBegin).first );
-}
+// loads existing data from file into memory
+bool BamToolsIndex::Load(const std::string& filename) {
  
-// clears index data from all references except the one specified
-void BamToolsIndex::KeepOnlyReferenceOffsets(const int& refId) {
-    BamToolsIndexData::iterator mapIter = m_indexData.begin();
-    BamToolsIndexData::iterator mapEnd  = m_indexData.end();
-    for ( ; mapIter != mapEnd; ++mapIter ) {
-        const int entryRefId = (*mapIter).first;
-        if ( entryRefId != refId )
-            ClearReferenceOffsets(entryRefId);
+    // attempt open index file (read-only)
+    if ( !OpenFile(filename, "rb") ) {
+        cerr << "BamStandardIndex ERROR: could not open input index file " << filename
+             << ", aborting index load" << endl;
+        return false;
      }
+
+    // attempt to load & validate BTI header data
+    if ( !LoadHeader() ) {
+        CloseFile();
+        return false;
+    }
+
+    // attempt to load index file summary, return success/failure
+    return LoadFileSummary();
  }
  
-// load index data for all references, return true if loaded OK
-bool BamToolsIndex::LoadAllReferences(bool saveData) {
+bool BamToolsIndex::LoadFileSummary(void) {
  
-    // skip if data already loaded
-    if ( m_hasFullDataCache ) return true;
+    // load number of reference sequences
+    int numReferences;
+    if ( !LoadNumReferences(numReferences) )
+        return false;
  
-    // read in number of references
-    int32_t numReferences;
-    if ( !LoadReferenceCount(numReferences) ) return false;
-    //SetReferenceCount(numReferences);
+    // initialize file summary data
+    InitializeFileSummary(numReferences);
  
      // iterate over reference entries
      bool loadedOk = true;
-    for ( int i = 0; i < numReferences; ++i )
-        loadedOk &= LoadReference(i, saveData);
-
-    // set flag
-    if ( loadedOk && saveData )
-        m_hasFullDataCache = true;
+    BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin();
+    BtiFileSummary::iterator summaryEnd  = m_indexFileSummary.end();
+    for ( ; summaryIter != summaryEnd; ++summaryIter )
+        loadedOk &= LoadReferenceSummary(*summaryIter);
  
-    // return success/failure of load
+    // return result
      return loadedOk;
  }
  
-// load header data from index file, return true if loaded OK
  bool BamToolsIndex::LoadHeader(void) {
  
-    // check magic number
-    if ( !CheckMagicNumber() ) return false;
+    // if invalid format 'magic number'
+    if ( !CheckMagicNumber() )
+        return false;
  
-    // check BTI version
-    if ( !CheckVersion() ) return false;
+    // if invalid BTI version
+    if ( !CheckVersion() )
+        return false;
  
-    // read in block size
+    // use file's BTI block size to set member variable
      size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream);
-    if ( elementsRead != 1 ) return false;
-
-    // swap endian-ness if necessary
      if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
-
-    // store offset of beginning of data
-    m_dataBeginOffset = ftell64(m_indexStream);
-
-    // return success
-    return true;
+    return ( elementsRead == 1 );
  }
  
-// load a single index entry from file, return true if loaded OK
-// @saveData - save data in memory if true, just read & discard if false
-bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) {
+bool BamToolsIndex::LoadNumBlocks(int& numBlocks) {
+    size_t elementsRead = 0;
+    elementsRead += fread(&numBlocks, sizeof(numBlocks), 1, m_indexStream);
+    if ( m_isBigEndian ) SwapEndian_32(numBlocks);
+    return ( elementsRead == 1 );
+}
  
-    // read in index entry data members
+bool BamToolsIndex::LoadNumReferences(int& numReferences) {
      size_t elementsRead = 0;
-    BamToolsIndexEntry entry;
-    elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream);
-    elementsRead += fread(&entry.StartOffset,    sizeof(entry.StartOffset),    1, m_indexStream);
-    elementsRead += fread(&entry.StartPosition,  sizeof(entry.StartPosition),  1, m_indexStream);
-    if ( elementsRead != 3 ) return false;
+    elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
+    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+    return ( elementsRead == 1 );
+}
  
-    // swap endian-ness if necessary
-    if ( m_isBigEndian ) {
-        SwapEndian_32(entry.MaxEndPosition);
-        SwapEndian_64(entry.StartOffset);
-        SwapEndian_32(entry.StartPosition);
-    }
+bool BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) {
  
-    // save data
-    if ( saveData )
-        SaveOffsetEntry(refId, entry);
+    // load number of blocks
+    int numBlocks;
+    if ( !LoadNumBlocks(numBlocks) )
+        return false;
  
-    // return success/failure of load
-    return true;
+    // store block summary data for this reference
+    refSummary.NumBlocks = numBlocks;
+    refSummary.FirstBlockFilePosition = Tell();
+
+    // skip blocks in index file (and return status)
+    return SkipBlocks(numBlocks);
  }
  
-// load a single reference from file, return true if loaded OK
-// @saveData - save data in memory if true, just read & discard if false
-bool BamToolsIndex::LoadFirstReference(bool saveData) {
-    BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
-    return LoadReference( (*indexBegin).first, saveData );
+bool BamToolsIndex::OpenFile(const std::string& filename, const char* mode) {
+
+    // make sure any previous index file is closed
+    CloseFile();
+
+    // attempt to open file
+    m_indexStream = fopen(filename.c_str(), mode);
+    return IsFileOpen();
  }
  
-// load a single reference from file, return true if loaded OK
-// @saveData - save data in memory if true, just read & discard if false
-bool BamToolsIndex::LoadReference(const int& refId, bool saveData) {
+bool BamToolsIndex::ReadBlock(BtiBlock& block) {
  
-    // read in number of offsets for this reference
-    uint32_t numOffsets;
-    size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream);
-    if ( elementsRead != 1 ) return false;
+    // read in block data members
+    size_t elementsRead = 0;
+    elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, m_indexStream);
+    elementsRead += fread(&block.StartOffset,    sizeof(block.StartOffset),    1, m_indexStream);
+    elementsRead += fread(&block.StartPosition,  sizeof(block.StartPosition),  1, m_indexStream);
  
      // swap endian-ness if necessary
-    if ( m_isBigEndian ) SwapEndian_32(numOffsets);
+    if ( m_isBigEndian ) {
+        SwapEndian_32(block.MaxEndPosition);
+        SwapEndian_64(block.StartOffset);
+        SwapEndian_32(block.StartPosition);
+    }
  
-    // initialize offsets container for this reference
-    SetOffsetCount(refId, (int)numOffsets);
+    // return success/failure
+    return ( elementsRead == 3 );
+}
  
-    // iterate over offset entries
-    for ( unsigned int j = 0; j < numOffsets; ++j )
-        LoadIndexEntry(refId, saveData);
+bool BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) {
  
-    // return success/failure of load
-    return true;
-}
+    // prep blocks container
+    blocks.clear();
+    blocks.reserve(refSummary.NumBlocks);
  
-// loads number of references, return true if loaded OK
-bool BamToolsIndex::LoadReferenceCount(int& numReferences) {
+    // skip to first block entry
+    if ( !Seek( refSummary.FirstBlockFilePosition, SEEK_SET ) ) {
+        cerr << "BamToolsIndex ERROR: could not seek to position "
+             << refSummary.FirstBlockFilePosition << endl;
+        return false;
+    }
  
-    size_t elementsRead = 0;
+    // read & store block entries
+    bool readOk = true;
+    BtiBlock block;
+    for ( int i = 0; i < refSummary.NumBlocks; ++i ) {
+        readOk &= ReadBlock(block);
+        blocks.push_back(block);
+    }
+    return readOk;
+}
  
-    // read reference count
-    elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
-    if ( elementsRead != 1 ) return false;
+bool BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) {
  
-    // swap endian-ness if necessary
-    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+    // return false if refId not valid index in file summary structure
+    if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() )
+        return false;
  
-    // return success
-    return true;
+    // use index summary to assist reading the reference's BTI blocks
+    const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID);
+    return ReadBlocks(refSummary, refEntry.Blocks);
  }
  
-// saves an index offset entry in memory
-void BamToolsIndex::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) {
-    BamToolsReferenceEntry& refEntry = m_indexData[refId];
-    refEntry.HasAlignments = true;
-    refEntry.Offsets.push_back(entry);
+bool BamToolsIndex::Seek(const int64_t& position, const int& origin) {
+    return ( fseek64(m_indexStream, position, origin) == 0 );
  }
  
-// pre-allocates size for offset vector
-void BamToolsIndex::SetOffsetCount(const int& refId, const int& offsetCount) {
-    BamToolsReferenceEntry& refEntry = m_indexData[refId];
-    refEntry.Offsets.reserve(offsetCount);
-    refEntry.HasAlignments = ( offsetCount > 0);
+// change the index caching behavior
+void BamToolsIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) {
+    m_cacheMode = mode;
+    // do nothing else here ? cache mode will be ignored from now on, most likely
  }
  
-// initializes index data structure to hold @count references
-void BamToolsIndex::SetReferenceCount(const int& count) {
-    for ( int i = 0; i < count; ++i )
-        m_indexData[i].HasAlignments = false;
+bool BamToolsIndex::SkipBlocks(const int& numBlocks) {
+    return Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR );
  }
  
-// position file pointer to first reference begin, return true if skipped OK
-bool BamToolsIndex::SkipToFirstReference(void) {
-    BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
-    return SkipToReference( (*indexBegin).first );
+int64_t BamToolsIndex::Tell(void) const {
+    return ftell64(m_indexStream);
  }
  
-// position file pointer to desired reference begin, return true if skipped OK
-bool BamToolsIndex::SkipToReference(const int& refId) {
-
-    // attempt rewind
-    if ( !Rewind() ) return false;
+bool BamToolsIndex::WriteBlock(const BtiBlock& block) {
  
-    // read in number of references
-    int32_t numReferences;
-    size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
-    if ( elementsRead != 1 ) return false;
-    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+    // copy entry data
+    int32_t maxEndPosition = block.MaxEndPosition;
+    int64_t startOffset    = block.StartOffset;
+    int32_t startPosition  = block.StartPosition;
  
-    // iterate over reference entries
-    bool skippedOk = true;
-    int currentRefId = 0;
-    while (currentRefId != refId) {
-        skippedOk &= LoadReference(currentRefId, false);
-        ++currentRefId;
+    // swap endian-ness if necessary
+    if ( m_isBigEndian ) {
+        SwapEndian_32(maxEndPosition);
+        SwapEndian_64(startOffset);
+        SwapEndian_32(startPosition);
      }
  
-    // return success/failure of skip
-    return skippedOk;
+    // write the reference index entry
+    size_t elementsWritten = 0;
+    elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream);
+    elementsWritten += fwrite(&startOffset,    sizeof(startOffset),    1, m_indexStream);
+    elementsWritten += fwrite(&startPosition,  sizeof(startPosition),  1, m_indexStream);
+    return ( elementsWritten == 3 );
+}
+
+bool BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) {
+    bool writtenOk = true;
+    BtiBlockVector::const_iterator blockIter = blocks.begin();
+    BtiBlockVector::const_iterator blockEnd  = blocks.end();
+    for ( ; blockIter != blockEnd; ++blockIter )
+        writtenOk &= WriteBlock(*blockIter);
+    return writtenOk;
  }
  
-// write header to new index file
  bool BamToolsIndex::WriteHeader(void) {
  
      size_t elementsWritten = 0;
  
      // write BTI index format 'magic number'
-    elementsWritten += fwrite("BTI\1", 1, 4, m_indexStream);
+    elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, m_indexStream);
  
      // write BTI index format version
      int32_t currentVersion = (int32_t)m_outputVersion;
@@ -527,76 +537,28 @@ bool BamToolsIndex::WriteHeader(void) {
      if ( m_isBigEndian ) SwapEndian_32(blockSize);
      elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream);
  
-    // store offset of beginning of data
-    m_dataBeginOffset = ftell64(m_indexStream);
-
-    // return success/failure of write
-    return ( elementsWritten == 6 );
-}
-
-// write index data for all references to new index file
-bool BamToolsIndex::WriteAllReferences(void) {
-
-    size_t elementsWritten = 0;
-
      // write number of references
-    int32_t numReferences = (int32_t)m_indexData.size();
+    int32_t numReferences = m_indexFileSummary.size();
+    cerr << "Writing numReferences = " << numReferences << endl;
      if ( m_isBigEndian ) SwapEndian_32(numReferences);
      elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream);
-    if ( elementsWritten != 1 ) return false;
-
-    // iterate through references in index
-    bool refOk = true;
-    BamToolsIndexData::const_iterator refIter = m_indexData.begin();
-    BamToolsIndexData::const_iterator refEnd  = m_indexData.end();
-    for ( ; refIter != refEnd; ++refIter )
-        refOk &= WriteReferenceEntry( (*refIter).second );
  
-    // return success/fail
-    return refOk;
+    // return success/failure of write
+    return ( elementsWritten == 7 );
  }
  
-// write current reference index data to new index file
-bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) {
+bool BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) {
  
      size_t elementsWritten = 0;
  
-    // write number of offsets listed for this reference
-    uint32_t numOffsets = refEntry.Offsets.size();
-    if ( m_isBigEndian ) SwapEndian_32(numOffsets);
-    elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_indexStream);
-    if ( elementsWritten != 1 ) return false;
+    // write number of blocks this reference
+    uint32_t numBlocks = refEntry.Blocks.size();
+    if ( m_isBigEndian ) SwapEndian_32(numBlocks);
+    elementsWritten += fwrite(&numBlocks, sizeof(numBlocks), 1, m_indexStream);
  
-    // iterate over offset entries
-    bool entriesOk = true;
-    vector<BamToolsIndexEntry>::const_iterator offsetIter = refEntry.Offsets.begin();
-    vector<BamToolsIndexEntry>::const_iterator offsetEnd  = refEntry.Offsets.end();
-    for ( ; offsetIter != offsetEnd; ++offsetIter )
-        entriesOk &= WriteIndexEntry( (*offsetIter) );
+    // write actual block entries
+    const bool blocksOk = WriteBlocks(refEntry.Blocks);
  
      // return success/fail
-    return entriesOk;
-}
-
-// write current index offset entry to new index file
-bool BamToolsIndex::WriteIndexEntry(const BamToolsIndexEntry& entry) {
-
-    // copy entry data
-    int32_t maxEndPosition = entry.MaxEndPosition;
-    int64_t startOffset    = entry.StartOffset;
-    int32_t startPosition  = entry.StartPosition;
-
-    // swap endian-ness if necessary
-    if ( m_isBigEndian ) {
-        SwapEndian_32(maxEndPosition);
-        SwapEndian_64(startOffset);
-        SwapEndian_32(startPosition);
-    }
-
-    // write the reference index entry
-    size_t elementsWritten = 0;
-    elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream);
-    elementsWritten += fwrite(&startOffset,    sizeof(startOffset),    1, m_indexStream);
-    elementsWritten += fwrite(&startPosition,  sizeof(startPosition),  1, m_indexStream);
-    return ( elementsWritten == 3 );
+    return ( elementsWritten == 1) && blocksOk;
  }
diff --git a/src/api/internal/BamToolsIndex_p.h b/src/api/internal/BamToolsIndex_p.h

index ee5abbc8ad730bd86bab43cfd01d8a12afb9e095..16aef8c9afd85e23ea1b946d0009b9274034e5e0 100644 (file)
--- a/src/api/internal/BamToolsIndex_p.h
+++ b/src/api/internal/BamToolsIndex_p.h
@@ -3,7 +3,7 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 19 January 2011 (DB)
+// Last modified: 5 April 2011 (DB)
  // ---------------------------------------------------------------------------
  // Provides index operations for the BamTools index format (".bti")
  // ***************************************************************************
@@ -30,11 +30,8 @@
  namespace BamTools {
  namespace Internal {
  
-// BTI constants
-const std::string BTI_EXTENSION = ".bti";
-
-// individual index offset entry
-struct BamToolsIndexEntry {
+// contains data for each 'block' in a BTI index
+struct BtiBlock {
  
      // data members
      int32_t MaxEndPosition;
@@ -42,30 +39,48 @@ struct BamToolsIndexEntry {
      int32_t StartPosition;
  
      // ctor
-    BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
-                       const int64_t& startOffset    = 0,
-                       const int32_t& startPosition  = 0)
+    BtiBlock(const int32_t& maxEndPosition = 0,
+             const int64_t& startOffset    = 0,
+             const int32_t& startPosition  = 0)
          : MaxEndPosition(maxEndPosition)
          , StartOffset(startOffset)
          , StartPosition(startPosition)
      { }
  };
  
-// reference index entry
-struct BamToolsReferenceEntry {
+// convenience typedef for describing a a list of BTI blocks on a reference
+typedef std::vector<BtiBlock> BtiBlockVector;
+
+// contains all fields necessary for building, loading, & writing
+// full BTI index data for a single reference
+struct BtiReferenceEntry {
  
      // data members
-    bool HasAlignments;
-    std::vector<BamToolsIndexEntry> Offsets;
+    int32_t ID;
+    BtiBlockVector Blocks;
  
      // ctor
-    BamToolsReferenceEntry(void)
-        : HasAlignments(false)
+    BtiReferenceEntry(const int& id = -1)
+        : ID(id)
      { }
  };
  
-// the actual index data structure
-typedef std::map<int, BamToolsReferenceEntry> BamToolsIndexData;
+// provides (persistent) summary of BtiReferenceEntry's index data
+struct BtiReferenceSummary {
+
+    // data members
+    int NumBlocks;
+    uint64_t FirstBlockFilePosition;
+
+    // ctor
+    BtiReferenceSummary(void)
+        : NumBlocks(0)
+        , FirstBlockFilePosition(0)
+    { }
+};
+
+// convenience typedef for describing a full BTI index file summary
+typedef std::vector<BtiReferenceSummary> BtiFileSummary;
  
  class BamToolsIndex : public BamIndex {
  
@@ -84,109 +99,87 @@ class BamToolsIndex : public BamIndex {
                   , BTI_1_2
                   };
  
-
      // ctor & dtor
      public:
-        BamToolsIndex(void);
+        BamToolsIndex(Internal::BamReaderPrivate* reader);
          ~BamToolsIndex(void);
  
-    // interface (implements BamIndex virtual methods)
+    // BamIndex implementation
      public:
-        // creates index data (in-memory) from @reader data
-        bool Build(Internal::BamReaderPrivate* reader);
-        // returns supported file extension
-        const std::string Extension(void) { return BTI_EXTENSION; }
+        // builds index from associated BAM file & writes out to index file
+        bool Create(void);
          // returns whether reference has alignments or no
          bool HasAlignments(const int& referenceID) const;
-        // attempts to use index to jump to @region in @reader; returns success/fail
+        // attempts to use index data to jump to @region, returns success/fail
          // a "successful" jump indicates no error, but not whether this region has data
          //   * thus, the method sets a flag to indicate whether there are alignments
          //     available after the jump position
-        bool Jump(Internal::BamReaderPrivate* reader,
-                  const BamTools::BamRegion& region,
-                  bool *hasAlignmentsInRegion);
-
-    public:
-        // clear all current index offset data in memory
-        void ClearAllData(void);
-        // return file position after header metadata
-        off_t DataBeginOffset(void) const;
-        // return true if all index data is cached
-        bool HasFullDataCache(void) const;
-        // clears index data from all references except the first
-        void KeepOnlyFirstReferenceOffsets(void);
-        // load index data for all references, return true if loaded OK
-        // @saveData - save data in memory if true, just read & discard if false
-        bool LoadAllReferences(bool saveData = true);
-        // load first reference from file, return true if loaded OK
-        // @saveData - save data in memory if true, just read & discard if false
-        bool LoadFirstReference(bool saveData = true);
-        // load header data from index file, return true if loaded OK
-        bool LoadHeader(void);
-        // position file pointer to first reference begin, return true if skipped OK
-        bool SkipToFirstReference(void);
-        // write index reference data
-        bool WriteAllReferences(void);
-        // write index header data
-        bool WriteHeader(void);
-
-    // internal methods
+        bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
+        // loads existing data from file into memory
+        bool Load(const std::string& filename);
+        // change the index caching behavior
+        void SetCacheMode(const BamIndex::IndexCacheMode& mode);
      public:
+        // returns format's file extension
+        static const std::string Extension(void);
  
-        // -----------------------
-        // index file operations
-
-        // check index file magic number, return true if OK
+    // internal file ops
+    private:
          bool CheckMagicNumber(void);
-        // check index file version, return true if OK
          bool CheckVersion(void);
-        // load a single index entry from file, return true if loaded OK
-        // @saveData - save data in memory if true, just read & discard if false
-        bool LoadIndexEntry(const int& refId, bool saveData = true);
-        // load a single reference from file, return true if loaded OK
-        // @saveData - save data in memory if true, just read & discard if false
-        bool LoadReference(const int& refId, bool saveData = true);
-        // loads number of references, return true if loaded OK
-        bool LoadReferenceCount(int& numReferences);
-        // position file pointer to desired reference begin, return true if skipped OK
-        bool SkipToReference(const int& refId);
-        // write current reference index data to new index file
-        bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry);
-        // write current index offset entry to new index file
-        bool WriteIndexEntry(const BamToolsIndexEntry& entry);
-
-        // -----------------------
-        // index data operations
-
-        // clear all index offset data for desired reference
-        void ClearReferenceOffsets(const int& refId);
-        // calculate BAM file offset for desired region
-        // return true if no error (*NOT* equivalent to "has alignments or valid offset")
-        //   check @hasAlignmentsInRegion to determine this status
-        // @region - target region
-        // @offset - resulting seek target
-        // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
+        void CloseFile(void);
+        bool IsFileOpen(void) const;
+        bool OpenFile(const std::string& filename, const char* mode);
+        bool Seek(const int64_t& position, const int& origin);
+        int64_t Tell(void) const;
+
+    // internal BTI index building methods
+    private:
+        void ClearReferenceEntry(BtiReferenceEntry& refEntry);
+
+    // internal random-access methods
+    private:
          bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
-        // returns true if index cache has data for desired reference
-        bool IsDataLoaded(const int& refId) const;
-        // clears index data from all references except the one specified
-        void KeepOnlyReferenceOffsets(const int& refId);
-        // saves an index offset entry in memory
-        void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry);
-        // pre-allocates size for offset vector
-        void SetOffsetCount(const int& refId, const int& offsetCount);
-        // initializes index data structure to hold @count references
-        void SetReferenceCount(const int& count);
+
+    // internal BTI summary data methods
+    private:
+        void InitializeFileSummary(const int& numReferences);
+        bool LoadFileSummary(void);
+        bool LoadHeader(void);
+        bool LoadNumBlocks(int& numBlocks);
+        bool LoadNumReferences(int& numReferences);
+        bool LoadReferenceSummary(BtiReferenceSummary& refSummary);
+        bool SkipBlocks(const int& numBlocks);
+
+    // internal BTI full index input methods
+    private:
+        bool ReadBlock(BtiBlock& block);
+        bool ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks);
+        bool ReadReferenceEntry(BtiReferenceEntry& refEntry);
+
+    // internal BTI full index output methods
+    private:
+        bool WriteBlock(const BtiBlock& block);
+        bool WriteBlocks(const BtiBlockVector& blocks);
+        bool WriteHeader(void);
+        bool WriteReferenceEntry(const BtiReferenceEntry& refEntry);
  
      // data members
      private:
-        int32_t           m_blockSize;
-        BamToolsIndexData m_indexData;
-        off_t             m_dataBeginOffset;
-        bool              m_hasFullDataCache;
-        bool              m_isBigEndian;
-        int32_t           m_inputVersion; // Version is serialized as int
-        Version           m_outputVersion;
+        FILE* m_indexStream;
+        bool  m_isBigEndian;
+        BamIndex::IndexCacheMode m_cacheMode;
+        BtiFileSummary m_indexFileSummary;
+        int m_blockSize;
+        int32_t m_inputVersion; // Version is serialized as int
+        Version m_outputVersion;
+
+    // static constants
+    private:
+        static const int DEFAULT_BLOCK_LENGTH;
+        static const std::string BTI_EXTENSION;
+        static const char* const BTI_MAGIC;
+        static const int SIZEOF_BLOCK;
  };
  
  } // namespace Internal
diff --git a/src/api/internal/BgzfStream_p.cpp b/src/api/internal/BgzfStream_p.cpp

index fb67799c9079d7e5f8e7aebe759ebde2a5c8ccfc..aba2a0786a095db16c6adf6c8830a3f14bd37f4a 100644 (file)
--- a/src/api/internal/BgzfStream_p.cpp
+++ b/src/api/internal/BgzfStream_p.cpp
@@ -3,7 +3,7 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 21 March 2011(DB)
+// Last modified: 5 April 2011(DB)
  // ---------------------------------------------------------------------------
  // Based on BGZF routines developed at the Broad Institute.
  // Provides the basic functionality for reading & writing BGZF files
@@ -375,7 +375,7 @@ bool BgzfStream::ReadBlock(void) {
  }
  
  // seek to position in BGZF file
-bool BgzfStream::Seek(int64_t position) {
+bool BgzfStream::Seek(const int64_t& position) {
  
      // skip if not open
      if ( !IsOpen ) return false;
@@ -390,12 +390,10 @@ bool BgzfStream::Seek(int64_t position) {
          return false;
      }
  
-    // update block data
+    // update block data & return success
      BlockLength  = 0;
      BlockAddress = blockAddress;
      BlockOffset  = blockOffset;
-
-    // return success
      return true;
  }
  
@@ -404,12 +402,9 @@ void BgzfStream::SetWriteCompressed(bool ok) {
  }
  
  // get file position in BGZF file
-int64_t BgzfStream::Tell(void) {
-
-    // skip if file not open
-    if ( !IsOpen ) return false;
-
-    // otherwise return file pointer position
+int64_t BgzfStream::Tell(void) const {
+    if ( !IsOpen )
+        return 0;
      return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
  }
  
diff --git a/src/api/internal/BgzfStream_p.h b/src/api/internal/BgzfStream_p.h

index 69473e6af01ee7eae01666a264b69159022d43c0..c5e5d4872c0886509be74ca4e3d299f76a678025 100644 (file)
--- a/src/api/internal/BgzfStream_p.h
+++ b/src/api/internal/BgzfStream_p.h
@@ -3,7 +3,7 @@
  // Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
-// Last modified: 24 February 2011(DB)
+// Last modified: 5 April 2011(DB)
  // ---------------------------------------------------------------------------
  // Based on BGZF routines developed at the Broad Institute.
  // Provides the basic functionality for reading & writing BGZF files
@@ -48,11 +48,11 @@ class BgzfStream {
          // reads BGZF data into a byte buffer
          int Read(char* data, const unsigned int dataLength);
          // seek to position in BGZF file
-        bool Seek(int64_t position);
+        bool Seek(const int64_t& position);
          // enable/disable compressed output
          void SetWriteCompressed(bool ok);
          // get file position in BGZF file
-        int64_t Tell(void);
+        int64_t Tell(void) const;
          // writes the supplied data into the BGZF buffer
          unsigned int Write(const char* data, const unsigned int dataLen);
author	derek <derekwbarnett@gmail.com>
	Tue, 5 Apr 2011 16:43:31 +0000 (12:43 -0400)
committer	derek <derekwbarnett@gmail.com>
	Tue, 5 Apr 2011 16:43:31 +0000 (12:43 -0400)
src/api/BamConstants.h		patch \| blob \| history
src/api/BamIndex.cpp	[deleted file]	patch \| blob \| history
src/api/BamIndex.h		patch \| blob \| history
src/api/CMakeLists.txt		patch \| blob \| history
src/api/internal/BamIndexFactory_p.cpp		patch \| blob \| history
src/api/internal/BamIndexFactory_p.h		patch \| blob \| history
src/api/internal/BamMultiReader_p.cpp		patch \| blob \| history
src/api/internal/BamRandomAccessController_p.cpp		patch \| blob \| history
src/api/internal/BamRandomAccessController_p.h		patch \| blob \| history
src/api/internal/BamReader_p.cpp		patch \| blob \| history
src/api/internal/BamReader_p.h		patch \| blob \| history
src/api/internal/BamStandardIndex_p.cpp		patch \| blob \| history
src/api/internal/BamStandardIndex_p.h		patch \| blob \| history
src/api/internal/BamToolsIndex_p.cpp		patch \| blob \| history
src/api/internal/BamToolsIndex_p.h		patch \| blob \| history
src/api/internal/BgzfStream_p.cpp		patch \| blob \| history
src/api/internal/BgzfStream_p.h		patch \| blob \| history