Full update to SVN after combining BamReader and BamWriter into cohesive BamTools...

author barnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>

Wed, 15 Jul 2009 18:02:28 +0000 (18:02 +0000)

committer barnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>

Wed, 15 Jul 2009 18:02:28 +0000 (18:02 +0000)
author barnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Wed, 15 Jul 2009 18:02:28 +0000 (18:02 +0000)
committer barnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Wed, 15 Jul 2009 18:02:28 +0000 (18:02 +0000)
diff --git a/BamAlignment.h b/BamAlignment.h

deleted file mode 100644 (file)

index c69229d..0000000
--- a/BamAlignment.h
+++ /dev/null
@@ -1,176 +0,0 @@
-// BamAlignment.h
-
-// Derek Barnett
-// Marth Lab, Boston College
-// Last modified: 20 March 2009
-
-#ifndef BAMALIGNMENT_H
-#define BAMALIGNMENT_H
-
-#include <string.h>
-#include <stdlib.h>
-
-#ifdef WIN32
-typedef char                 int8_t;
-typedef unsigned char       uint8_t;
-typedef short               int16_t;
-typedef unsigned short     uint16_t;
-typedef int                 int32_t;
-typedef unsigned int       uint32_t;
-typedef long long           int64_t;
-typedef unsigned long long uint64_t;
-#else
-#include <stdint.h>
-#endif
-
-// C++ includes
-#include <string>
-using std::string;
-
-#include <vector>
-using std::vector;
-
-struct CigarOp {
-       uint32_t Length;
-       char     Type;
-};
-
-struct RefData {
-       string       RefName;
-       unsigned int RefLength;
-       bool         RefHasAlignments;
-
-       // constructor
-       RefData(void)
-               : RefLength(0)
-               , RefHasAlignments(false)
-       { }
-};
-
-typedef vector<RefData> RefVector;
-
-struct BamAlignment {
-
-       // queries against alignment flag - see below for further detail
-public:
-       bool IsPaired(void) const            { return ( (AlignmentFlag & PAIRED)        != 0 ); }
-       bool IsProperPair(void) const        { return ( (AlignmentFlag & PROPER_PAIR)   != 0 ); }
-       bool IsMapped(void) const            { return ( (AlignmentFlag & UNMAPPED)      == 0 ); }
-       bool IsMateMapped(void) const        { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
-       bool IsReverseStrand(void) const     { return ( (AlignmentFlag & REVERSE)       != 0 ); }
-       bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE)  != 0 ); }
-       bool IsFirstMate(void) const         { return ( (AlignmentFlag & READ_1)        != 0 ); }
-       bool IsSecondMate(void) const        { return ( (AlignmentFlag & READ_2)        != 0 ); }
-       bool IsPrimaryAlignment(void) const  { return ( (AlignmentFlag & SECONDARY)     == 0 ); }
-       bool IsFailedQC(void) const          { return ( (AlignmentFlag & QC_FAILED)     != 0 ); }
-       bool IsDuplicate(void) const         { return ( (AlignmentFlag & DUPLICATE)     != 0 ); }
-
-       // returns true and assigns the read group if present in the tag data
-       bool GetReadGroup(string& readGroup) const {
-
-               if(TagData.empty()) return false;
-
-               // localize the tag data
-               char* pTagData = (char*)TagData.data();
-               const unsigned int tagDataLen = TagData.size();
-               unsigned int numBytesParsed = 0;
-
-               bool foundReadGroupTag = false;
-               while(numBytesParsed < tagDataLen) {
-
-                       const char* pTagType = pTagData;
-                       const char* pTagStorageType = pTagData + 2;
-                       pTagData       += 3;
-                       numBytesParsed += 3;
-
-                       // check the current tag
-                       if(strncmp(pTagType, "RG", 2) == 0) {
-                               foundReadGroupTag = true;
-                               break;
-                       }
-
-                       // get the storage class and find the next tag
-                       SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed);
-               }
-
-               // return if the read group tag was not present
-               if(!foundReadGroupTag) return false;
-
-               // assign the read group
-               const unsigned int readGroupLen = strlen(pTagData);
-               readGroup.resize(readGroupLen);
-               memcpy((char*)readGroup.data(), pTagData, readGroupLen);
-               return true;
-       }
-
-       // skips to the next tag
-       static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
-               switch(storageType) {
-                       case 'A':
-                       case 'c':
-                       case 'C':
-                               numBytesParsed++;
-                               pTagData++;
-                               break;
-                       case 's':
-                       case 'S':
-                       case 'f':
-                               numBytesParsed += 2;
-                               pTagData       += 2;
-                               break;
-                       case 'i':
-                       case 'I':
-                               numBytesParsed += 4;
-                               pTagData       += 4;
-                               break;
-                       case 'Z':
-                       case 'H':
-                               while(*pTagData) {
-                                       numBytesParsed++;
-                                       pTagData++;
-                               }
-                               break;
-                       default:
-                               printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData);
-                               exit(1);
-               }
-       }
-
-       // data members
-public:
-       string       Name;           // read name
-       unsigned int Length;         // query length
-       string       QueryBases;     // original sequence ( produced from machine )
-       string       AlignedBases;   // aligned sequence ( with indels ) 
-       string       Qualities;      // FASTQ qualities ( still in ASCII characters )
-       string       TagData;        // contains the tag data (accessor methods will pull the requested information out)
-       unsigned int RefID;          // ID for reference sequence
-       unsigned int Position;       // position on reference sequence where alignment starts
-       unsigned int Bin;            // bin in BAM file where this alignment resides
-       unsigned int MapQuality;     // mapping quality 
-       unsigned int AlignmentFlag;  // see above for available queries
-       vector<CigarOp> CigarData;   // vector of CIGAR operations (length & type) )
-       unsigned int MateRefID;      // ID for reference sequence that mate was aligned to
-       unsigned int MatePosition;   // position that mate was aligned to
-       unsigned int InsertSize;     // mate pair insert size
-
-       // alignment flag query constants
-private:
-       enum { PAIRED        = 1,               // Alignment comes from paired-end data
-               PROPER_PAIR   = 2,              // Alignment passed paired-end resolution
-               UNMAPPED      = 4,              // Read is unmapped
-               MATE_UNMAPPED = 8,              // Mate is unmapped
-               REVERSE       = 16,             // Read is on reverse strand
-               MATE_REVERSE  = 32,             // Mate is on reverse strand
-               READ_1        = 64,             // This alignment is mate 1 of pair
-               READ_2        = 128,            // This alignment is mate 2 of pair
-               SECONDARY     = 256,            // This alignment is not the primary (best) alignment for read
-               QC_FAILED     = 512,            // Read did not pass prior quality control steps
-               DUPLICATE     = 1024            // Read is PCR duplicate
-       };
-};
-
-// commonly used vector in this library
-typedef vector< BamAlignment > BamAlignmentVector;
-
-#endif /* BAMALIGNMENT_H */
diff --git a/BamAux.h b/BamAux.h

new file mode 100644 (file)

index 0000000..9c088dd
--- /dev/null
+++ b/BamAux.h
@@ -0,0 +1,340 @@
+// ***************************************************************************\r
+// BamAux.h (c) 2009 Derek Barnett, Michael Strömberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 24 June 2009 (DB)\r
+// ---------------------------------------------------------------------------\r
+// The BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Defines common constants, typedefs, & data structures for BamTools.\r
+// ***************************************************************************\r
+\r
+/*! \r
+       \file BamAux.h\r
+       \brief BamTools constants, typedefs, & data structures\r
+*/\r
+\r
+#pragma once\r
+\r
+// C++ includes\r
+#include <exception>\r
+#include <string>\r
+#include <utility>\r
+#include <vector>\r
+\r
+// C includes\r
+#include <cstdio>\r
+#include <cstdlib>\r
+#include <cstring>\r
+\r
+// Platform-specific type definitions\r
+#ifdef WIN32\r
+       typedef char                 int8_t;\r
+       typedef unsigned char       uint8_t;\r
+       typedef short               int16_t;\r
+       typedef unsigned short     uint16_t;\r
+       typedef int                 int32_t;\r
+       typedef unsigned int       uint32_t;\r
+       typedef long long           int64_t;\r
+       typedef unsigned long long uint64_t;\r
+#else\r
+       #include <stdint.h>\r
+#endif\r
+\r
+//! \namespace BamTools\r
+namespace BamTools {\r
+\r
+       //! \cond\r
+       // --------------------------------------------------------------------------------------\r
+       // This section is purely internal and can be excluded from main generated documentation.\r
+       \r
+       // zlib constants\r
+       const int GZIP_ID1   = 31;\r
+       const int GZIP_ID2   = 139;\r
+       const int CM_DEFLATE = 8;\r
+       const int FLG_FEXTRA = 4;\r
+       const int OS_UNKNOWN = 255;\r
+       const int BGZF_XLEN  = 6;\r
+       const int BGZF_ID1   = 66;\r
+       const int BGZF_ID2   = 67;\r
+       const int BGZF_LEN   = 2;\r
+       const int GZIP_WINDOW_BITS = -15;\r
+       const int Z_DEFAULT_MEM_LEVEL = 8;\r
+\r
+       // BZGF constants\r
+       const int BLOCK_HEADER_LENGTH = 18;\r
+       const int BLOCK_FOOTER_LENGTH = 8;\r
+       const int MAX_BLOCK_SIZE      = 65536;\r
+       const int DEFAULT_BLOCK_SIZE  = 65536;\r
+\r
+       // BAM constants\r
+       const int BAM_CORE_SIZE   = 32;\r
+       const int BAM_CMATCH      = 0;\r
+       const int BAM_CINS        = 1;\r
+       const int BAM_CDEL        = 2;\r
+       const int BAM_CREF_SKIP   = 3;\r
+       const int BAM_CSOFT_CLIP  = 4;\r
+       const int BAM_CHARD_CLIP  = 5;\r
+       const int BAM_CPAD        = 6;\r
+       const int BAM_CIGAR_SHIFT = 4;\r
+       const int BAM_CIGAR_MASK  = ((1 << BAM_CIGAR_SHIFT) - 1);\r
+\r
+       // BAM index constants\r
+       const int MAX_BIN           = 37450;    // =(8^6-1)/7+1\r
+       const int BAM_MIN_CHUNK_GAP = 32768;\r
+       const int BAM_LIDX_SHIFT    = 14;\r
+\r
+       // Explicit variable sizes\r
+       const int SIZEOF_INT = 4;\r
+       \r
+       struct BgzfData {\r
+               unsigned int UncompressedBlockSize;\r
+               unsigned int CompressedBlockSize;\r
+               unsigned int BlockLength;\r
+               unsigned int BlockOffset;\r
+               uint64_t BlockAddress;\r
+               bool     IsOpen;\r
+               FILE*    Stream;\r
+               char*    UncompressedBlock;\r
+               char*    CompressedBlock;\r
+               \r
+               // constructor\r
+               BgzfData(void)\r
+                       : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)\r
+                       , CompressedBlockSize(MAX_BLOCK_SIZE)\r
+                       , BlockLength(0)\r
+                       , BlockOffset(0)\r
+                       , BlockAddress(0)\r
+                       , IsOpen(false)\r
+                       , Stream(NULL)\r
+                       , UncompressedBlock(NULL)\r
+                       , CompressedBlock(NULL)\r
+               {\r
+                       try {\r
+                               CompressedBlock   = new char[CompressedBlockSize];\r
+                               UncompressedBlock = new char[UncompressedBlockSize];\r
+                       } catch( std::bad_alloc& ba ) {\r
+                               printf("ERROR: Unable to allocate memory for our BGZF object.\n");\r
+                               exit(1);\r
+                       }\r
+               }\r
+               \r
+               // destructor\r
+               ~BgzfData(void) {\r
+                       if(CompressedBlock)   delete [] CompressedBlock;\r
+                       if(UncompressedBlock) delete [] UncompressedBlock;\r
+               }\r
+       };\r
+       //! \endcond\r
+       \r
+       // --------------------------------------------------------------------------------------\r
+       // Data structures\r
+       \r
+       //! \brief Cigar operation data structure\r
+       struct CigarOp {\r
+               char     Type;   //!< Operation type (MIDNSHP)\r
+               uint32_t Length; //!< Operation length (number of bases)\r
+                  \r
+       };\r
+\r
+       //! Reference sequence data structure\r
+       struct RefData {\r
+               std::string  RefName;          //!< Name of reference sequence\r
+               unsigned int RefLength;        //!< Length of reference sequence\r
+               bool         RefHasAlignments; //!< True if BAM file contains alignments mapped to reference sequence\r
+               \r
+               // constructor\r
+               RefData(void)\r
+                       : RefLength(0)\r
+                       , RefHasAlignments(false)\r
+               { }\r
+       };\r
+\r
+       //! BAM alignment data structure\r
+       struct BamAlignment {\r
+               \r
+               // Queries against alignment flag\r
+               public:\r
+                       //! Returns true if this read is a PCR duplicate (determined by external app)\r
+                       bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }\r
+                       //! Returns true if this read failed quality control (determined by external app)\r
+                       bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }    \r
+                       //! Returns true if alignment is first mate on read\r
+                       bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }\r
+                       //! Returns true if alignment is mapped                 \r
+                       bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }\r
+                       //! Returns true if alignment's mate is mapped\r
+                       bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }      \r
+                       //! Returns true if alignment's mate mapped to reverse strand\r
+                       bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE)  != 0 ); }\r
+                       //! Returns true if alignment part of paired-end read\r
+                       bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } \r
+                       //! Returns true if this position is primary alignment (determined by external app)\r
+                       bool IsPrimaryAlignment(void) const  { return ( (AlignmentFlag & SECONDARY) == 0 ); }   \r
+                       //! Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app)\r
+                       bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }        \r
+                       //! Returns true if alignment mapped to reverse strand\r
+                       bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } \r
+                       //! Returns true if alignment is second mate on read\r
+                       bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }     \r
+               \r
+               public:\r
+                       /*! \r
+                               \brief Get alignment's read group text.\r
+                               \r
+                               Assigns read group text to readGroup.\r
+                               \r
+                               \return True if read group data successfully retrieved.\r
+                       */\r
+                       bool GetReadGroup(std::string& readGroup) const {\r
+                               \r
+                               if ( TagData.empty() ) { return false; }\r
+                               \r
+                               // localize the tag data\r
+                               char* pTagData = (char*)TagData.data();\r
+                               const unsigned int tagDataLen = TagData.size();\r
+                               unsigned int numBytesParsed = 0;\r
+                               \r
+                               bool foundReadGroupTag = false;\r
+                               while( numBytesParsed < tagDataLen ) {\r
+                                       \r
+                                       const char* pTagType = pTagData;\r
+                                       const char* pTagStorageType = pTagData + 2;\r
+                                       pTagData       += 3;\r
+                                       numBytesParsed += 3;\r
+                                       \r
+                                       // check the current tag\r
+                                       if ( strncmp(pTagType, "RG", 2) == 0 ) {\r
+                                               foundReadGroupTag = true;\r
+                                               break;\r
+                                       }\r
+                                       \r
+                                       // get the storage class and find the next tag\r
+                                       SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );\r
+                               }\r
+                               \r
+                               // return if the read group tag was not present\r
+                               if ( !foundReadGroupTag ) { return false; }\r
+                               \r
+                               // assign the read group\r
+                               const unsigned int readGroupLen = strlen(pTagData);\r
+                               readGroup.resize(readGroupLen);\r
+                               memcpy( (char*)readGroup.data(), pTagData, readGroupLen );\r
+                               return true;\r
+                       }\r
+               \r
+               private:\r
+                       // skips to the next tag\r
+                       static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {\r
+                               switch(storageType) {\r
+                                       \r
+                                       case 'A':\r
+                                       case 'c':\r
+                                       case 'C':\r
+                                               ++numBytesParsed;\r
+                                               ++pTagData;\r
+                                               break;\r
+                                               \r
+                                       case 's':\r
+                                       case 'S':\r
+                                       case 'f':\r
+                                               numBytesParsed += 2;\r
+                                               pTagData       += 2;\r
+                                               break;\r
+                                               \r
+                                       case 'i':\r
+                                       case 'I':\r
+                                               numBytesParsed += 4;\r
+                                               pTagData       += 4;\r
+                                               break;\r
+                                               \r
+                                       case 'Z':\r
+                                       case 'H':\r
+                                               while(*pTagData) {\r
+                                                       ++numBytesParsed;\r
+                                                       ++pTagData;\r
+                                               }\r
+                                               break;\r
+                                               \r
+                                       default:\r
+                                               printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData);\r
+                                               exit(1);\r
+                               }\r
+                       }\r
+               \r
+               // Data members\r
+               public:\r
+                       std::string  Name;              //!< Read name\r
+                       unsigned int Length;            //!< Query length\r
+                       std::string  QueryBases;        //!< 'Original' sequence (as reported from sequencing machine)\r
+                       std::string  AlignedBases;      //!< 'Aligned' sequence (includes any indels, padding, clipping) \r
+                       std::string  Qualities;         //!< FASTQ qualities (ASCII characters, not numeric values)\r
+                       std::string  TagData;           //!< Tag data (accessor methods will pull the requested information out)\r
+                       unsigned int RefID;             //!< ID number for reference sequence\r
+                       unsigned int Position;          //!< Position (0-based) where alignment starts \r
+                       unsigned int Bin;               //!< Bin in BAM file where this alignment resides\r
+                       unsigned int MapQuality;        //!< Mapping quality score \r
+                       unsigned int AlignmentFlag;     //!< Alignment bit-flag - see Is<something>() methods for available queries\r
+                       std::vector<CigarOp> CigarData; //!< CIGAR operations for this alignment\r
+                       unsigned int MateRefID;         //!< ID number for reference sequence where alignment's mate was aligned\r
+                       unsigned int MatePosition;      //!< Position (0-based) where alignment's mate starts\r
+                       unsigned int InsertSize;        //!< Mate-pair insert size\r
+               \r
+               // Alignment flag query constants\r
+               private:\r
+                       enum { PAIRED        = 1,\r
+                                  PROPER_PAIR   = 2,\r
+                                  UNMAPPED      = 4,\r
+                                  MATE_UNMAPPED = 8,\r
+                                  REVERSE       = 16,\r
+                                  MATE_REVERSE  = 32,\r
+                                  READ_1        = 64,\r
+                                  READ_2        = 128,\r
+                                  SECONDARY     = 256,\r
+                                  QC_FAILED     = 512,\r
+                                  DUPLICATE     = 1024\r
+                            };\r
+       };\r
+\r
+       // ----------------------------------------------------------------\r
+       // Typedefs\r
+       \r
+       /*!\r
+               \typedef RefVector\r
+               \brief Vector of RefData objects\r
+       */\r
+       typedef std::vector<RefData> RefVector;\r
+       \r
+       /*! \r
+               \typedef BamAlignmentVector\r
+               \brief Vector of BamAlignments\r
+       */\r
+       typedef std::vector< BamAlignment > BamAlignmentVector;\r
+       \r
+       //! \cond\r
+       // ----------------------------------------------------------------\r
+       // Typedefs (internal - can exclude from main documentation)\r
+       \r
+       //Offsets for linear indexing\r
+       typedef std::vector<uint64_t> LinearOffsetVector;\r
+\r
+       // Alignment 'chunk' boundaries\r
+       typedef std::pair<uint64_t, uint64_t> ChunkPair;\r
+       // Vector of alignment 'chunks'\r
+       typedef std::vector<ChunkPair> ChunkVector;\r
+\r
+       // BAM bin contains a bin ID & a vector of alignment 'chunks'\r
+       typedef std::pair<uint32_t, ChunkVector*> BamBin;\r
+       // Vector of BAM bins\r
+       typedef std::vector<BamBin> BinVector;\r
+\r
+       // Reference sequence index data\r
+       typedef std::pair<BinVector*, LinearOffsetVector*> RefIndex;\r
+\r
+       // Full BAM file index data structure \r
+       typedef std::vector<RefIndex*> BamIndex;\r
+       // ----------------------------------------------------------------\r
+       //! \endcond\r
+}\r
diff --git a/BamConversionMain.cpp b/BamConversionMain.cpp

deleted file mode 100644 (file)

index 0e6333b..0000000
--- a/BamConversionMain.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#include <iostream>
-#include "BamReader.h"
-#include "BamWriter.h"
-
-using namespace std;
-
-int main(int argc, char* argv[]) {
-
-       if(argc != 3) {
-               cout << "USAGE: " << argv[0] << " <input BAM file> <output BAM file>" << endl;
-               exit(1);
-       }
-
-       // localize our arguments
-       const char* inputFilename  = argv[1];
-       const char* outputFilename = argv[2];
-
-       // open our BAM reader
-       BamReader reader;
-       reader.Open(inputFilename);
-
-       // retrieve the SAM header text
-       string samHeader = reader.GetHeaderText();
-
-       // retrieve the reference sequence vector
-       RefVector referenceSequences = reader.GetReferenceData();
-
-       // open the BAM writer
-       BamWriter writer;
-       writer.Open(outputFilename, samHeader, referenceSequences);
-
-       // copy all of the reads from the input file to the output file
-       BamAlignment al;
-       while(reader.GetNextAlignment(al)) writer.SaveAlignment(al);
-
-       // close our files
-       reader.Close();
-       writer.Close();
-
-       return 0;
-}
diff --git a/BamDumpMain.cpp b/BamDumpMain.cpp

new file mode 100644 (file)

index 0000000..f76852a
--- /dev/null
+++ b/BamDumpMain.cpp
@@ -0,0 +1,55 @@
+// ***************************************************************************\r
+// BamDump.cpp (c) 2009 Derek Barnett\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 15 July 2009 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Spits out all alignments in BAM file.\r
+//\r
+// N.B. - Could result in HUGE text file. This is mostly a debugging tool\r
+// for small files.  You have been warned.\r
+// ***************************************************************************\r
+\r
+// Std C/C++ includes\r
+#include <cstdlib>\r
+#include <iostream>\r
+#include <string>\r
+using namespace std;\r
+\r
+// BamTools includes\r
+#include "BamReader.h"\r
+#include "BamWriter.h"\r
+using namespace BamTools;\r
+\r
+void PrintAlignment(const BamAlignment&);\r
+\r
+int main(int argc, char* argv[]) {\r
+\r
+       // validate argument count\r
+       if( argc != 2 ) {\r
+               cerr << "USAGE: " << argv[0] << " <input BAM file> " << endl;\r
+               exit(1);\r
+       }\r
+\r
+       string filename = argv[1];\r
+       cout << "Printing alignments from file: " << filename << endl;\r
+       \r
+       BamReader reader;\r
+       reader.Open(filename);\r
+       \r
+       BamAlignment bAlignment;\r
+       while (reader.GetNextAlignment(bAlignment)) {\r
+               PrintAlignment(bAlignment);\r
+       }\r
+\r
+       reader.Close();\r
+       return 0;\r
+}\r
+       \r
+// Spit out basic BamAlignment data \r
+void PrintAlignment(const BamAlignment& alignment) {\r
+       cout << "---------------------------------" << endl;\r
+       cout << "Name: " << alignment.Name << endl;\r
+       cout << "Aligned to: " << alignment.RefID << ":" << alignment.Position << endl;\r
+}\r
diff --git a/BamMerge.cpp b/BamMerge.cpp

deleted file mode 100644 (file)

index bfa7908..0000000
--- a/BamMerge.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-
-#include <iostream>
-#include <map>
-#include "BamReader.h"
-#include "BamWriter.h"
-
-using namespace std;
-
-int main(int argc, char* argv[]) {
-
-       for (int a=0; a<argc; a++) {
-               cout << argv[a] << " ";
-       }
-       cout << endl;
-  
-       if(argc < 4) {
-               cout << "USAGE: " << argv[0] << " <input BAM file 1> <input BAM file 1> <output BAM file> [<maxReads>]" << endl;
-               exit(1);
-       }
-
-       // localize our arguments
-       const char* inputFilename1 = argv[1];
-       const char* inputFilename2 = argv[2];
-       const char* outputFilename = argv[3];
-       unsigned int Nmax=0;
-       if (argc>4) {
-               const char* cmaxRead = argv[4];
-               Nmax = atoi(cmaxRead);
-       }
-      
-
-       // open our BAM reader1 for input file 1
-       BamReader reader1;
-       reader1.SetFilename(inputFilename1);
-
-       // open our BAM reader2 for input file 2
-       BamReader reader2;
-       reader2.SetFilename(inputFilename2);
-
-       // check reader1
-       if(!reader1.Open()) {
-               cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename1 << ")." << endl;
-               exit(1);
-       }
-
-       // check reader2
-       if(!reader2.Open()) {
-               cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename2 << ")." << endl;
-               exit(1);
-       }
-
-       // retrieve the header text from both files
-       string samHeader1 = reader1.GetHeaderText();
-       string samHeader2 = reader2.GetHeaderText();
-
-       // retrieve the reference sequence vectors
-       RefVector referenceSequences1 = reader1.GetReferenceData();
-       RefVector referenceSequences2 = reader2.GetReferenceData();
-
-  // merged referenceSequences
-  RefVector referenceSequences12;
-  
-       // process reference sequences from file 1
-       map<string, unsigned int, less<string> > refLengthMap;
-       map<string, unsigned int, less<string> > refIdMap;
-       map<string, RefData, less<string> > refDataMap;
-       
-       cerr << "Reference list from file 1" << endl;
-       unsigned int refCounter = 0;
-       for (RefVector::const_iterator refIter = referenceSequences1.begin(); efIter != referenceSequences1.end(); refIter++) {
-
-               // retrieve
-               RefData rd = *refIter;
-
-               // get member data
-               string refName = rd.RefName;
-               unsigned int refLength = rd.RefLength;
-               int refId = reader1.GetRefID(refName);
-
-               // report
-               cerr << " refName=" << refName << " refId=" << refId << endl;
-
-               // store in maps
-               refLengthMap[refName] = refLength;
-               refIdMap[refName] = refCounter;
-               refDataMap[refName] = rd;
-
-               // add this to merged refVector
-               referenceSequences12.push_back(rd);
-
-               // increment ref count
-               refCounter++;
-       }
-
-       // map file 2 refId to merged refID
-       map<unsigned int, unsigned int, less<unsigned int> > mapRefId12;
-
-       cerr << "Reference list from file 2" << endl;
-       for (RefVector::const_iterator refIter = referenceSequences2.begin(); refIter != referenceSequences2.end(); refIter++) {
-
-               // retrieve
-               RefData rd = *refIter;
-
-               // get member data
-               string refName = rd.RefName;
-               unsigned int refLength = rd.RefLength;
-               int refId = reader2.GetRefID(refName);
-
-               // report
-               cerr << " refName=" << refName << " refId=" << refId << endl;
-
-               // if refName already in map, check ref length
-               if (refLengthMap.count(refName) > 0) {
-
-                       // check if length is the same
-                       if (refLengthMap[refName] != refLength) {
-                               cerr << "Refernce name in two files with inconsistent lengths: " << refName << " ... exiting." << endl;
-                               exit(1);
-                       }
-
-                       // make ref id recoding entry
-                       unsigned int refIdNew = refIdMap[refName];
-                       mapRefId12[refId] = refIdNew;
-
-               } else {
-                       // otherwise make new refId and RefData
-                       // store in maps
-                       refLengthMap[refName] = refLength;
-                       refIdMap[refName] = refCounter;
-                       refDataMap[refName] = rd;
-
-                       // make ref id recoding entry
-                       mapRefId12[refId] = refCounter;
-
-                       // add this to merged refVector
-                       referenceSequences12.push_back(rd);
-
-                       // increment ref count
-                       refCounter++;
-               }
-       }
-
-       cerr << "Length of referenceSequences vector=" << referenceSequences12.size() << endl;
-
-       // open output bam file and write:
-       // empty header
-       string samHeader3 = "";
-
-       BamWriter writer;
-       writer.Open(outputFilename, samHeader3, referenceSequences12);
-
-       // iterate through alignments from file 1 and write them unchanged
-       BamAlignment ba;
-
-       bool keepgoing=true;
-       unsigned int ac1 = 0;
-       while(reader1.GetNextAlignment(ba) && keepgoing ) {
-               writer.SaveAlignment(ba);
-               ac1++;
-               keepgoing=(Nmax<1)||(ac1<Nmax);
-       }
-
-       cerr << "Write " << ac1 << " " << inputFilename1 << " records to " << outputFilename << endl;
-
-       // close input file 1
-       reader1.Close();
-
-       // iterate through alignments from file 2 and:
-       // assign re-coded refId
-       // write alignment
-       keepgoing=true;
-       unsigned int ac2 = 0;
-       while(reader2.GetNextAlignment(ba)&&keepgoing ) {
-               ac2++;
-               keepgoing=(Nmax<1)||(ac2<Nmax);
-
-               // retrieve original refId
-               unsigned int refId = ba.RefID;
-
-               // check if original refId is in recoding table... bomb if not
-               if (mapRefId12.count(refId) <= 0) {
-                       cerr << "Alignment in file 2 has inconsistent reference sequence ID... exiting." << endl;
-                       exit(1);
-               }
-
-               // assign new refId
-               ba.RefID = mapRefId12[refId];
-
-               // cerr << " recoding refId=" << refId << " to refId=" << recodedRefId[refId] << endl;
-
-               writer.SaveAlignment(ba);
-       }
-
-       cerr << "Write " << ac2 << " " << inputFilename2 << " records to " << outputFilename << endl;
-
-       // close input file 2
-       reader2.Close();
-
-       // close output file
-       writer.Close();
-
-       cerr << "Total " << ac1+ac2 << " records to " << outputFilename << endl;
-
-       // return
-       return 0;
-}
diff --git a/BamReader.cpp b/BamReader.cpp

index b5f15ef33a1e65d40b84bb1a9a6de5dc959de0a6..f7d6f3f187ac735f1c14e1263db5df3f180472cf 100644 (file)
--- a/BamReader.cpp
+++ b/BamReader.cpp
@@ -1,19 +1,20 @@
  // ***************************************************************************
-// BamReader (c) 2009 Derek Barnett
-// Marth Lab, Deptartment of Biology, Boston College
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
+// Last modified: 15 July 2009 (DB)
+// ---------------------------------------------------------------------------
+// The BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
  // Provides the basic functionality for reading BAM files
  // ***************************************************************************
  
+// BamTools includes
  #include "BamReader.h"
-//#include "STLUtilities.h"
-
-#include <cstring>
-
-#include <iostream>
-using std::cerr;
-using std::endl;
+using namespace BamTools;
+using namespace std;
  
  // static character constants
  const char* BamReader::DNA_LOOKUP   = "=ACMGRSVTWYHKDBN";
@@ -31,8 +32,6 @@ BamReader::BamReader(void)
  
  // destructor
  BamReader::~BamReader(void) {
-       m_headerText.clear();
-       ClearIndex();
         Close();
  }
  
@@ -46,7 +45,8 @@ bool BamReader::BgzfCheckBlockHeader(char* header) {
              BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
              header[12] == BGZF_ID1 &&
              header[13] == BGZF_ID2 &&
-            BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN);
+            BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN
+                  );
  }
  
  // closes the BAM file
@@ -94,7 +94,7 @@ void BamReader::BgzfOpen(const string& filename) {
  
         m_BGZF.Stream = fopen(filename.c_str(), "rb");
         if(!m_BGZF.Stream) {
-               cerr << "ERROR: Unable to open the BAM file " << filename << "for reading." << endl;
+               printf("ERROR: Unable to open the BAM file %s for reading.\n", filename.c_str() );
                 exit(1);
         }
  
@@ -254,23 +254,27 @@ void BamReader::ClearIndex(void) {
                                         BinVector::iterator binEnd  = (aRef->first)->end();
                                         for ( ; binIter != binEnd; ++binIter ) {
                                                 ChunkVector* chunks = (*binIter).second;
-                                               if ( chunks ) { delete chunks; }
+                                               if ( chunks ) { delete chunks; chunks = NULL;}
                                         }
                                         delete aRef->first;
+                                       aRef->first = NULL;
                                 }
                                 // clear BAM linear offsets
-                               if ( aRef->second ) { delete aRef->second; }
+                               if ( aRef->second ) { delete aRef->second; aRef->second = NULL; }
                                 delete aRef;
+                               aRef = NULL;
                         }
                 }
                 delete m_index;
+               m_index = NULL;
         }
  }
  
  // closes the BAM file
  void BamReader::Close(void) {
-       if(m_BGZF.IsOpen) { BgzfClose(); }
-       if(m_index)       { delete m_index; m_index = NULL; }
+       if(m_BGZF.IsOpen) { BgzfClose(); }      
+       ClearIndex();
+       m_headerText.clear();
         m_isRegionSpecified = false;
  }
  
@@ -382,12 +386,12 @@ bool BamReader::IsOverlap(BamAlignment& bAlignment) {
         return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
  }
  
-bool BamReader::Jump(int refID, unsigned int left) {
+bool BamReader::Jump(int refID, unsigned int position) {
  
         // if index available, and region is valid
-       if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { 
+       if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (position <= m_references.at(refID).RefLength) ) { 
                 m_currentRefID = refID;
-               m_currentLeft  = left;
+               m_currentLeft  = position;
                 m_isRegionSpecified = true;
                 
                 int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
@@ -629,8 +633,10 @@ bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
                                 case ('S') : k += op.Length;                                                                            // for 'S' - skip over query bases
                                                          break;
                                                          
-                               case ('D') : 
-                               case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'D', 'P' - write padding;
+                               case ('D') : bAlignment.AlignedBases.append( op.Length, '-' );  // for 'D' - write gap character
+                                                        break;
+                                                       
+                               case ('P') : bAlignment.AlignedBases.append( op.Length, '*' );  // for 'P' - write padding character;
                                                          break;
                                                          
                                 case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence
@@ -711,7 +717,7 @@ void BamReader::OpenIndex(const string& indexFilename) {
                 
                 // abort on error
                 if(!indexStream) {
-                       cerr << "ERROR: Unable to open the BAM index file " << indexFilename << "for reading." << endl;
+                       printf("ERROR: Unable to open the BAM index file %s for reading.\n", indexFilename.c_str() );
                         exit(1);
                 }
         
diff --git a/BamReader.h b/BamReader.h

index 17cb86ab5ad09421230cb6cf34741342082562cb..d829891e45dc5a11f8b83d61a2e2403330e86be1 100644 (file)
--- a/BamReader.h
+++ b/BamReader.h
@@ -1,259 +1,274 @@
  // ***************************************************************************
-// BamReader (c) 2009 Derek Barnett
-// Marth Lab, Deptartment of Biology, Boston College
+// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
+// Last modified: 24 June 2009 (DB)
+// ---------------------------------------------------------------------------
+// The BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
  // Provides the basic functionality for reading BAM files
  // ***************************************************************************
  
-#pragma once
+/*! 
+       \file BamReader.h
+       \brief API for reading BAM files.
+*/
  
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <zlib.h>
+#pragma once
  
+// C++ includes
  #include <algorithm>
+#include <iterator>
  #include <string>
  #include <utility>
  #include <vector>
-using namespace std;
-
-#include "BamAlignment.h"
-
-// our zlib constants
-#define GZIP_ID1             31
-#define GZIP_ID2            139
-#define CM_DEFLATE            8
-#define FLG_FEXTRA            4
-#define OS_UNKNOWN          255
-#define BGZF_XLEN             6
-#define BGZF_ID1             66
-#define BGZF_ID2             67
-#define BGZF_LEN              2
-#define GZIP_WINDOW_BITS    -15
-#define Z_DEFAULT_MEM_LEVEL   8
-
-// our BZGF constants
-#define BLOCK_HEADER_LENGTH    18
-#define BLOCK_FOOTER_LENGTH     8
-#define MAX_BLOCK_SIZE      65536
-#define DEFAULT_BLOCK_SIZE  65536
  
-// our BAM constants
-#define BAM_CORE_SIZE  32
-#define BAM_CMATCH      0
-#define BAM_CINS        1
-#define BAM_CDEL        2
-#define BAM_CREF_SKIP   3
-#define BAM_CSOFT_CLIP  4
-#define BAM_CHARD_CLIP  5
-#define BAM_CPAD        6
-#define BAM_CIGAR_SHIFT 4
-#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
+// zlib includes
+#include <zlib.h>
  
-// BAM indexing constants
-#define MAX_BIN           37450 // =(8^6-1)/7+1
-#define BAM_MIN_CHUNK_GAP 32768 
-#define BAM_LIDX_SHIFT    14
+// BamTools includes
+#include "BamAux.h"
  
-// our variable sizes
-#define SIZEOF_INT 4
+namespace BamTools { 
  
-// define our BZGF structure
-#ifndef BGZF_DATA
-#define BGZF_DATA
-struct BgzfData {
-       unsigned int UncompressedBlockSize;
-       unsigned int CompressedBlockSize;
-       unsigned int BlockLength;
-       unsigned int BlockOffset;
-       uint64_t BlockAddress;
-       bool IsOpen;
-       FILE* Stream;
-       char* UncompressedBlock;
-       char* CompressedBlock;
+       //! API for reading BAM files.
+       class BamReader {
+               
+               public:
+                       
+                       //! Constructor
+                       BamReader(void);
+                       
+                       //! Destructor
+                       ~BamReader(void);
+               
+               public:
+                       
+                       /*! 
+                               \brief Closes the BAM file.
+                               
+                               Also closes index file and clears index data, if provided.
+                               
+                               \sa Open()
+                       */
+                       void Close(void);
+                       
+                       /*! 
+                               \brief Access SAM format header data.
+                               
+                               See SAM format documentation for detailed header description.
+                               
+                               \return Full header text (no parsing of tags)
+                       */
+                       const std::string GetHeaderText(void) const;
+                       
+                       /*! 
+                               \brief Retrieve next alignment.
+                               
+                               Stores result in bAlignment.  
+                               
+                               If reference and position are specified by a prior call to Jump(), this method stores the next aligmment that either: 
+                               a) overlaps, or 
+                               b) begins on/after that specified position.
+                               
+                               Otherwise this simply stores next alignment, if one exists.
+                               
+                               Note that this method does not specifiy a 'right bound' position.  
+                               If a range is desired, you should supply some stopping criteria. For example:
+                               
+                               \code
+                               BamReader bReader;
+                               bReader.Open(bamFile, bamIndexfile);
+                               if ( bReader.Jump( someID, somePosition ) {
+                                       BamAlignment bAlignment;
+                                       while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= someUpperBound) ) {
+                                               // do something
+                                       }
+                               }
+                               \endcode
+                               
+                               \param bAlignment destination for alignment data
+                               \return success/failure
+                               \sa Jump(), Rewind()
+                       */
+                       bool GetNextAlignment(BamAlignment& bAlignment);
+                       
+                       /*! 
+                               \brief Get number of reference sequences in BAM file.
+                               \return Number of references
+                               \sa GetReferenceData(), GetReferenceID()
+                       */
+                       const int GetReferenceCount(void) const;
+                       
+                       /*! 
+                               \brief Access reference data.
+                               \return Vector of RefData entry
+                               \sa GetReferenceCount(), GetReferenceID()
+                       */
+                       const RefVector GetReferenceData(void) const;
+                       
+                       /*! 
+                               \brief Get reference ID from name.
+                               \param refName name of reference sequence
+                               \return reference ID number
+                               \sa GetReferenceCount(), GetReferenceData()
+                       */
+                       const int GetReferenceID(const std::string& refName) const;
+                       
+                       /*! 
+                               \brief Random access in BAM file.
+                               
+                               Jump to a specified position on reference sequence. 
+                               Position is optional - defaults to beginning of specified reference.
+                               
+                               Reference and position are stored for use by subsequent calls to GetNextAlignment().
+                               
+                               \param refID ID number of desired reference
+                               \param position left-bound position  
+                               \return success/failure
+                               \sa GetNextAlignment(), Rewind()
+                       */
+                       bool Jump(int refID, unsigned int position = 0);
+                       
+                       /*! 
+                               \brief Opens a BAM file.
+                               
+                               Index file is optional - sequential reading through a BAM file does not require an index.
+                               
+                               However, the index is required to perform random-access alignment retrival (using the Jump() method ).  
+                               
+                               See SAMtools documentation for BAM index generation.
+                               
+                               \param filename BAM file
+                               \param indexFilename BAM index file
+                               \sa Jump(), Close()
+                       */
+                       void Open(const std::string& filename, const std::string& indexFilename = "");
+                       
+                       /*!
+                               \brief Moves file pointer to beginning of alignment data.
+                               
+                               A subsequent call to GetNextAlignment() would retrieve the first alignment in the BAM file.
+                               Clears any reference and position set by a prior call to Jump()
+                               
+                               \return success/failure
+                               \sa GetNextAlignment(), Jump()
+                       */
+                       bool Rewind(void);
+               
+               // --------------------------------------------------------------------------------------
+               // internal methods
+               private:
+                       // checks BGZF block header
+                       bool BgzfCheckBlockHeader(char* header);
+                       // closes the BAM file
+                       void BgzfClose(void);
+                       // de-compresses the current block
+                       int BgzfInflateBlock(int blockLength);
+                       // opens the BAM file for reading
+                       void BgzfOpen(const std::string& filename);
+                       // reads BGZF data into a byte buffer
+                       unsigned int BgzfRead(char* data, const unsigned int dataLen);
+                       // reads BGZF block
+                       int BgzfReadBlock(void);
+                       // seek to position in BAM file
+                       bool BgzfSeek(int64_t position);
+                       // get file position in BAM file
+                       int64_t BgzfTell(void);
+                       // unpacks a buffer into an unsigned int
+                       static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
+                       // unpacks a buffer into an unsigned short
+                       static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
+                       // calculate bins that overlap region ( left to reference end for now )
+                       int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
+                       // calculates alignment end position based on starting position and provided CIGAR operations
+                       unsigned int CalculateAlignmentEnd(const unsigned int& position, const std::vector<CigarOp>& cigarData);
+                       // clear out (delete pointers in) index data structure
+                       void ClearIndex(void);
+                       // calculate file offset for first alignment chunk overlapping 'left'
+                       int64_t GetOffset(int refID, unsigned int left);
+                       // checks to see if alignment overlaps current region
+                       bool IsOverlap(BamAlignment& bAlignment);
+                       // retrieves header text from BAM file
+                       void LoadHeaderData(void);
+                       // builds BamIndex data structure from BAM index file
+                       void LoadIndexData(FILE* indexStream);
+                       // retrieves BAM alignment under file pointer
+                       bool LoadNextAlignment(BamAlignment& bAlignment);
+                       // builds reference data structure from BAM file
+                       void LoadReferenceData(void);
+                       // open BAM index file (if successful, loads index)
+                       void OpenIndex(const std::string& indexFilename);
+               
+               // aligment file & index file data
+               private:
+                       BgzfData    m_BGZF;
+                       std::string m_headerText;
+                       BamIndex*   m_index;
+                       RefVector   m_references;
+                       bool        m_isIndexLoaded;
+                       int64_t     m_alignmentsBeginOffset;
+               
+               // user-specified region values
+               private:
+                       bool         m_isRegionSpecified;
+                       int          m_currentRefID;
+                       unsigned int m_currentLeft;
+               
+               // BAM character constants
+               private:
+                       static const char* DNA_LOOKUP;
+                       static const char* CIGAR_LOOKUP;
+       };
  
-       // constructor
-       BgzfData(void)
-               : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
-               , CompressedBlockSize(MAX_BLOCK_SIZE)
-               , BlockLength(0)
-               , BlockOffset(0)
-               , BlockAddress(0)
-               , IsOpen(false)
-               , Stream(NULL)
-               , UncompressedBlock(NULL)
-               , CompressedBlock(NULL)
-       {
-               try {
-                       CompressedBlock   = new char[CompressedBlockSize];
-                       UncompressedBlock = new char[UncompressedBlockSize];
-               } catch(bad_alloc&) {
-                       printf("ERROR: Unable to allocate memory for our BGZF object.\n");
-                       exit(1);
-               }
+       //! \cond
+       // --------------------------------------------------------------------------------------
+       // static inline methods (internal - can exclude from main documentation)
+       
+       // unpacks a buffer into an unsigned int
+       inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
+               union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
+               un.valueBuffer[0] = buffer[0];
+               un.valueBuffer[1] = buffer[1];
+               un.valueBuffer[2] = buffer[2];
+               un.valueBuffer[3] = buffer[3];
+               return un.value;
         }
  
-       // destructor
-       ~BgzfData(void) {
-               if(CompressedBlock)   delete [] CompressedBlock;
-               if(UncompressedBlock) delete [] UncompressedBlock;
+       // unpacks a buffer into an unsigned short
+       inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
+               union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
+               un.valueBuffer[0] = buffer[0];
+               un.valueBuffer[1] = buffer[1];
+               return un.value;
         }
-};
-#endif // BGZF_DATA
-
-
-// --------------------------- //
-// BamIndex-related typedefs
-// --------------------------- //
-
-// offset for linear indexing
-typedef vector<uint64_t> LinearOffsetVector;
-
-// alignment 'chunk' boundaries
-typedef pair<uint64_t, uint64_t> ChunkPair;
-typedef vector<ChunkPair> ChunkVector;
-
-// BAM bins.. each contains (binID, alignment 'chunks')
-typedef pair<uint32_t, ChunkVector*> BamBin;
-typedef vector<BamBin> BinVector;
  
-// each reference sequence has a BinVector and LinearOffsetVector
-typedef pair<BinVector*, LinearOffsetVector*> RefIndex;
-
-// full BamIndex defined as: 
-typedef vector<RefIndex*> BamIndex;
-
-class BamReader {
-
-       // constructor/destructor
-       public:
-               BamReader(void);
-               ~BamReader(void);
-
-       // public interface
-       public:
-               // closes the BAM file
-               void Close(void);
-               // retrieves header text
-               const string GetHeaderText(void) const;
-               // saves the alignment to the alignment archive
-               bool GetNextAlignment(BamAlignment& bAlignment);
-               // return number of reference sequences in BAM file
-               const int GetReferenceCount(void) const;
-               // return vector of RefData entries
-               const RefVector GetReferenceData(void) const;
-               // get refID from reference name
-               const int GetReferenceID(const string& refName) const;
-               // jumps to 'left' position on refID
-               bool Jump(int refID, unsigned int left = 0);
-               // opens the BAM file
-               void Open(const string& filename, const string& indexFilename = "");
-               // move file pointer back to first alignment
-               bool Rewind(void);
-
-       // internal methods
-       private:
-               // checks BGZF block header
-               bool BgzfCheckBlockHeader(char* header);
-               // closes the BAM file
-               void BgzfClose(void);
-               // de-compresses the current block
-               int BgzfInflateBlock(int blockLength);
-               // opens the BAM file for reading
-               void BgzfOpen(const string& filename);
-               // reads BGZF data into a byte buffer
-               unsigned int BgzfRead(char* data, const unsigned int dataLen);
-               // reads BGZF block
-               int BgzfReadBlock(void);
-               // seek to position in BAM file
-               bool BgzfSeek(int64_t position);
-               // get file position in BAM file
-               int64_t BgzfTell(void);
-               // unpacks a buffer into an unsigned int
-               static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
-               // unpacks a buffer into an unsigned short
-               static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
-               //      calculate bins that overlap region ( left to reference end for now )
-               int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
-               // calculates alignment end position based on starting position and provided CIGAR operations
-               unsigned int CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData);
-               // clear out (delete pointers in) index data structure
-               void ClearIndex(void);
-               // calculate file offset for first alignment chunk overlapping 'left'
-               int64_t GetOffset(int refID, unsigned int left);
-               // checks to see if alignment overlaps current region
-               bool IsOverlap(BamAlignment& bAlignment);
-               // retrieves header text from BAM file
-               void LoadHeaderData(void);
-               // builds BamIndex data structure from BAM index file
-               void LoadIndexData(FILE* indexStream);
-               // retrieves BAM alignment under file pointer
-               bool LoadNextAlignment(BamAlignment& bAlignment);
-               // builds reference data structure from BAM file
-               void LoadReferenceData(void);
-               // open BAM index file (if successful, loads index)
-               void OpenIndex(const string& indexFilename);
+       // --------------------------------------------------------------------------------------
+       // template classes/methods (internal - can exclude from main documentation)
         
-       // aligment file & index file data
-       private:
-               BgzfData  m_BGZF;
-               string    m_headerText;
-               BamIndex* m_index;
-               RefVector m_references;
-               bool      m_isIndexLoaded;
-               int64_t   m_alignmentsBeginOffset;
-
-       // user-specified region values
-       private:
-               bool         m_isRegionSpecified;
-               int          m_currentRefID;
-               unsigned int m_currentLeft;
-
-       // BAM character constants
-       private:
-               static const char* DNA_LOOKUP;
-               static const char* CIGAR_LOOKUP;
-};
+       // allows sorting/searching of a vector of pairs (instead of using maps)
+       template <typename Key, typename Value>
+       class LookupKeyCompare {
  
-// unpacks a buffer into an unsigned int
-inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
-       union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
-       un.valueBuffer[0] = buffer[0];
-       un.valueBuffer[1] = buffer[1];
-       un.valueBuffer[2] = buffer[2];
-       un.valueBuffer[3] = buffer[3];
-       return un.value;
-}
-
-// unpacks a buffer into an unsigned short
-inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
-       union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
-       un.valueBuffer[0] = buffer[0];
-       un.valueBuffer[1] = buffer[1];
-       return un.value;
-}
-
-// allows sorting/searching of a vector of pairs (instead of using maps)
-template <typename Key, typename Value>
-class LookupKeyCompare {
+               typedef std::pair< Key, Value > LookupData;
+               typedef typename LookupData::first_type Key_t;
+               
+               public:
+                       bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
+                       bool operator() (const LookupData& lhs, const Key_t& k) const        { return keyLess(lhs.first, k); }
+                       bool operator() (const Key_t& k, const LookupData& rhs) const        { return keyLess(k, rhs.first); }
+               private:
+                       bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
+       };
  
-       typedef pair< Key, Value > LookupData;
-       typedef typename LookupData::first_type Key_t;
+       // return index of item if found, else return container.size()
+       template < typename InputIterator, typename EqualityComparable >
+       typename std::iterator_traits<InputIterator>::difference_type
+       Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
+               return std::distance(begin, std::find(begin, end, item));
+       }
+       //! \endcond
         
-       public:
-               bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
-               bool operator() (const LookupData& lhs, const Key_t& k) const        { return keyLess(lhs.first, k); }
-               bool operator() (const Key_t& k, const LookupData& rhs) const        { return keyLess(k, rhs.first); }
-       private:
-               bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
-};
-
-// return index of item if found, else return container.size()
-template < typename InputIterator, typename EqualityComparable >
-typename iterator_traits<InputIterator>::difference_type
-Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
-       return distance(begin, find(begin, end, item));
  }
-
diff --git a/BamReaderMain.cpp b/BamReaderMain.cpp

deleted file mode 100644 (file)

index d568fed..0000000
--- a/BamReaderMain.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-// BamReaderMain.cpp\r
-\r
-// Derek Barnett\r
-// Marth Lab, Boston College\r
-// Last modified: 12 May 2009\r
-\r
-#include "BamReader.h"\r
-\r
-// C++ includes\r
-#include <iostream>\r
-using std::cerr;\r
-using std::cout;\r
-using std::endl;\r
-\r
-#include <vector>\r
-using std::vector;\r
-\r
-#include <string>\r
-using std::string;\r
-\r
-int main(int argc, char* argv[]) {\r
-\r
-       // check command line parameters\r
-       if (argc != 3) {\r
-               cerr << endl;\r
-               cerr << "Usage: BamReaderTest <bamFile> <bamIndexFile>" << endl;\r
-               cerr << endl;\r
-               exit(1);\r
-       }\r
-\r
-       // get filenames from command line\r
-       const char* bamFilename = argv[1];\r
-       const char* bamIndexFilename = argv[2];\r
-\r
-       int alignmentsRead = 0;\r
-       BamAlignment bAlignment;\r
-       \r
-       BamReader bReader;\r
-       cerr << endl << "Opening BAM file (and index)" << endl << endl;\r
-       bReader.Open(bamFilename, bamIndexFilename);\r
-\r
-       string header = bReader.GetHeaderText();\r
-       cerr << "Printing header text..." << endl;\r
-       if ( header.empty() ) {\r
-               cerr << "No header provided." << endl << endl;\r
-       } else {\r
-               cerr << "----------------------" << endl;\r
-               cerr << "Header Text: " << endl;\r
-               cerr << "----------------------" << endl;\r
-               cerr << header << endl << endl;\r
-       }\r
-\r
-       RefVector references = bReader.GetReferenceData();\r
-       cerr << "Printing references..." << endl;\r
-       RefVector::const_iterator refIter = references.begin();\r
-       RefVector::const_iterator refEnd  = references.end();\r
-       for ( ; refIter != refEnd; ++refIter) {\r
-               cerr << "Reference entry: " << endl;\r
-               cerr << (*refIter).RefName << endl;\r
-               cerr << (*refIter).RefLength << endl;\r
-               cerr << "Has alignments? : " << ( ((*refIter).RefHasAlignments) ? "yes" : "no" ) << endl;\r
-       }\r
-       cerr << endl;\r
-\r
-       alignmentsRead = 0;\r
-       while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) {\r
-               cerr << "Alignment " << alignmentsRead << endl;\r
-               cerr << bAlignment.Name << endl;\r
-               cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl;\r
-               ++alignmentsRead;\r
-       }\r
-\r
-       cerr << "Jumping in BAM file" << endl;\r
-       if ( bReader.Jump(1, 5000) ) {\r
-               cerr << "Jumping seems ok - getting first 10 alignments that start at or after chr2:5000" << endl;\r
-\r
-               alignmentsRead = 0;\r
-               while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) {\r
-                       if ( bAlignment.Position >= 5000 ) { \r
-                               cerr << "Alignment " << alignmentsRead << endl;\r
-                               cerr << bAlignment.Name << endl;\r
-                               cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl;\r
-                               ++alignmentsRead;\r
-                       }\r
-               }\r
-       }\r
-\r
-       cerr << "Rewinding BAM file" << endl;\r
-       if ( bReader.Rewind() ) {\r
-               cerr << "Rewind seems to be ok - getting first 10 alignments" << endl;\r
-               \r
-               alignmentsRead = 0;\r
-               while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) {\r
-                       cerr << "Alignment " << alignmentsRead << endl;\r
-                       cerr << bAlignment.Name << endl;\r
-                       cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl;\r
-                       ++alignmentsRead;\r
-               }\r
-       }\r
-\r
-       cerr << "Closing BAM file..." << endl << endl;\r
-       bReader.Close();\r
-\r
-       cerr << "Exiting..." << endl << endl;\r
-       return 0;\r
-}\r
diff --git a/BamTrimMain.cpp b/BamTrimMain.cpp

new file mode 100644 (file)

index 0000000..7349115
--- /dev/null
+++ b/BamTrimMain.cpp
@@ -0,0 +1,91 @@
+// ***************************************************************************\r
+// BamTrimMain.cpp (c) 2009 Derek Barnett\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 15 July 2009 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Basic example of reading/writing BAM files. Pulls alignments overlapping \r
+// the range specified by user from one BAM file and writes to a new BAM file.\r
+// ***************************************************************************\r
+\r
+// Std C/C++ includes\r
+#include <cstdlib>\r
+#include <iostream>\r
+#include <string>\r
+using namespace std;\r
+\r
+// BamTools includes\r
+#include "BamReader.h"\r
+#include "BamWriter.h"\r
+using namespace BamTools;\r
+\r
+int main(int argc, char* argv[]) {\r
+\r
+       // validate argument count\r
+       if( argc != 7 ) {\r
+               cerr << "USAGE: " << argv[0] << " <input BAM file> <input BAM index file> <output BAM file> <reference name> <leftBound> <rightBound> " << endl;\r
+               exit(1);\r
+       }\r
+\r
+       // store arguments\r
+       string inBamFilename  = argv[1];\r
+       string indexFilename  = argv[2];\r
+       string outBamFilename = argv[3];\r
+       string referenceName  = argv[4];\r
+       string leftBound_str  = argv[5];\r
+       string rightBound_str = argv[6];\r
+\r
+       // open our BAM reader\r
+       BamReader reader;\r
+       reader.Open(inBamFilename, indexFilename);\r
+\r
+       // get header & reference information\r
+       string header = reader.GetHeaderText();\r
+       RefVector references = reader.GetReferenceData();\r
+       \r
+       // open our BAM writer\r
+       BamWriter writer;\r
+       writer.Open(outBamFilename, header, references);\r
+\r
+       // get reference ID from name\r
+       int refID = 0;\r
+       RefVector::const_iterator refIter = references.begin();\r
+       RefVector::const_iterator refEnd  = references.end();\r
+       for ( ; refIter != refEnd; ++refIter ) {\r
+               if ( (*refIter).RefName == referenceName ) { break; }\r
+               ++refID;\r
+       }\r
+       \r
+       // validate ID\r
+       if ( refIter == refEnd ) {\r
+               cerr << "Reference: " << referenceName << " not found." << endl;\r
+               exit(1);\r
+       }\r
+\r
+       // convert boundary arguments to numeric values\r
+       unsigned int leftBound  = (unsigned int) atoi( leftBound_str.c_str()  );\r
+       unsigned int rightBound = (unsigned int) atoi( rightBound_str.c_str() );\r
+       \r
+       // attempt jump to range of interest\r
+       if ( reader.Jump(refID, leftBound) ) {\r
+               \r
+               // while data exists and alignment begin before right bound\r
+               BamAlignment bAlignment;\r
+               while ( reader.GetNextAlignment(bAlignment) && (bAlignment.Position <= rightBound) ) {\r
+                       // save alignment to archive\r
+                       writer.SaveAlignment(bAlignment);\r
+               }\r
+       } \r
+       \r
+       // if jump failed\r
+       else {\r
+               cerr << "Could not jump to ref:pos " << referenceName << ":" << leftBound << endl;\r
+               exit(1);\r
+       }\r
+\r
+       // clean up and exit\r
+       reader.Close();\r
+       writer.Close(); \r
+       return 0;\r
+}
+\ No newline at end of file
diff --git a/BamWriter.cpp b/BamWriter.cpp

index 4f6a37f3592d8426be5d52da0a4e6c3257b90295..32e7047c1779e8101f404378b7df43756fd23a8b 100644 (file)
--- a/BamWriter.cpp
+++ b/BamWriter.cpp
@@ -1,8 +1,10 @@
  // ***************************************************************************
-// BamWriter (c) 2009 Michael Strömberg
-// Marth Lab, Deptartment of Biology, Boston College
+// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
+// Last modified: 24 June 2009 (DB)
+// ---------------------------------------------------------------------------
  // The BGZF routines were adapted from the bgzf.c code developed at the Broad
  // Institute.
  // ---------------------------------------------------------------------------
@@ -10,6 +12,8 @@
  // ***************************************************************************
  
  #include "BamWriter.h"
+using namespace BamTools;
+using namespace std;
  
  // constructor
  BamWriter::BamWriter(void)
diff --git a/BamWriter.h b/BamWriter.h

index 18f8e0a08e2ab7e9f2c5cb70fff2263e3ba50c87..5294dc93bc3449acb32d73dd998f277dcb2e76ad 100644 (file)
--- a/BamWriter.h
+++ b/BamWriter.h
@@ -1,151 +1,114 @@
  // ***************************************************************************
-// BamWriter (c) 2009 Michael Strömberg
-// Marth Lab, Deptartment of Biology, Boston College
+// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
  // All rights reserved.
  // ---------------------------------------------------------------------------
+// Last modified: 24 June 2009 (DB)
+// ---------------------------------------------------------------------------
  // The BGZF routines were adapted from the bgzf.c code developed at the Broad
  // Institute.
  // ---------------------------------------------------------------------------
  // Provides the basic functionality for producing BAM files
  // ***************************************************************************
  
+/*! 
+       \file BamWriter.h
+       \brief API for writing BAM files.
+*/
+
  #pragma once
  
+// C++ includes
  #include <string>
  #include <vector>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <zlib.h>
-#include "BamAlignment.h"
-
-using namespace std;
-
-// our zlib constants
-#define GZIP_ID1             31
-#define GZIP_ID2            139
-#define CM_DEFLATE            8
-#define FLG_FEXTRA            4
-#define OS_UNKNOWN          255
-#define BGZF_XLEN             6
-#define BGZF_ID1             66
-#define BGZF_ID2             67
-#define BGZF_LEN              2
-#define GZIP_WINDOW_BITS    -15
-#define Z_DEFAULT_MEM_LEVEL   8
-
-// our BZGF constants
-#define BLOCK_HEADER_LENGTH    18
-#define BLOCK_FOOTER_LENGTH     8
-#define MAX_BLOCK_SIZE      65536
-#define DEFAULT_BLOCK_SIZE  65536
-
-// our BAM constants
-#define BAM_CORE_SIZE  32
-#define BAM_CMATCH      0
-#define BAM_CINS        1
-#define BAM_CDEL        2
-#define BAM_CREF_SKIP   3
-#define BAM_CSOFT_CLIP  4
-#define BAM_CHARD_CLIP  5
-#define BAM_CPAD        6
-#define BAM_CIGAR_SHIFT 4
  
-#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
+// zlib includes
+#include <zlib.h>
  
-// our variable sizes
-#define SIZEOF_INT 4
+// BamTools includes
+#include "BamAux.h"
  
-// define our BZGF structure
-#ifndef BGZF_DATA
-#define BGZF_DATA
-struct BgzfData {
-       unsigned int UncompressedBlockSize;
-       unsigned int CompressedBlockSize;
-       unsigned int BlockLength;
-       unsigned int BlockOffset;
-       uint64_t BlockAddress;
-       bool IsOpen;
-       FILE* Stream;
-       char* UncompressedBlock;
-       char* CompressedBlock;
+namespace BamTools { 
  
-       // constructor
-       BgzfData(void)
-               : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
-               , CompressedBlockSize(MAX_BLOCK_SIZE)
-               , BlockLength(0)
-               , BlockOffset(0)
-               , BlockAddress(0)
-               , IsOpen(false)
-               , Stream(NULL)
-               , UncompressedBlock(NULL)
-               , CompressedBlock(NULL)
-       {
-               try {
-                       CompressedBlock   = new char[CompressedBlockSize];
-                       UncompressedBlock = new char[UncompressedBlockSize];
-               } catch(bad_alloc&) {
-                       printf("ERROR: Unable to allocate memory for our BGZF object.\n");
-                       exit(1);
-               }
-       }
+       //! API for writing BAM files.
+       class BamWriter {
+               
+               public:
+                       
+                       //! Constructor.
+                       BamWriter(void);
+                       
+                       //! Destructor.
+                       ~BamWriter(void);
+               
+               public:
+                       
+                       /*! 
+                               \brief Closes the alignment archive
+                               \sa Open()
+                       */
+                       void Close(void);
+                       
+                       /*! 
+                               \brief Opens the alignment archive
+                               \param filename output BAM file
+                               \param samHeader SAM-format header text
+                               \param referenceSequences Reference sequence data 
+                               \sa Close()
+                       */
+                       void Open(const std::string& filename, const std::string& samHeader, const RefVector& referenceSequences);
+                       
+                       /*! 
+                               \brief Saves an alignment to the archive
+                               \param al The BamAlignment to be saved
+                       */
+                       void SaveAlignment(const BamAlignment& al);
+               
+               // --------------------------------------------------------------------------------------
+               // internal methods
+               private:
+                       // closes the BAM file
+                       void BgzfClose(void);
+                       // compresses the current block
+                       int BgzfDeflateBlock(void);
+                       // flushes the data in the BGZF block
+                       void BgzfFlushBlock(void);
+                       // opens the BAM file for writing
+                       void BgzfOpen(const std::string& filename);
+                       // packs an unsigned integer into the specified buffer
+                       static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value);
+                       // packs an unsigned short into the specified buffer
+                       static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value);
+                       // writes the supplied data into the BGZF buffer
+                       unsigned int BgzfWrite(const char* data, const unsigned int dataLen);
+                       // calculates the minimum bin that contains a region [begin, end)
+                       static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end);
+                       // creates a packed cigar string from the supplied alignment
+                       static void CreatePackedCigar(const std::vector<CigarOp>& cigarOperations, std::string& packedCigar);
+                       // encodes the supplied query sequence into 4-bit notation
+                       static void EncodeQuerySequence(const std::string& query, std::string& encodedQuery);
+                       // our BGZF output object
+                       BgzfData mBGZF;
+       };
  
-       // destructor
-       ~BgzfData(void) {
-               if(CompressedBlock)   delete [] CompressedBlock;
-               if(UncompressedBlock) delete [] UncompressedBlock;
+       //! \cond
+       // --------------------------------------------------------------------------------------
+       // static inline methods (internal - can exclude from main documentation)
+       
+       // packs an unsigned integer into the specified buffer
+       inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) {
+               buffer[0] = (char)value;
+               buffer[1] = (char)(value >> 8);
+               buffer[2] = (char)(value >> 16);
+               buffer[3] = (char)(value >> 24);
         }
-};
-#endif // BGZF_DATA
  
-class BamWriter {
-public:
-       // constructor
-       BamWriter(void);
-       // destructor
-       ~BamWriter(void);
-       // closes the alignment archive
-       void Close(void);
-       // opens the alignment archive
-       void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences);
-       // saves the alignment to the alignment archive
-       void SaveAlignment(const BamAlignment& al);
-private:
-       // closes the BAM file
-       void BgzfClose(void);
-       // compresses the current block
-       int BgzfDeflateBlock(void);
-       // flushes the data in the BGZF block
-       void BgzfFlushBlock(void);
-       // opens the BAM file for writing
-       void BgzfOpen(const string& filename);
-       // packs an unsigned integer into the specified buffer
-       static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value);
         // packs an unsigned short into the specified buffer
-       static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value);
-       // writes the supplied data into the BGZF buffer
-       unsigned int BgzfWrite(const char* data, const unsigned int dataLen);
-       // calculates the minimum bin that contains a region [begin, end)
-       static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end);
-       // creates a packed cigar string from the supplied alignment
-       static void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);
-       // encodes the supplied query sequence into 4-bit notation
-       static void EncodeQuerySequence(const string& query, string& encodedQuery);
-       // our BGZF output object
-       BgzfData mBGZF;
-};
-
-// packs an unsigned integer into the specified buffer
-inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) {
-       buffer[0] = (char)value;
-       buffer[1] = (char)(value >> 8);
-       buffer[2] = (char)(value >> 16);
-       buffer[3] = (char)(value >> 24);
-}
-
-// packs an unsigned short into the specified buffer
-inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) {
-       buffer[0] = (char)value;
-       buffer[1] = (char)(value >> 8);
-}
+       inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) {
+               buffer[0] = (char)value;
+               buffer[1] = (char)(value >> 8);
+       }
+       // --------------------------------------------------------------------------------------
+       //! \endcond
+       
+} // end BamTools namespace
+\ No newline at end of file
diff --git a/Makefile b/Makefile

index 40e3ecdfe9afcd48a0adad9964a334c3c3245276..a056ae5e84279b1104486ec7f745740fcd90e39c 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,18 +1,15 @@
  CXX=           g++\r
  CXXFLAGS=      -Wall -O3\r
-PROG=          BamReaderTest BamConversion\r
+PROG=          BamDump BamTrim\r
  LIBS=          -lz\r
  \r
  all: $(PROG)\r
  \r
-BamReaderTest: BamReader.o BamReaderMain.o\r
-       $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamReaderMain.o $(LIBS)\r
+BamDump: BamReader.o BamDumpMain.o\r
+       $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamDumpMain.o $(LIBS)\r
  \r
-BamConversion: BamReader.o BamWriter.o BamConversionMain.o\r
-       $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamConversionMain.o $(LIBS)\r
-\r
-BamMerge: BamReader.o BamWriter.o BamMerge.o\r
-       $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamMerge.o $(LIBS)\r
+BamTrim: BamReader.o BamWriter.o BamTrimMain.o\r
+       $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamTrimMain.o $(LIBS)\r
  \r
  clean:\r
-               rm -fr gmon.out *.o *.a a.out *~\r
+       rm -fr gmon.out *.o *.a a.out *~\r
diff --git a/README b/README

index 9553d1ac96ecee5acef3bce6235edf4cc1c6e04e..a523b4c1d707e6b5f010609638e024cc142d5527 100644 (file)
--- a/README
+++ b/README
@@ -1,28 +1,37 @@
----------------
-README
-----------------
-
-
-Significant re-write of BamReader API as of 5/18/2009:
--------------------------------------------------------
-No longer using bgzf.h/.c
-No longer using libbambc.a library
-No longer using STL Utilities
-Minor changes to API syntax in opening BAM files with BamReader - see BamReader.h for details
-
-
-Compile instructions
-----------------------
-Short version:
-See Makefile for compilation example.
-
-Longer version:
-Copy BamAlignment.h, BamReader.* and/or BamWriter.* to working directory.
-#include "BamReader.h" and/or "BamWriter.h"
-Use standard compile commands, including the "-lz" parameter to access zlib functionality.
-(Note that "-L. -lbambc" are no longer necessary and WILL CAUSE PROBLEMS for compiler/linker )
-
-
-Feel free to email me or pop by my desk with any questions, comments, suggestions, etc.
-  - Derek 
-
+------------------------------------------------------------
+    README
+------------------------------------------------------------
+
+BamTools: a C++ API for reading/writing BAM files.
+
+The API consists of 2 main modules - BamReader and BamWriter. As you would expect, 
+BamReader provides read-access to BAM files, while BamWriter does the writing of BAM
+files. BamReader provides an interface for random-access (jumping) in a BAM file.
+ 
+An additional file, BamAux.h, is included as well.  
+This file contains the common data structures and typedefs used throught the API.
+
+------------------------------------------------------------
+
+To use this API, you simply need to do 3 things:
+
+    1 - Drop the BamTools files somewhere the compiler can find them.
+           (i.e. in source directory, or somewhere in include path)
+       
+       2 - Import BamTools API with the following lines of code
+           
+           #include "BamReader.h"     // as needed
+               #include "BamWriter.h"     // as needed
+           using namespace BamTools;
+               
+       3 - Compile with '-lz' ('l' as in 'lion') to access ZLIB compression library
+
+See any included programs and Makefile for more specific compiling/usage examples.
+See documentation & comments in header files for API details. 
+
+------------------------------------------------------------
+
+Feel free to contact me with any questions, comments, suggestions, bug reports, etc.
+  - Derek Barnett
+  
+http://sourceforge.net/projects/bamtools
author	barnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
	Wed, 15 Jul 2009 18:02:28 +0000 (18:02 +0000)
committer	barnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
	Wed, 15 Jul 2009 18:02:28 +0000 (18:02 +0000)
BamAlignment.h	[deleted file]	patch \| blob \| history
BamAux.h	[new file with mode: 0644]	patch \| blob
BamConversionMain.cpp	[deleted file]	patch \| blob \| history
BamDumpMain.cpp	[new file with mode: 0644]	patch \| blob
BamMerge.cpp	[deleted file]	patch \| blob \| history
BamReader.cpp		patch \| blob \| history
BamReader.h		patch \| blob \| history
BamReaderMain.cpp	[deleted file]	patch \| blob \| history
BamTrimMain.cpp	[new file with mode: 0644]	patch \| blob
BamWriter.cpp		patch \| blob \| history
BamWriter.h		patch \| blob \| history
Makefile		patch \| blob \| history
README		patch \| blob \| history