+++ /dev/null
-// BamAlignment.h
-
-// Derek Barnett
-// Marth Lab, Boston College
-// Last modified: 20 March 2009
-
-#ifndef BAMALIGNMENT_H
-#define BAMALIGNMENT_H
-
-#include <string.h>
-#include <stdlib.h>
-
-#ifdef WIN32
-typedef char int8_t;
-typedef unsigned char uint8_t;
-typedef short int16_t;
-typedef unsigned short uint16_t;
-typedef int int32_t;
-typedef unsigned int uint32_t;
-typedef long long int64_t;
-typedef unsigned long long uint64_t;
-#else
-#include <stdint.h>
-#endif
-
-// C++ includes
-#include <string>
-using std::string;
-
-#include <vector>
-using std::vector;
-
-struct CigarOp {
- uint32_t Length;
- char Type;
-};
-
-struct RefData {
- string RefName;
- unsigned int RefLength;
- bool RefHasAlignments;
-
- // constructor
- RefData(void)
- : RefLength(0)
- , RefHasAlignments(false)
- { }
-};
-
-typedef vector<RefData> RefVector;
-
-struct BamAlignment {
-
- // queries against alignment flag - see below for further detail
-public:
- bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }
- bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }
- bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }
- bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
- bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }
- bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }
- bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }
- bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }
- bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }
- bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }
- bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }
-
- // returns true and assigns the read group if present in the tag data
- bool GetReadGroup(string& readGroup) const {
-
- if(TagData.empty()) return false;
-
- // localize the tag data
- char* pTagData = (char*)TagData.data();
- const unsigned int tagDataLen = TagData.size();
- unsigned int numBytesParsed = 0;
-
- bool foundReadGroupTag = false;
- while(numBytesParsed < tagDataLen) {
-
- const char* pTagType = pTagData;
- const char* pTagStorageType = pTagData + 2;
- pTagData += 3;
- numBytesParsed += 3;
-
- // check the current tag
- if(strncmp(pTagType, "RG", 2) == 0) {
- foundReadGroupTag = true;
- break;
- }
-
- // get the storage class and find the next tag
- SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed);
- }
-
- // return if the read group tag was not present
- if(!foundReadGroupTag) return false;
-
- // assign the read group
- const unsigned int readGroupLen = strlen(pTagData);
- readGroup.resize(readGroupLen);
- memcpy((char*)readGroup.data(), pTagData, readGroupLen);
- return true;
- }
-
- // skips to the next tag
- static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
- switch(storageType) {
- case 'A':
- case 'c':
- case 'C':
- numBytesParsed++;
- pTagData++;
- break;
- case 's':
- case 'S':
- case 'f':
- numBytesParsed += 2;
- pTagData += 2;
- break;
- case 'i':
- case 'I':
- numBytesParsed += 4;
- pTagData += 4;
- break;
- case 'Z':
- case 'H':
- while(*pTagData) {
- numBytesParsed++;
- pTagData++;
- }
- break;
- default:
- printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData);
- exit(1);
- }
- }
-
- // data members
-public:
- string Name; // read name
- unsigned int Length; // query length
- string QueryBases; // original sequence ( produced from machine )
- string AlignedBases; // aligned sequence ( with indels )
- string Qualities; // FASTQ qualities ( still in ASCII characters )
- string TagData; // contains the tag data (accessor methods will pull the requested information out)
- unsigned int RefID; // ID for reference sequence
- unsigned int Position; // position on reference sequence where alignment starts
- unsigned int Bin; // bin in BAM file where this alignment resides
- unsigned int MapQuality; // mapping quality
- unsigned int AlignmentFlag; // see above for available queries
- vector<CigarOp> CigarData; // vector of CIGAR operations (length & type) )
- unsigned int MateRefID; // ID for reference sequence that mate was aligned to
- unsigned int MatePosition; // position that mate was aligned to
- unsigned int InsertSize; // mate pair insert size
-
- // alignment flag query constants
-private:
- enum { PAIRED = 1, // Alignment comes from paired-end data
- PROPER_PAIR = 2, // Alignment passed paired-end resolution
- UNMAPPED = 4, // Read is unmapped
- MATE_UNMAPPED = 8, // Mate is unmapped
- REVERSE = 16, // Read is on reverse strand
- MATE_REVERSE = 32, // Mate is on reverse strand
- READ_1 = 64, // This alignment is mate 1 of pair
- READ_2 = 128, // This alignment is mate 2 of pair
- SECONDARY = 256, // This alignment is not the primary (best) alignment for read
- QC_FAILED = 512, // Read did not pass prior quality control steps
- DUPLICATE = 1024 // Read is PCR duplicate
- };
-};
-
-// commonly used vector in this library
-typedef vector< BamAlignment > BamAlignmentVector;
-
-#endif /* BAMALIGNMENT_H */
--- /dev/null
+// ***************************************************************************\r
+// BamAux.h (c) 2009 Derek Barnett, Michael Strömberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 24 June 2009 (DB)\r
+// ---------------------------------------------------------------------------\r
+// The BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Defines common constants, typedefs, & data structures for BamTools.\r
+// ***************************************************************************\r
+\r
+/*! \r
+ \file BamAux.h\r
+ \brief BamTools constants, typedefs, & data structures\r
+*/\r
+\r
+#pragma once\r
+\r
+// C++ includes\r
+#include <exception>\r
+#include <string>\r
+#include <utility>\r
+#include <vector>\r
+\r
+// C includes\r
+#include <cstdio>\r
+#include <cstdlib>\r
+#include <cstring>\r
+\r
+// Platform-specific type definitions\r
+#ifdef WIN32\r
+ typedef char int8_t;\r
+ typedef unsigned char uint8_t;\r
+ typedef short int16_t;\r
+ typedef unsigned short uint16_t;\r
+ typedef int int32_t;\r
+ typedef unsigned int uint32_t;\r
+ typedef long long int64_t;\r
+ typedef unsigned long long uint64_t;\r
+#else\r
+ #include <stdint.h>\r
+#endif\r
+\r
+//! \namespace BamTools\r
+namespace BamTools {\r
+\r
+ //! \cond\r
+ // --------------------------------------------------------------------------------------\r
+ // This section is purely internal and can be excluded from main generated documentation.\r
+ \r
+ // zlib constants\r
+ const int GZIP_ID1 = 31;\r
+ const int GZIP_ID2 = 139;\r
+ const int CM_DEFLATE = 8;\r
+ const int FLG_FEXTRA = 4;\r
+ const int OS_UNKNOWN = 255;\r
+ const int BGZF_XLEN = 6;\r
+ const int BGZF_ID1 = 66;\r
+ const int BGZF_ID2 = 67;\r
+ const int BGZF_LEN = 2;\r
+ const int GZIP_WINDOW_BITS = -15;\r
+ const int Z_DEFAULT_MEM_LEVEL = 8;\r
+\r
+ // BZGF constants\r
+ const int BLOCK_HEADER_LENGTH = 18;\r
+ const int BLOCK_FOOTER_LENGTH = 8;\r
+ const int MAX_BLOCK_SIZE = 65536;\r
+ const int DEFAULT_BLOCK_SIZE = 65536;\r
+\r
+ // BAM constants\r
+ const int BAM_CORE_SIZE = 32;\r
+ const int BAM_CMATCH = 0;\r
+ const int BAM_CINS = 1;\r
+ const int BAM_CDEL = 2;\r
+ const int BAM_CREF_SKIP = 3;\r
+ const int BAM_CSOFT_CLIP = 4;\r
+ const int BAM_CHARD_CLIP = 5;\r
+ const int BAM_CPAD = 6;\r
+ const int BAM_CIGAR_SHIFT = 4;\r
+ const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);\r
+\r
+ // BAM index constants\r
+ const int MAX_BIN = 37450; // =(8^6-1)/7+1\r
+ const int BAM_MIN_CHUNK_GAP = 32768;\r
+ const int BAM_LIDX_SHIFT = 14;\r
+\r
+ // Explicit variable sizes\r
+ const int SIZEOF_INT = 4;\r
+ \r
+ struct BgzfData {\r
+ unsigned int UncompressedBlockSize;\r
+ unsigned int CompressedBlockSize;\r
+ unsigned int BlockLength;\r
+ unsigned int BlockOffset;\r
+ uint64_t BlockAddress;\r
+ bool IsOpen;\r
+ FILE* Stream;\r
+ char* UncompressedBlock;\r
+ char* CompressedBlock;\r
+ \r
+ // constructor\r
+ BgzfData(void)\r
+ : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)\r
+ , CompressedBlockSize(MAX_BLOCK_SIZE)\r
+ , BlockLength(0)\r
+ , BlockOffset(0)\r
+ , BlockAddress(0)\r
+ , IsOpen(false)\r
+ , Stream(NULL)\r
+ , UncompressedBlock(NULL)\r
+ , CompressedBlock(NULL)\r
+ {\r
+ try {\r
+ CompressedBlock = new char[CompressedBlockSize];\r
+ UncompressedBlock = new char[UncompressedBlockSize];\r
+ } catch( std::bad_alloc& ba ) {\r
+ printf("ERROR: Unable to allocate memory for our BGZF object.\n");\r
+ exit(1);\r
+ }\r
+ }\r
+ \r
+ // destructor\r
+ ~BgzfData(void) {\r
+ if(CompressedBlock) delete [] CompressedBlock;\r
+ if(UncompressedBlock) delete [] UncompressedBlock;\r
+ }\r
+ };\r
+ //! \endcond\r
+ \r
+ // --------------------------------------------------------------------------------------\r
+ // Data structures\r
+ \r
+ //! \brief Cigar operation data structure\r
+ struct CigarOp {\r
+ char Type; //!< Operation type (MIDNSHP)\r
+ uint32_t Length; //!< Operation length (number of bases)\r
+ \r
+ };\r
+\r
+ //! Reference sequence data structure\r
+ struct RefData {\r
+ std::string RefName; //!< Name of reference sequence\r
+ unsigned int RefLength; //!< Length of reference sequence\r
+ bool RefHasAlignments; //!< True if BAM file contains alignments mapped to reference sequence\r
+ \r
+ // constructor\r
+ RefData(void)\r
+ : RefLength(0)\r
+ , RefHasAlignments(false)\r
+ { }\r
+ };\r
+\r
+ //! BAM alignment data structure\r
+ struct BamAlignment {\r
+ \r
+ // Queries against alignment flag\r
+ public:\r
+ //! Returns true if this read is a PCR duplicate (determined by external app)\r
+ bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }\r
+ //! Returns true if this read failed quality control (determined by external app)\r
+ bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } \r
+ //! Returns true if alignment is first mate on read\r
+ bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }\r
+ //! Returns true if alignment is mapped \r
+ bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }\r
+ //! Returns true if alignment's mate is mapped\r
+ bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } \r
+ //! Returns true if alignment's mate mapped to reverse strand\r
+ bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }\r
+ //! Returns true if alignment part of paired-end read\r
+ bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } \r
+ //! Returns true if this position is primary alignment (determined by external app)\r
+ bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } \r
+ //! Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app)\r
+ bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } \r
+ //! Returns true if alignment mapped to reverse strand\r
+ bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } \r
+ //! Returns true if alignment is second mate on read\r
+ bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } \r
+ \r
+ public:\r
+ /*! \r
+ \brief Get alignment's read group text.\r
+ \r
+ Assigns read group text to readGroup.\r
+ \r
+ \return True if read group data successfully retrieved.\r
+ */\r
+ bool GetReadGroup(std::string& readGroup) const {\r
+ \r
+ if ( TagData.empty() ) { return false; }\r
+ \r
+ // localize the tag data\r
+ char* pTagData = (char*)TagData.data();\r
+ const unsigned int tagDataLen = TagData.size();\r
+ unsigned int numBytesParsed = 0;\r
+ \r
+ bool foundReadGroupTag = false;\r
+ while( numBytesParsed < tagDataLen ) {\r
+ \r
+ const char* pTagType = pTagData;\r
+ const char* pTagStorageType = pTagData + 2;\r
+ pTagData += 3;\r
+ numBytesParsed += 3;\r
+ \r
+ // check the current tag\r
+ if ( strncmp(pTagType, "RG", 2) == 0 ) {\r
+ foundReadGroupTag = true;\r
+ break;\r
+ }\r
+ \r
+ // get the storage class and find the next tag\r
+ SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );\r
+ }\r
+ \r
+ // return if the read group tag was not present\r
+ if ( !foundReadGroupTag ) { return false; }\r
+ \r
+ // assign the read group\r
+ const unsigned int readGroupLen = strlen(pTagData);\r
+ readGroup.resize(readGroupLen);\r
+ memcpy( (char*)readGroup.data(), pTagData, readGroupLen );\r
+ return true;\r
+ }\r
+ \r
+ private:\r
+ // skips to the next tag\r
+ static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {\r
+ switch(storageType) {\r
+ \r
+ case 'A':\r
+ case 'c':\r
+ case 'C':\r
+ ++numBytesParsed;\r
+ ++pTagData;\r
+ break;\r
+ \r
+ case 's':\r
+ case 'S':\r
+ case 'f':\r
+ numBytesParsed += 2;\r
+ pTagData += 2;\r
+ break;\r
+ \r
+ case 'i':\r
+ case 'I':\r
+ numBytesParsed += 4;\r
+ pTagData += 4;\r
+ break;\r
+ \r
+ case 'Z':\r
+ case 'H':\r
+ while(*pTagData) {\r
+ ++numBytesParsed;\r
+ ++pTagData;\r
+ }\r
+ break;\r
+ \r
+ default:\r
+ printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData);\r
+ exit(1);\r
+ }\r
+ }\r
+ \r
+ // Data members\r
+ public:\r
+ std::string Name; //!< Read name\r
+ unsigned int Length; //!< Query length\r
+ std::string QueryBases; //!< 'Original' sequence (as reported from sequencing machine)\r
+ std::string AlignedBases; //!< 'Aligned' sequence (includes any indels, padding, clipping) \r
+ std::string Qualities; //!< FASTQ qualities (ASCII characters, not numeric values)\r
+ std::string TagData; //!< Tag data (accessor methods will pull the requested information out)\r
+ unsigned int RefID; //!< ID number for reference sequence\r
+ unsigned int Position; //!< Position (0-based) where alignment starts \r
+ unsigned int Bin; //!< Bin in BAM file where this alignment resides\r
+ unsigned int MapQuality; //!< Mapping quality score \r
+ unsigned int AlignmentFlag; //!< Alignment bit-flag - see Is<something>() methods for available queries\r
+ std::vector<CigarOp> CigarData; //!< CIGAR operations for this alignment\r
+ unsigned int MateRefID; //!< ID number for reference sequence where alignment's mate was aligned\r
+ unsigned int MatePosition; //!< Position (0-based) where alignment's mate starts\r
+ unsigned int InsertSize; //!< Mate-pair insert size\r
+ \r
+ // Alignment flag query constants\r
+ private:\r
+ enum { PAIRED = 1,\r
+ PROPER_PAIR = 2,\r
+ UNMAPPED = 4,\r
+ MATE_UNMAPPED = 8,\r
+ REVERSE = 16,\r
+ MATE_REVERSE = 32,\r
+ READ_1 = 64,\r
+ READ_2 = 128,\r
+ SECONDARY = 256,\r
+ QC_FAILED = 512,\r
+ DUPLICATE = 1024\r
+ };\r
+ };\r
+\r
+ // ----------------------------------------------------------------\r
+ // Typedefs\r
+ \r
+ /*!\r
+ \typedef RefVector\r
+ \brief Vector of RefData objects\r
+ */\r
+ typedef std::vector<RefData> RefVector;\r
+ \r
+ /*! \r
+ \typedef BamAlignmentVector\r
+ \brief Vector of BamAlignments\r
+ */\r
+ typedef std::vector< BamAlignment > BamAlignmentVector;\r
+ \r
+ //! \cond\r
+ // ----------------------------------------------------------------\r
+ // Typedefs (internal - can exclude from main documentation)\r
+ \r
+ //Offsets for linear indexing\r
+ typedef std::vector<uint64_t> LinearOffsetVector;\r
+\r
+ // Alignment 'chunk' boundaries\r
+ typedef std::pair<uint64_t, uint64_t> ChunkPair;\r
+ // Vector of alignment 'chunks'\r
+ typedef std::vector<ChunkPair> ChunkVector;\r
+\r
+ // BAM bin contains a bin ID & a vector of alignment 'chunks'\r
+ typedef std::pair<uint32_t, ChunkVector*> BamBin;\r
+ // Vector of BAM bins\r
+ typedef std::vector<BamBin> BinVector;\r
+\r
+ // Reference sequence index data\r
+ typedef std::pair<BinVector*, LinearOffsetVector*> RefIndex;\r
+\r
+ // Full BAM file index data structure \r
+ typedef std::vector<RefIndex*> BamIndex;\r
+ // ----------------------------------------------------------------\r
+ //! \endcond\r
+}\r
+++ /dev/null
-#include <iostream>
-#include "BamReader.h"
-#include "BamWriter.h"
-
-using namespace std;
-
-int main(int argc, char* argv[]) {
-
- if(argc != 3) {
- cout << "USAGE: " << argv[0] << " <input BAM file> <output BAM file>" << endl;
- exit(1);
- }
-
- // localize our arguments
- const char* inputFilename = argv[1];
- const char* outputFilename = argv[2];
-
- // open our BAM reader
- BamReader reader;
- reader.Open(inputFilename);
-
- // retrieve the SAM header text
- string samHeader = reader.GetHeaderText();
-
- // retrieve the reference sequence vector
- RefVector referenceSequences = reader.GetReferenceData();
-
- // open the BAM writer
- BamWriter writer;
- writer.Open(outputFilename, samHeader, referenceSequences);
-
- // copy all of the reads from the input file to the output file
- BamAlignment al;
- while(reader.GetNextAlignment(al)) writer.SaveAlignment(al);
-
- // close our files
- reader.Close();
- writer.Close();
-
- return 0;
-}
--- /dev/null
+// ***************************************************************************\r
+// BamDump.cpp (c) 2009 Derek Barnett\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 15 July 2009 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Spits out all alignments in BAM file.\r
+//\r
+// N.B. - Could result in HUGE text file. This is mostly a debugging tool\r
+// for small files. You have been warned.\r
+// ***************************************************************************\r
+\r
+// Std C/C++ includes\r
+#include <cstdlib>\r
+#include <iostream>\r
+#include <string>\r
+using namespace std;\r
+\r
+// BamTools includes\r
+#include "BamReader.h"\r
+#include "BamWriter.h"\r
+using namespace BamTools;\r
+\r
+void PrintAlignment(const BamAlignment&);\r
+\r
+int main(int argc, char* argv[]) {\r
+\r
+ // validate argument count\r
+ if( argc != 2 ) {\r
+ cerr << "USAGE: " << argv[0] << " <input BAM file> " << endl;\r
+ exit(1);\r
+ }\r
+\r
+ string filename = argv[1];\r
+ cout << "Printing alignments from file: " << filename << endl;\r
+ \r
+ BamReader reader;\r
+ reader.Open(filename);\r
+ \r
+ BamAlignment bAlignment;\r
+ while (reader.GetNextAlignment(bAlignment)) {\r
+ PrintAlignment(bAlignment);\r
+ }\r
+\r
+ reader.Close();\r
+ return 0;\r
+}\r
+ \r
+// Spit out basic BamAlignment data \r
+void PrintAlignment(const BamAlignment& alignment) {\r
+ cout << "---------------------------------" << endl;\r
+ cout << "Name: " << alignment.Name << endl;\r
+ cout << "Aligned to: " << alignment.RefID << ":" << alignment.Position << endl;\r
+}\r
+++ /dev/null
-
-#include <iostream>
-#include <map>
-#include "BamReader.h"
-#include "BamWriter.h"
-
-using namespace std;
-
-int main(int argc, char* argv[]) {
-
- for (int a=0; a<argc; a++) {
- cout << argv[a] << " ";
- }
- cout << endl;
-
- if(argc < 4) {
- cout << "USAGE: " << argv[0] << " <input BAM file 1> <input BAM file 1> <output BAM file> [<maxReads>]" << endl;
- exit(1);
- }
-
- // localize our arguments
- const char* inputFilename1 = argv[1];
- const char* inputFilename2 = argv[2];
- const char* outputFilename = argv[3];
- unsigned int Nmax=0;
- if (argc>4) {
- const char* cmaxRead = argv[4];
- Nmax = atoi(cmaxRead);
- }
-
-
- // open our BAM reader1 for input file 1
- BamReader reader1;
- reader1.SetFilename(inputFilename1);
-
- // open our BAM reader2 for input file 2
- BamReader reader2;
- reader2.SetFilename(inputFilename2);
-
- // check reader1
- if(!reader1.Open()) {
- cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename1 << ")." << endl;
- exit(1);
- }
-
- // check reader2
- if(!reader2.Open()) {
- cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename2 << ")." << endl;
- exit(1);
- }
-
- // retrieve the header text from both files
- string samHeader1 = reader1.GetHeaderText();
- string samHeader2 = reader2.GetHeaderText();
-
- // retrieve the reference sequence vectors
- RefVector referenceSequences1 = reader1.GetReferenceData();
- RefVector referenceSequences2 = reader2.GetReferenceData();
-
- // merged referenceSequences
- RefVector referenceSequences12;
-
- // process reference sequences from file 1
- map<string, unsigned int, less<string> > refLengthMap;
- map<string, unsigned int, less<string> > refIdMap;
- map<string, RefData, less<string> > refDataMap;
-
- cerr << "Reference list from file 1" << endl;
- unsigned int refCounter = 0;
- for (RefVector::const_iterator refIter = referenceSequences1.begin(); efIter != referenceSequences1.end(); refIter++) {
-
- // retrieve
- RefData rd = *refIter;
-
- // get member data
- string refName = rd.RefName;
- unsigned int refLength = rd.RefLength;
- int refId = reader1.GetRefID(refName);
-
- // report
- cerr << " refName=" << refName << " refId=" << refId << endl;
-
- // store in maps
- refLengthMap[refName] = refLength;
- refIdMap[refName] = refCounter;
- refDataMap[refName] = rd;
-
- // add this to merged refVector
- referenceSequences12.push_back(rd);
-
- // increment ref count
- refCounter++;
- }
-
- // map file 2 refId to merged refID
- map<unsigned int, unsigned int, less<unsigned int> > mapRefId12;
-
- cerr << "Reference list from file 2" << endl;
- for (RefVector::const_iterator refIter = referenceSequences2.begin(); refIter != referenceSequences2.end(); refIter++) {
-
- // retrieve
- RefData rd = *refIter;
-
- // get member data
- string refName = rd.RefName;
- unsigned int refLength = rd.RefLength;
- int refId = reader2.GetRefID(refName);
-
- // report
- cerr << " refName=" << refName << " refId=" << refId << endl;
-
- // if refName already in map, check ref length
- if (refLengthMap.count(refName) > 0) {
-
- // check if length is the same
- if (refLengthMap[refName] != refLength) {
- cerr << "Refernce name in two files with inconsistent lengths: " << refName << " ... exiting." << endl;
- exit(1);
- }
-
- // make ref id recoding entry
- unsigned int refIdNew = refIdMap[refName];
- mapRefId12[refId] = refIdNew;
-
- } else {
- // otherwise make new refId and RefData
- // store in maps
- refLengthMap[refName] = refLength;
- refIdMap[refName] = refCounter;
- refDataMap[refName] = rd;
-
- // make ref id recoding entry
- mapRefId12[refId] = refCounter;
-
- // add this to merged refVector
- referenceSequences12.push_back(rd);
-
- // increment ref count
- refCounter++;
- }
- }
-
- cerr << "Length of referenceSequences vector=" << referenceSequences12.size() << endl;
-
- // open output bam file and write:
- // empty header
- string samHeader3 = "";
-
- BamWriter writer;
- writer.Open(outputFilename, samHeader3, referenceSequences12);
-
- // iterate through alignments from file 1 and write them unchanged
- BamAlignment ba;
-
- bool keepgoing=true;
- unsigned int ac1 = 0;
- while(reader1.GetNextAlignment(ba) && keepgoing ) {
- writer.SaveAlignment(ba);
- ac1++;
- keepgoing=(Nmax<1)||(ac1<Nmax);
- }
-
- cerr << "Write " << ac1 << " " << inputFilename1 << " records to " << outputFilename << endl;
-
- // close input file 1
- reader1.Close();
-
- // iterate through alignments from file 2 and:
- // assign re-coded refId
- // write alignment
- keepgoing=true;
- unsigned int ac2 = 0;
- while(reader2.GetNextAlignment(ba)&&keepgoing ) {
- ac2++;
- keepgoing=(Nmax<1)||(ac2<Nmax);
-
- // retrieve original refId
- unsigned int refId = ba.RefID;
-
- // check if original refId is in recoding table... bomb if not
- if (mapRefId12.count(refId) <= 0) {
- cerr << "Alignment in file 2 has inconsistent reference sequence ID... exiting." << endl;
- exit(1);
- }
-
- // assign new refId
- ba.RefID = mapRefId12[refId];
-
- // cerr << " recoding refId=" << refId << " to refId=" << recodedRefId[refId] << endl;
-
- writer.SaveAlignment(ba);
- }
-
- cerr << "Write " << ac2 << " " << inputFilename2 << " records to " << outputFilename << endl;
-
- // close input file 2
- reader2.Close();
-
- // close output file
- writer.Close();
-
- cerr << "Total " << ac1+ac2 << " records to " << outputFilename << endl;
-
- // return
- return 0;
-}
// ***************************************************************************
-// BamReader (c) 2009 Derek Barnett
-// Marth Lab, Deptartment of Biology, Boston College
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
+// Last modified: 15 July 2009 (DB)
+// ---------------------------------------------------------------------------
+// The BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
// Provides the basic functionality for reading BAM files
// ***************************************************************************
+// BamTools includes
#include "BamReader.h"
-//#include "STLUtilities.h"
-
-#include <cstring>
-
-#include <iostream>
-using std::cerr;
-using std::endl;
+using namespace BamTools;
+using namespace std;
// static character constants
const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN";
// destructor
BamReader::~BamReader(void) {
- m_headerText.clear();
- ClearIndex();
Close();
}
BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
header[12] == BGZF_ID1 &&
header[13] == BGZF_ID2 &&
- BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN);
+ BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN
+ );
}
// closes the BAM file
m_BGZF.Stream = fopen(filename.c_str(), "rb");
if(!m_BGZF.Stream) {
- cerr << "ERROR: Unable to open the BAM file " << filename << "for reading." << endl;
+ printf("ERROR: Unable to open the BAM file %s for reading.\n", filename.c_str() );
exit(1);
}
BinVector::iterator binEnd = (aRef->first)->end();
for ( ; binIter != binEnd; ++binIter ) {
ChunkVector* chunks = (*binIter).second;
- if ( chunks ) { delete chunks; }
+ if ( chunks ) { delete chunks; chunks = NULL;}
}
delete aRef->first;
+ aRef->first = NULL;
}
// clear BAM linear offsets
- if ( aRef->second ) { delete aRef->second; }
+ if ( aRef->second ) { delete aRef->second; aRef->second = NULL; }
delete aRef;
+ aRef = NULL;
}
}
delete m_index;
+ m_index = NULL;
}
}
// closes the BAM file
void BamReader::Close(void) {
- if(m_BGZF.IsOpen) { BgzfClose(); }
- if(m_index) { delete m_index; m_index = NULL; }
+ if(m_BGZF.IsOpen) { BgzfClose(); }
+ ClearIndex();
+ m_headerText.clear();
m_isRegionSpecified = false;
}
return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
}
-bool BamReader::Jump(int refID, unsigned int left) {
+bool BamReader::Jump(int refID, unsigned int position) {
// if index available, and region is valid
- if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) {
+ if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (position <= m_references.at(refID).RefLength) ) {
m_currentRefID = refID;
- m_currentLeft = left;
+ m_currentLeft = position;
m_isRegionSpecified = true;
int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
case ('S') : k += op.Length; // for 'S' - skip over query bases
break;
- case ('D') :
- case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding;
+ case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character
+ break;
+
+ case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character;
break;
case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
// abort on error
if(!indexStream) {
- cerr << "ERROR: Unable to open the BAM index file " << indexFilename << "for reading." << endl;
+ printf("ERROR: Unable to open the BAM index file %s for reading.\n", indexFilename.c_str() );
exit(1);
}
// ***************************************************************************
-// BamReader (c) 2009 Derek Barnett
-// Marth Lab, Deptartment of Biology, Boston College
+// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
+// Last modified: 24 June 2009 (DB)
+// ---------------------------------------------------------------------------
+// The BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
// Provides the basic functionality for reading BAM files
// ***************************************************************************
-#pragma once
+/*!
+ \file BamReader.h
+ \brief API for reading BAM files.
+*/
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <zlib.h>
+#pragma once
+// C++ includes
#include <algorithm>
+#include <iterator>
#include <string>
#include <utility>
#include <vector>
-using namespace std;
-
-#include "BamAlignment.h"
-
-// our zlib constants
-#define GZIP_ID1 31
-#define GZIP_ID2 139
-#define CM_DEFLATE 8
-#define FLG_FEXTRA 4
-#define OS_UNKNOWN 255
-#define BGZF_XLEN 6
-#define BGZF_ID1 66
-#define BGZF_ID2 67
-#define BGZF_LEN 2
-#define GZIP_WINDOW_BITS -15
-#define Z_DEFAULT_MEM_LEVEL 8
-
-// our BZGF constants
-#define BLOCK_HEADER_LENGTH 18
-#define BLOCK_FOOTER_LENGTH 8
-#define MAX_BLOCK_SIZE 65536
-#define DEFAULT_BLOCK_SIZE 65536
-// our BAM constants
-#define BAM_CORE_SIZE 32
-#define BAM_CMATCH 0
-#define BAM_CINS 1
-#define BAM_CDEL 2
-#define BAM_CREF_SKIP 3
-#define BAM_CSOFT_CLIP 4
-#define BAM_CHARD_CLIP 5
-#define BAM_CPAD 6
-#define BAM_CIGAR_SHIFT 4
-#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
+// zlib includes
+#include <zlib.h>
-// BAM indexing constants
-#define MAX_BIN 37450 // =(8^6-1)/7+1
-#define BAM_MIN_CHUNK_GAP 32768
-#define BAM_LIDX_SHIFT 14
+// BamTools includes
+#include "BamAux.h"
-// our variable sizes
-#define SIZEOF_INT 4
+namespace BamTools {
-// define our BZGF structure
-#ifndef BGZF_DATA
-#define BGZF_DATA
-struct BgzfData {
- unsigned int UncompressedBlockSize;
- unsigned int CompressedBlockSize;
- unsigned int BlockLength;
- unsigned int BlockOffset;
- uint64_t BlockAddress;
- bool IsOpen;
- FILE* Stream;
- char* UncompressedBlock;
- char* CompressedBlock;
+ //! API for reading BAM files.
+ class BamReader {
+
+ public:
+
+ //! Constructor
+ BamReader(void);
+
+ //! Destructor
+ ~BamReader(void);
+
+ public:
+
+ /*!
+ \brief Closes the BAM file.
+
+ Also closes index file and clears index data, if provided.
+
+ \sa Open()
+ */
+ void Close(void);
+
+ /*!
+ \brief Access SAM format header data.
+
+ See SAM format documentation for detailed header description.
+
+ \return Full header text (no parsing of tags)
+ */
+ const std::string GetHeaderText(void) const;
+
+ /*!
+ \brief Retrieve next alignment.
+
+ Stores result in bAlignment.
+
+ If reference and position are specified by a prior call to Jump(), this method stores the next aligmment that either:
+ a) overlaps, or
+ b) begins on/after that specified position.
+
+ Otherwise this simply stores next alignment, if one exists.
+
+ Note that this method does not specifiy a 'right bound' position.
+ If a range is desired, you should supply some stopping criteria. For example:
+
+ \code
+ BamReader bReader;
+ bReader.Open(bamFile, bamIndexfile);
+ if ( bReader.Jump( someID, somePosition ) {
+ BamAlignment bAlignment;
+ while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= someUpperBound) ) {
+ // do something
+ }
+ }
+ \endcode
+
+ \param bAlignment destination for alignment data
+ \return success/failure
+ \sa Jump(), Rewind()
+ */
+ bool GetNextAlignment(BamAlignment& bAlignment);
+
+ /*!
+ \brief Get number of reference sequences in BAM file.
+ \return Number of references
+ \sa GetReferenceData(), GetReferenceID()
+ */
+ const int GetReferenceCount(void) const;
+
+ /*!
+ \brief Access reference data.
+ \return Vector of RefData entry
+ \sa GetReferenceCount(), GetReferenceID()
+ */
+ const RefVector GetReferenceData(void) const;
+
+ /*!
+ \brief Get reference ID from name.
+ \param refName name of reference sequence
+ \return reference ID number
+ \sa GetReferenceCount(), GetReferenceData()
+ */
+ const int GetReferenceID(const std::string& refName) const;
+
+ /*!
+ \brief Random access in BAM file.
+
+ Jump to a specified position on reference sequence.
+ Position is optional - defaults to beginning of specified reference.
+
+ Reference and position are stored for use by subsequent calls to GetNextAlignment().
+
+ \param refID ID number of desired reference
+ \param position left-bound position
+ \return success/failure
+ \sa GetNextAlignment(), Rewind()
+ */
+ bool Jump(int refID, unsigned int position = 0);
+
+ /*!
+ \brief Opens a BAM file.
+
+ Index file is optional - sequential reading through a BAM file does not require an index.
+
+ However, the index is required to perform random-access alignment retrival (using the Jump() method ).
+
+ See SAMtools documentation for BAM index generation.
+
+ \param filename BAM file
+ \param indexFilename BAM index file
+ \sa Jump(), Close()
+ */
+ void Open(const std::string& filename, const std::string& indexFilename = "");
+
+ /*!
+ \brief Moves file pointer to beginning of alignment data.
+
+ A subsequent call to GetNextAlignment() would retrieve the first alignment in the BAM file.
+ Clears any reference and position set by a prior call to Jump()
+
+ \return success/failure
+ \sa GetNextAlignment(), Jump()
+ */
+ bool Rewind(void);
+
+ // --------------------------------------------------------------------------------------
+ // internal methods
+ private:
+ // checks BGZF block header
+ bool BgzfCheckBlockHeader(char* header);
+ // closes the BAM file
+ void BgzfClose(void);
+ // de-compresses the current block
+ int BgzfInflateBlock(int blockLength);
+ // opens the BAM file for reading
+ void BgzfOpen(const std::string& filename);
+ // reads BGZF data into a byte buffer
+ unsigned int BgzfRead(char* data, const unsigned int dataLen);
+ // reads BGZF block
+ int BgzfReadBlock(void);
+ // seek to position in BAM file
+ bool BgzfSeek(int64_t position);
+ // get file position in BAM file
+ int64_t BgzfTell(void);
+ // unpacks a buffer into an unsigned int
+ static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
+ // unpacks a buffer into an unsigned short
+ static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
+ // calculate bins that overlap region ( left to reference end for now )
+ int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
+ // calculates alignment end position based on starting position and provided CIGAR operations
+ unsigned int CalculateAlignmentEnd(const unsigned int& position, const std::vector<CigarOp>& cigarData);
+ // clear out (delete pointers in) index data structure
+ void ClearIndex(void);
+ // calculate file offset for first alignment chunk overlapping 'left'
+ int64_t GetOffset(int refID, unsigned int left);
+ // checks to see if alignment overlaps current region
+ bool IsOverlap(BamAlignment& bAlignment);
+ // retrieves header text from BAM file
+ void LoadHeaderData(void);
+ // builds BamIndex data structure from BAM index file
+ void LoadIndexData(FILE* indexStream);
+ // retrieves BAM alignment under file pointer
+ bool LoadNextAlignment(BamAlignment& bAlignment);
+ // builds reference data structure from BAM file
+ void LoadReferenceData(void);
+ // open BAM index file (if successful, loads index)
+ void OpenIndex(const std::string& indexFilename);
+
+ // aligment file & index file data
+ private:
+ BgzfData m_BGZF;
+ std::string m_headerText;
+ BamIndex* m_index;
+ RefVector m_references;
+ bool m_isIndexLoaded;
+ int64_t m_alignmentsBeginOffset;
+
+ // user-specified region values
+ private:
+ bool m_isRegionSpecified;
+ int m_currentRefID;
+ unsigned int m_currentLeft;
+
+ // BAM character constants
+ private:
+ static const char* DNA_LOOKUP;
+ static const char* CIGAR_LOOKUP;
+ };
- // constructor
- BgzfData(void)
- : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
- , CompressedBlockSize(MAX_BLOCK_SIZE)
- , BlockLength(0)
- , BlockOffset(0)
- , BlockAddress(0)
- , IsOpen(false)
- , Stream(NULL)
- , UncompressedBlock(NULL)
- , CompressedBlock(NULL)
- {
- try {
- CompressedBlock = new char[CompressedBlockSize];
- UncompressedBlock = new char[UncompressedBlockSize];
- } catch(bad_alloc&) {
- printf("ERROR: Unable to allocate memory for our BGZF object.\n");
- exit(1);
- }
+ //! \cond
+ // --------------------------------------------------------------------------------------
+ // static inline methods (internal - can exclude from main documentation)
+
+ // unpacks a buffer into an unsigned int
+ inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
+ union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
+ un.valueBuffer[0] = buffer[0];
+ un.valueBuffer[1] = buffer[1];
+ un.valueBuffer[2] = buffer[2];
+ un.valueBuffer[3] = buffer[3];
+ return un.value;
}
- // destructor
- ~BgzfData(void) {
- if(CompressedBlock) delete [] CompressedBlock;
- if(UncompressedBlock) delete [] UncompressedBlock;
+ // unpacks a buffer into an unsigned short
+ inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
+ union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
+ un.valueBuffer[0] = buffer[0];
+ un.valueBuffer[1] = buffer[1];
+ return un.value;
}
-};
-#endif // BGZF_DATA
-
-
-// --------------------------- //
-// BamIndex-related typedefs
-// --------------------------- //
-
-// offset for linear indexing
-typedef vector<uint64_t> LinearOffsetVector;
-
-// alignment 'chunk' boundaries
-typedef pair<uint64_t, uint64_t> ChunkPair;
-typedef vector<ChunkPair> ChunkVector;
-
-// BAM bins.. each contains (binID, alignment 'chunks')
-typedef pair<uint32_t, ChunkVector*> BamBin;
-typedef vector<BamBin> BinVector;
-// each reference sequence has a BinVector and LinearOffsetVector
-typedef pair<BinVector*, LinearOffsetVector*> RefIndex;
-
-// full BamIndex defined as:
-typedef vector<RefIndex*> BamIndex;
-
-class BamReader {
-
- // constructor/destructor
- public:
- BamReader(void);
- ~BamReader(void);
-
- // public interface
- public:
- // closes the BAM file
- void Close(void);
- // retrieves header text
- const string GetHeaderText(void) const;
- // saves the alignment to the alignment archive
- bool GetNextAlignment(BamAlignment& bAlignment);
- // return number of reference sequences in BAM file
- const int GetReferenceCount(void) const;
- // return vector of RefData entries
- const RefVector GetReferenceData(void) const;
- // get refID from reference name
- const int GetReferenceID(const string& refName) const;
- // jumps to 'left' position on refID
- bool Jump(int refID, unsigned int left = 0);
- // opens the BAM file
- void Open(const string& filename, const string& indexFilename = "");
- // move file pointer back to first alignment
- bool Rewind(void);
-
- // internal methods
- private:
- // checks BGZF block header
- bool BgzfCheckBlockHeader(char* header);
- // closes the BAM file
- void BgzfClose(void);
- // de-compresses the current block
- int BgzfInflateBlock(int blockLength);
- // opens the BAM file for reading
- void BgzfOpen(const string& filename);
- // reads BGZF data into a byte buffer
- unsigned int BgzfRead(char* data, const unsigned int dataLen);
- // reads BGZF block
- int BgzfReadBlock(void);
- // seek to position in BAM file
- bool BgzfSeek(int64_t position);
- // get file position in BAM file
- int64_t BgzfTell(void);
- // unpacks a buffer into an unsigned int
- static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
- // unpacks a buffer into an unsigned short
- static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
- // calculate bins that overlap region ( left to reference end for now )
- int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
- // calculates alignment end position based on starting position and provided CIGAR operations
- unsigned int CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData);
- // clear out (delete pointers in) index data structure
- void ClearIndex(void);
- // calculate file offset for first alignment chunk overlapping 'left'
- int64_t GetOffset(int refID, unsigned int left);
- // checks to see if alignment overlaps current region
- bool IsOverlap(BamAlignment& bAlignment);
- // retrieves header text from BAM file
- void LoadHeaderData(void);
- // builds BamIndex data structure from BAM index file
- void LoadIndexData(FILE* indexStream);
- // retrieves BAM alignment under file pointer
- bool LoadNextAlignment(BamAlignment& bAlignment);
- // builds reference data structure from BAM file
- void LoadReferenceData(void);
- // open BAM index file (if successful, loads index)
- void OpenIndex(const string& indexFilename);
+ // --------------------------------------------------------------------------------------
+ // template classes/methods (internal - can exclude from main documentation)
- // aligment file & index file data
- private:
- BgzfData m_BGZF;
- string m_headerText;
- BamIndex* m_index;
- RefVector m_references;
- bool m_isIndexLoaded;
- int64_t m_alignmentsBeginOffset;
-
- // user-specified region values
- private:
- bool m_isRegionSpecified;
- int m_currentRefID;
- unsigned int m_currentLeft;
-
- // BAM character constants
- private:
- static const char* DNA_LOOKUP;
- static const char* CIGAR_LOOKUP;
-};
+ // allows sorting/searching of a vector of pairs (instead of using maps)
+ template <typename Key, typename Value>
+ class LookupKeyCompare {
-// unpacks a buffer into an unsigned int
-inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
- union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
- un.valueBuffer[0] = buffer[0];
- un.valueBuffer[1] = buffer[1];
- un.valueBuffer[2] = buffer[2];
- un.valueBuffer[3] = buffer[3];
- return un.value;
-}
-
-// unpacks a buffer into an unsigned short
-inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
- union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
- un.valueBuffer[0] = buffer[0];
- un.valueBuffer[1] = buffer[1];
- return un.value;
-}
-
-// allows sorting/searching of a vector of pairs (instead of using maps)
-template <typename Key, typename Value>
-class LookupKeyCompare {
+ typedef std::pair< Key, Value > LookupData;
+ typedef typename LookupData::first_type Key_t;
+
+ public:
+ bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
+ bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }
+ bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }
+ private:
+ bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
+ };
- typedef pair< Key, Value > LookupData;
- typedef typename LookupData::first_type Key_t;
+ // return index of item if found, else return container.size()
+ template < typename InputIterator, typename EqualityComparable >
+ typename std::iterator_traits<InputIterator>::difference_type
+ Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
+ return std::distance(begin, std::find(begin, end, item));
+ }
+ //! \endcond
- public:
- bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
- bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }
- bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }
- private:
- bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
-};
-
-// return index of item if found, else return container.size()
-template < typename InputIterator, typename EqualityComparable >
-typename iterator_traits<InputIterator>::difference_type
-Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
- return distance(begin, find(begin, end, item));
}
-
+++ /dev/null
-// BamReaderMain.cpp\r
-\r
-// Derek Barnett\r
-// Marth Lab, Boston College\r
-// Last modified: 12 May 2009\r
-\r
-#include "BamReader.h"\r
-\r
-// C++ includes\r
-#include <iostream>\r
-using std::cerr;\r
-using std::cout;\r
-using std::endl;\r
-\r
-#include <vector>\r
-using std::vector;\r
-\r
-#include <string>\r
-using std::string;\r
-\r
-int main(int argc, char* argv[]) {\r
-\r
- // check command line parameters\r
- if (argc != 3) {\r
- cerr << endl;\r
- cerr << "Usage: BamReaderTest <bamFile> <bamIndexFile>" << endl;\r
- cerr << endl;\r
- exit(1);\r
- }\r
-\r
- // get filenames from command line\r
- const char* bamFilename = argv[1];\r
- const char* bamIndexFilename = argv[2];\r
-\r
- int alignmentsRead = 0;\r
- BamAlignment bAlignment;\r
- \r
- BamReader bReader;\r
- cerr << endl << "Opening BAM file (and index)" << endl << endl;\r
- bReader.Open(bamFilename, bamIndexFilename);\r
-\r
- string header = bReader.GetHeaderText();\r
- cerr << "Printing header text..." << endl;\r
- if ( header.empty() ) {\r
- cerr << "No header provided." << endl << endl;\r
- } else {\r
- cerr << "----------------------" << endl;\r
- cerr << "Header Text: " << endl;\r
- cerr << "----------------------" << endl;\r
- cerr << header << endl << endl;\r
- }\r
-\r
- RefVector references = bReader.GetReferenceData();\r
- cerr << "Printing references..." << endl;\r
- RefVector::const_iterator refIter = references.begin();\r
- RefVector::const_iterator refEnd = references.end();\r
- for ( ; refIter != refEnd; ++refIter) {\r
- cerr << "Reference entry: " << endl;\r
- cerr << (*refIter).RefName << endl;\r
- cerr << (*refIter).RefLength << endl;\r
- cerr << "Has alignments? : " << ( ((*refIter).RefHasAlignments) ? "yes" : "no" ) << endl;\r
- }\r
- cerr << endl;\r
-\r
- alignmentsRead = 0;\r
- while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) {\r
- cerr << "Alignment " << alignmentsRead << endl;\r
- cerr << bAlignment.Name << endl;\r
- cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl;\r
- ++alignmentsRead;\r
- }\r
-\r
- cerr << "Jumping in BAM file" << endl;\r
- if ( bReader.Jump(1, 5000) ) {\r
- cerr << "Jumping seems ok - getting first 10 alignments that start at or after chr2:5000" << endl;\r
-\r
- alignmentsRead = 0;\r
- while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) {\r
- if ( bAlignment.Position >= 5000 ) { \r
- cerr << "Alignment " << alignmentsRead << endl;\r
- cerr << bAlignment.Name << endl;\r
- cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl;\r
- ++alignmentsRead;\r
- }\r
- }\r
- }\r
-\r
- cerr << "Rewinding BAM file" << endl;\r
- if ( bReader.Rewind() ) {\r
- cerr << "Rewind seems to be ok - getting first 10 alignments" << endl;\r
- \r
- alignmentsRead = 0;\r
- while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) {\r
- cerr << "Alignment " << alignmentsRead << endl;\r
- cerr << bAlignment.Name << endl;\r
- cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl;\r
- ++alignmentsRead;\r
- }\r
- }\r
-\r
- cerr << "Closing BAM file..." << endl << endl;\r
- bReader.Close();\r
-\r
- cerr << "Exiting..." << endl << endl;\r
- return 0;\r
-}\r
--- /dev/null
+// ***************************************************************************\r
+// BamTrimMain.cpp (c) 2009 Derek Barnett\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 15 July 2009 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Basic example of reading/writing BAM files. Pulls alignments overlapping \r
+// the range specified by user from one BAM file and writes to a new BAM file.\r
+// ***************************************************************************\r
+\r
+// Std C/C++ includes\r
+#include <cstdlib>\r
+#include <iostream>\r
+#include <string>\r
+using namespace std;\r
+\r
+// BamTools includes\r
+#include "BamReader.h"\r
+#include "BamWriter.h"\r
+using namespace BamTools;\r
+\r
+int main(int argc, char* argv[]) {\r
+\r
+ // validate argument count\r
+ if( argc != 7 ) {\r
+ cerr << "USAGE: " << argv[0] << " <input BAM file> <input BAM index file> <output BAM file> <reference name> <leftBound> <rightBound> " << endl;\r
+ exit(1);\r
+ }\r
+\r
+ // store arguments\r
+ string inBamFilename = argv[1];\r
+ string indexFilename = argv[2];\r
+ string outBamFilename = argv[3];\r
+ string referenceName = argv[4];\r
+ string leftBound_str = argv[5];\r
+ string rightBound_str = argv[6];\r
+\r
+ // open our BAM reader\r
+ BamReader reader;\r
+ reader.Open(inBamFilename, indexFilename);\r
+\r
+ // get header & reference information\r
+ string header = reader.GetHeaderText();\r
+ RefVector references = reader.GetReferenceData();\r
+ \r
+ // open our BAM writer\r
+ BamWriter writer;\r
+ writer.Open(outBamFilename, header, references);\r
+\r
+ // get reference ID from name\r
+ int refID = 0;\r
+ RefVector::const_iterator refIter = references.begin();\r
+ RefVector::const_iterator refEnd = references.end();\r
+ for ( ; refIter != refEnd; ++refIter ) {\r
+ if ( (*refIter).RefName == referenceName ) { break; }\r
+ ++refID;\r
+ }\r
+ \r
+ // validate ID\r
+ if ( refIter == refEnd ) {\r
+ cerr << "Reference: " << referenceName << " not found." << endl;\r
+ exit(1);\r
+ }\r
+\r
+ // convert boundary arguments to numeric values\r
+ unsigned int leftBound = (unsigned int) atoi( leftBound_str.c_str() );\r
+ unsigned int rightBound = (unsigned int) atoi( rightBound_str.c_str() );\r
+ \r
+ // attempt jump to range of interest\r
+ if ( reader.Jump(refID, leftBound) ) {\r
+ \r
+ // while data exists and alignment begin before right bound\r
+ BamAlignment bAlignment;\r
+ while ( reader.GetNextAlignment(bAlignment) && (bAlignment.Position <= rightBound) ) {\r
+ // save alignment to archive\r
+ writer.SaveAlignment(bAlignment);\r
+ }\r
+ } \r
+ \r
+ // if jump failed\r
+ else {\r
+ cerr << "Could not jump to ref:pos " << referenceName << ":" << leftBound << endl;\r
+ exit(1);\r
+ }\r
+\r
+ // clean up and exit\r
+ reader.Close();\r
+ writer.Close(); \r
+ return 0;\r
+}
\ No newline at end of file
// ***************************************************************************
-// BamWriter (c) 2009 Michael Strömberg
-// Marth Lab, Deptartment of Biology, Boston College
+// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
+// Last modified: 24 June 2009 (DB)
+// ---------------------------------------------------------------------------
// The BGZF routines were adapted from the bgzf.c code developed at the Broad
// Institute.
// ---------------------------------------------------------------------------
// ***************************************************************************
#include "BamWriter.h"
+using namespace BamTools;
+using namespace std;
// constructor
BamWriter::BamWriter(void)
// ***************************************************************************
-// BamWriter (c) 2009 Michael Strömberg
-// Marth Lab, Deptartment of Biology, Boston College
+// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
+// Last modified: 24 June 2009 (DB)
+// ---------------------------------------------------------------------------
// The BGZF routines were adapted from the bgzf.c code developed at the Broad
// Institute.
// ---------------------------------------------------------------------------
// Provides the basic functionality for producing BAM files
// ***************************************************************************
+/*!
+ \file BamWriter.h
+ \brief API for writing BAM files.
+*/
+
#pragma once
+// C++ includes
#include <string>
#include <vector>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <zlib.h>
-#include "BamAlignment.h"
-
-using namespace std;
-
-// our zlib constants
-#define GZIP_ID1 31
-#define GZIP_ID2 139
-#define CM_DEFLATE 8
-#define FLG_FEXTRA 4
-#define OS_UNKNOWN 255
-#define BGZF_XLEN 6
-#define BGZF_ID1 66
-#define BGZF_ID2 67
-#define BGZF_LEN 2
-#define GZIP_WINDOW_BITS -15
-#define Z_DEFAULT_MEM_LEVEL 8
-
-// our BZGF constants
-#define BLOCK_HEADER_LENGTH 18
-#define BLOCK_FOOTER_LENGTH 8
-#define MAX_BLOCK_SIZE 65536
-#define DEFAULT_BLOCK_SIZE 65536
-
-// our BAM constants
-#define BAM_CORE_SIZE 32
-#define BAM_CMATCH 0
-#define BAM_CINS 1
-#define BAM_CDEL 2
-#define BAM_CREF_SKIP 3
-#define BAM_CSOFT_CLIP 4
-#define BAM_CHARD_CLIP 5
-#define BAM_CPAD 6
-#define BAM_CIGAR_SHIFT 4
-#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
+// zlib includes
+#include <zlib.h>
-// our variable sizes
-#define SIZEOF_INT 4
+// BamTools includes
+#include "BamAux.h"
-// define our BZGF structure
-#ifndef BGZF_DATA
-#define BGZF_DATA
-struct BgzfData {
- unsigned int UncompressedBlockSize;
- unsigned int CompressedBlockSize;
- unsigned int BlockLength;
- unsigned int BlockOffset;
- uint64_t BlockAddress;
- bool IsOpen;
- FILE* Stream;
- char* UncompressedBlock;
- char* CompressedBlock;
+namespace BamTools {
- // constructor
- BgzfData(void)
- : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
- , CompressedBlockSize(MAX_BLOCK_SIZE)
- , BlockLength(0)
- , BlockOffset(0)
- , BlockAddress(0)
- , IsOpen(false)
- , Stream(NULL)
- , UncompressedBlock(NULL)
- , CompressedBlock(NULL)
- {
- try {
- CompressedBlock = new char[CompressedBlockSize];
- UncompressedBlock = new char[UncompressedBlockSize];
- } catch(bad_alloc&) {
- printf("ERROR: Unable to allocate memory for our BGZF object.\n");
- exit(1);
- }
- }
+ //! API for writing BAM files.
+ class BamWriter {
+
+ public:
+
+ //! Constructor.
+ BamWriter(void);
+
+ //! Destructor.
+ ~BamWriter(void);
+
+ public:
+
+ /*!
+ \brief Closes the alignment archive
+ \sa Open()
+ */
+ void Close(void);
+
+ /*!
+ \brief Opens the alignment archive
+ \param filename output BAM file
+ \param samHeader SAM-format header text
+ \param referenceSequences Reference sequence data
+ \sa Close()
+ */
+ void Open(const std::string& filename, const std::string& samHeader, const RefVector& referenceSequences);
+
+ /*!
+ \brief Saves an alignment to the archive
+ \param al The BamAlignment to be saved
+ */
+ void SaveAlignment(const BamAlignment& al);
+
+ // --------------------------------------------------------------------------------------
+ // internal methods
+ private:
+ // closes the BAM file
+ void BgzfClose(void);
+ // compresses the current block
+ int BgzfDeflateBlock(void);
+ // flushes the data in the BGZF block
+ void BgzfFlushBlock(void);
+ // opens the BAM file for writing
+ void BgzfOpen(const std::string& filename);
+ // packs an unsigned integer into the specified buffer
+ static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value);
+ // packs an unsigned short into the specified buffer
+ static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value);
+ // writes the supplied data into the BGZF buffer
+ unsigned int BgzfWrite(const char* data, const unsigned int dataLen);
+ // calculates the minimum bin that contains a region [begin, end)
+ static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end);
+ // creates a packed cigar string from the supplied alignment
+ static void CreatePackedCigar(const std::vector<CigarOp>& cigarOperations, std::string& packedCigar);
+ // encodes the supplied query sequence into 4-bit notation
+ static void EncodeQuerySequence(const std::string& query, std::string& encodedQuery);
+ // our BGZF output object
+ BgzfData mBGZF;
+ };
- // destructor
- ~BgzfData(void) {
- if(CompressedBlock) delete [] CompressedBlock;
- if(UncompressedBlock) delete [] UncompressedBlock;
+ //! \cond
+ // --------------------------------------------------------------------------------------
+ // static inline methods (internal - can exclude from main documentation)
+
+ // packs an unsigned integer into the specified buffer
+ inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) {
+ buffer[0] = (char)value;
+ buffer[1] = (char)(value >> 8);
+ buffer[2] = (char)(value >> 16);
+ buffer[3] = (char)(value >> 24);
}
-};
-#endif // BGZF_DATA
-class BamWriter {
-public:
- // constructor
- BamWriter(void);
- // destructor
- ~BamWriter(void);
- // closes the alignment archive
- void Close(void);
- // opens the alignment archive
- void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences);
- // saves the alignment to the alignment archive
- void SaveAlignment(const BamAlignment& al);
-private:
- // closes the BAM file
- void BgzfClose(void);
- // compresses the current block
- int BgzfDeflateBlock(void);
- // flushes the data in the BGZF block
- void BgzfFlushBlock(void);
- // opens the BAM file for writing
- void BgzfOpen(const string& filename);
- // packs an unsigned integer into the specified buffer
- static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value);
// packs an unsigned short into the specified buffer
- static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value);
- // writes the supplied data into the BGZF buffer
- unsigned int BgzfWrite(const char* data, const unsigned int dataLen);
- // calculates the minimum bin that contains a region [begin, end)
- static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end);
- // creates a packed cigar string from the supplied alignment
- static void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);
- // encodes the supplied query sequence into 4-bit notation
- static void EncodeQuerySequence(const string& query, string& encodedQuery);
- // our BGZF output object
- BgzfData mBGZF;
-};
-
-// packs an unsigned integer into the specified buffer
-inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) {
- buffer[0] = (char)value;
- buffer[1] = (char)(value >> 8);
- buffer[2] = (char)(value >> 16);
- buffer[3] = (char)(value >> 24);
-}
-
-// packs an unsigned short into the specified buffer
-inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) {
- buffer[0] = (char)value;
- buffer[1] = (char)(value >> 8);
-}
+ inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) {
+ buffer[0] = (char)value;
+ buffer[1] = (char)(value >> 8);
+ }
+ // --------------------------------------------------------------------------------------
+ //! \endcond
+
+} // end BamTools namespace
\ No newline at end of file
CXX= g++\r
CXXFLAGS= -Wall -O3\r
-PROG= BamReaderTest BamConversion\r
+PROG= BamDump BamTrim\r
LIBS= -lz\r
\r
all: $(PROG)\r
\r
-BamReaderTest: BamReader.o BamReaderMain.o\r
- $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamReaderMain.o $(LIBS)\r
+BamDump: BamReader.o BamDumpMain.o\r
+ $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamDumpMain.o $(LIBS)\r
\r
-BamConversion: BamReader.o BamWriter.o BamConversionMain.o\r
- $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamConversionMain.o $(LIBS)\r
-\r
-BamMerge: BamReader.o BamWriter.o BamMerge.o\r
- $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamMerge.o $(LIBS)\r
+BamTrim: BamReader.o BamWriter.o BamTrimMain.o\r
+ $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamTrimMain.o $(LIBS)\r
\r
clean:\r
- rm -fr gmon.out *.o *.a a.out *~\r
+ rm -fr gmon.out *.o *.a a.out *~\r
----------------
-README
-----------------
-
-
-Significant re-write of BamReader API as of 5/18/2009:
--------------------------------------------------------
-No longer using bgzf.h/.c
-No longer using libbambc.a library
-No longer using STL Utilities
-Minor changes to API syntax in opening BAM files with BamReader - see BamReader.h for details
-
-
-Compile instructions
-----------------------
-Short version:
-See Makefile for compilation example.
-
-Longer version:
-Copy BamAlignment.h, BamReader.* and/or BamWriter.* to working directory.
-#include "BamReader.h" and/or "BamWriter.h"
-Use standard compile commands, including the "-lz" parameter to access zlib functionality.
-(Note that "-L. -lbambc" are no longer necessary and WILL CAUSE PROBLEMS for compiler/linker )
-
-
-Feel free to email me or pop by my desk with any questions, comments, suggestions, etc.
- - Derek
-
+------------------------------------------------------------
+ README
+------------------------------------------------------------
+
+BamTools: a C++ API for reading/writing BAM files.
+
+The API consists of 2 main modules - BamReader and BamWriter. As you would expect,
+BamReader provides read-access to BAM files, while BamWriter does the writing of BAM
+files. BamReader provides an interface for random-access (jumping) in a BAM file.
+
+An additional file, BamAux.h, is included as well.
+This file contains the common data structures and typedefs used throught the API.
+
+------------------------------------------------------------
+
+To use this API, you simply need to do 3 things:
+
+ 1 - Drop the BamTools files somewhere the compiler can find them.
+ (i.e. in source directory, or somewhere in include path)
+
+ 2 - Import BamTools API with the following lines of code
+
+ #include "BamReader.h" // as needed
+ #include "BamWriter.h" // as needed
+ using namespace BamTools;
+
+ 3 - Compile with '-lz' ('l' as in 'lion') to access ZLIB compression library
+
+See any included programs and Makefile for more specific compiling/usage examples.
+See documentation & comments in header files for API details.
+
+------------------------------------------------------------
+
+Feel free to contact me with any questions, comments, suggestions, bug reports, etc.
+ - Derek Barnett
+
+http://sourceforge.net/projects/bamtools