From: barnett Date: Wed, 15 Jul 2009 18:02:28 +0000 (+0000) Subject: Full update to SVN after combining BamReader and BamWriter into cohesive BamTools... X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=b81cd242cdc5eb133a21e9d63f6d933766abc994;p=bamtools.git Full update to SVN after combining BamReader and BamWriter into cohesive BamTools API. git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@20 9efb377e-2e27-44b9-b91a-ec4abb80ed8b --- diff --git a/BamAlignment.h b/BamAlignment.h deleted file mode 100644 index c69229d..0000000 --- a/BamAlignment.h +++ /dev/null @@ -1,176 +0,0 @@ -// BamAlignment.h - -// Derek Barnett -// Marth Lab, Boston College -// Last modified: 20 March 2009 - -#ifndef BAMALIGNMENT_H -#define BAMALIGNMENT_H - -#include -#include - -#ifdef WIN32 -typedef char int8_t; -typedef unsigned char uint8_t; -typedef short int16_t; -typedef unsigned short uint16_t; -typedef int int32_t; -typedef unsigned int uint32_t; -typedef long long int64_t; -typedef unsigned long long uint64_t; -#else -#include -#endif - -// C++ includes -#include -using std::string; - -#include -using std::vector; - -struct CigarOp { - uint32_t Length; - char Type; -}; - -struct RefData { - string RefName; - unsigned int RefLength; - bool RefHasAlignments; - - // constructor - RefData(void) - : RefLength(0) - , RefHasAlignments(false) - { } -}; - -typedef vector RefVector; - -struct BamAlignment { - - // queries against alignment flag - see below for further detail -public: - bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } - bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } - bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } - bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } - bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } - bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } - bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } - bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } - bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } - bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } - bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } - - // returns true and assigns the read group if present in the tag data - bool GetReadGroup(string& readGroup) const { - - if(TagData.empty()) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLen = TagData.size(); - unsigned int numBytesParsed = 0; - - bool foundReadGroupTag = false; - while(numBytesParsed < tagDataLen) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - - // check the current tag - if(strncmp(pTagType, "RG", 2) == 0) { - foundReadGroupTag = true; - break; - } - - // get the storage class and find the next tag - SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed); - } - - // return if the read group tag was not present - if(!foundReadGroupTag) return false; - - // assign the read group - const unsigned int readGroupLen = strlen(pTagData); - readGroup.resize(readGroupLen); - memcpy((char*)readGroup.data(), pTagData, readGroupLen); - return true; - } - - // skips to the next tag - static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { - switch(storageType) { - case 'A': - case 'c': - case 'C': - numBytesParsed++; - pTagData++; - break; - case 's': - case 'S': - case 'f': - numBytesParsed += 2; - pTagData += 2; - break; - case 'i': - case 'I': - numBytesParsed += 4; - pTagData += 4; - break; - case 'Z': - case 'H': - while(*pTagData) { - numBytesParsed++; - pTagData++; - } - break; - default: - printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData); - exit(1); - } - } - - // data members -public: - string Name; // read name - unsigned int Length; // query length - string QueryBases; // original sequence ( produced from machine ) - string AlignedBases; // aligned sequence ( with indels ) - string Qualities; // FASTQ qualities ( still in ASCII characters ) - string TagData; // contains the tag data (accessor methods will pull the requested information out) - unsigned int RefID; // ID for reference sequence - unsigned int Position; // position on reference sequence where alignment starts - unsigned int Bin; // bin in BAM file where this alignment resides - unsigned int MapQuality; // mapping quality - unsigned int AlignmentFlag; // see above for available queries - vector CigarData; // vector of CIGAR operations (length & type) ) - unsigned int MateRefID; // ID for reference sequence that mate was aligned to - unsigned int MatePosition; // position that mate was aligned to - unsigned int InsertSize; // mate pair insert size - - // alignment flag query constants -private: - enum { PAIRED = 1, // Alignment comes from paired-end data - PROPER_PAIR = 2, // Alignment passed paired-end resolution - UNMAPPED = 4, // Read is unmapped - MATE_UNMAPPED = 8, // Mate is unmapped - REVERSE = 16, // Read is on reverse strand - MATE_REVERSE = 32, // Mate is on reverse strand - READ_1 = 64, // This alignment is mate 1 of pair - READ_2 = 128, // This alignment is mate 2 of pair - SECONDARY = 256, // This alignment is not the primary (best) alignment for read - QC_FAILED = 512, // Read did not pass prior quality control steps - DUPLICATE = 1024 // Read is PCR duplicate - }; -}; - -// commonly used vector in this library -typedef vector< BamAlignment > BamAlignmentVector; - -#endif /* BAMALIGNMENT_H */ diff --git a/BamAux.h b/BamAux.h new file mode 100644 index 0000000..9c088dd --- /dev/null +++ b/BamAux.h @@ -0,0 +1,340 @@ +// *************************************************************************** +// BamAux.h (c) 2009 Derek Barnett, Michael Strömberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 24 June 2009 (DB) +// --------------------------------------------------------------------------- +// The BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- +// Defines common constants, typedefs, & data structures for BamTools. +// *************************************************************************** + +/*! + \file BamAux.h + \brief BamTools constants, typedefs, & data structures +*/ + +#pragma once + +// C++ includes +#include +#include +#include +#include + +// C includes +#include +#include +#include + +// Platform-specific type definitions +#ifdef WIN32 + typedef char int8_t; + typedef unsigned char uint8_t; + typedef short int16_t; + typedef unsigned short uint16_t; + typedef int int32_t; + typedef unsigned int uint32_t; + typedef long long int64_t; + typedef unsigned long long uint64_t; +#else + #include +#endif + +//! \namespace BamTools +namespace BamTools { + + //! \cond + // -------------------------------------------------------------------------------------- + // This section is purely internal and can be excluded from main generated documentation. + + // zlib constants + const int GZIP_ID1 = 31; + const int GZIP_ID2 = 139; + const int CM_DEFLATE = 8; + const int FLG_FEXTRA = 4; + const int OS_UNKNOWN = 255; + const int BGZF_XLEN = 6; + const int BGZF_ID1 = 66; + const int BGZF_ID2 = 67; + const int BGZF_LEN = 2; + const int GZIP_WINDOW_BITS = -15; + const int Z_DEFAULT_MEM_LEVEL = 8; + + // BZGF constants + const int BLOCK_HEADER_LENGTH = 18; + const int BLOCK_FOOTER_LENGTH = 8; + const int MAX_BLOCK_SIZE = 65536; + const int DEFAULT_BLOCK_SIZE = 65536; + + // BAM constants + const int BAM_CORE_SIZE = 32; + const int BAM_CMATCH = 0; + const int BAM_CINS = 1; + const int BAM_CDEL = 2; + const int BAM_CREF_SKIP = 3; + const int BAM_CSOFT_CLIP = 4; + const int BAM_CHARD_CLIP = 5; + const int BAM_CPAD = 6; + const int BAM_CIGAR_SHIFT = 4; + const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); + + // BAM index constants + const int MAX_BIN = 37450; // =(8^6-1)/7+1 + const int BAM_MIN_CHUNK_GAP = 32768; + const int BAM_LIDX_SHIFT = 14; + + // Explicit variable sizes + const int SIZEOF_INT = 4; + + struct BgzfData { + unsigned int UncompressedBlockSize; + unsigned int CompressedBlockSize; + unsigned int BlockLength; + unsigned int BlockOffset; + uint64_t BlockAddress; + bool IsOpen; + FILE* Stream; + char* UncompressedBlock; + char* CompressedBlock; + + // constructor + BgzfData(void) + : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) + , CompressedBlockSize(MAX_BLOCK_SIZE) + , BlockLength(0) + , BlockOffset(0) + , BlockAddress(0) + , IsOpen(false) + , Stream(NULL) + , UncompressedBlock(NULL) + , CompressedBlock(NULL) + { + try { + CompressedBlock = new char[CompressedBlockSize]; + UncompressedBlock = new char[UncompressedBlockSize]; + } catch( std::bad_alloc& ba ) { + printf("ERROR: Unable to allocate memory for our BGZF object.\n"); + exit(1); + } + } + + // destructor + ~BgzfData(void) { + if(CompressedBlock) delete [] CompressedBlock; + if(UncompressedBlock) delete [] UncompressedBlock; + } + }; + //! \endcond + + // -------------------------------------------------------------------------------------- + // Data structures + + //! \brief Cigar operation data structure + struct CigarOp { + char Type; //!< Operation type (MIDNSHP) + uint32_t Length; //!< Operation length (number of bases) + + }; + + //! Reference sequence data structure + struct RefData { + std::string RefName; //!< Name of reference sequence + unsigned int RefLength; //!< Length of reference sequence + bool RefHasAlignments; //!< True if BAM file contains alignments mapped to reference sequence + + // constructor + RefData(void) + : RefLength(0) + , RefHasAlignments(false) + { } + }; + + //! BAM alignment data structure + struct BamAlignment { + + // Queries against alignment flag + public: + //! Returns true if this read is a PCR duplicate (determined by external app) + bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } + //! Returns true if this read failed quality control (determined by external app) + bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } + //! Returns true if alignment is first mate on read + bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } + //! Returns true if alignment is mapped + bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } + //! Returns true if alignment's mate is mapped + bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } + //! Returns true if alignment's mate mapped to reverse strand + bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } + //! Returns true if alignment part of paired-end read + bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } + //! Returns true if this position is primary alignment (determined by external app) + bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } + //! Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app) + bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } + //! Returns true if alignment mapped to reverse strand + bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } + //! Returns true if alignment is second mate on read + bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } + + public: + /*! + \brief Get alignment's read group text. + + Assigns read group text to readGroup. + + \return True if read group data successfully retrieved. + */ + bool GetReadGroup(std::string& readGroup) const { + + if ( TagData.empty() ) { return false; } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLen = TagData.size(); + unsigned int numBytesParsed = 0; + + bool foundReadGroupTag = false; + while( numBytesParsed < tagDataLen ) { + + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + + // check the current tag + if ( strncmp(pTagType, "RG", 2) == 0 ) { + foundReadGroupTag = true; + break; + } + + // get the storage class and find the next tag + SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed ); + } + + // return if the read group tag was not present + if ( !foundReadGroupTag ) { return false; } + + // assign the read group + const unsigned int readGroupLen = strlen(pTagData); + readGroup.resize(readGroupLen); + memcpy( (char*)readGroup.data(), pTagData, readGroupLen ); + return true; + } + + private: + // skips to the next tag + static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { + switch(storageType) { + + case 'A': + case 'c': + case 'C': + ++numBytesParsed; + ++pTagData; + break; + + case 's': + case 'S': + case 'f': + numBytesParsed += 2; + pTagData += 2; + break; + + case 'i': + case 'I': + numBytesParsed += 4; + pTagData += 4; + break; + + case 'Z': + case 'H': + while(*pTagData) { + ++numBytesParsed; + ++pTagData; + } + break; + + default: + printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData); + exit(1); + } + } + + // Data members + public: + std::string Name; //!< Read name + unsigned int Length; //!< Query length + std::string QueryBases; //!< 'Original' sequence (as reported from sequencing machine) + std::string AlignedBases; //!< 'Aligned' sequence (includes any indels, padding, clipping) + std::string Qualities; //!< FASTQ qualities (ASCII characters, not numeric values) + std::string TagData; //!< Tag data (accessor methods will pull the requested information out) + unsigned int RefID; //!< ID number for reference sequence + unsigned int Position; //!< Position (0-based) where alignment starts + unsigned int Bin; //!< Bin in BAM file where this alignment resides + unsigned int MapQuality; //!< Mapping quality score + unsigned int AlignmentFlag; //!< Alignment bit-flag - see Is() methods for available queries + std::vector CigarData; //!< CIGAR operations for this alignment + unsigned int MateRefID; //!< ID number for reference sequence where alignment's mate was aligned + unsigned int MatePosition; //!< Position (0-based) where alignment's mate starts + unsigned int InsertSize; //!< Mate-pair insert size + + // Alignment flag query constants + private: + enum { PAIRED = 1, + PROPER_PAIR = 2, + UNMAPPED = 4, + MATE_UNMAPPED = 8, + REVERSE = 16, + MATE_REVERSE = 32, + READ_1 = 64, + READ_2 = 128, + SECONDARY = 256, + QC_FAILED = 512, + DUPLICATE = 1024 + }; + }; + + // ---------------------------------------------------------------- + // Typedefs + + /*! + \typedef RefVector + \brief Vector of RefData objects + */ + typedef std::vector RefVector; + + /*! + \typedef BamAlignmentVector + \brief Vector of BamAlignments + */ + typedef std::vector< BamAlignment > BamAlignmentVector; + + //! \cond + // ---------------------------------------------------------------- + // Typedefs (internal - can exclude from main documentation) + + //Offsets for linear indexing + typedef std::vector LinearOffsetVector; + + // Alignment 'chunk' boundaries + typedef std::pair ChunkPair; + // Vector of alignment 'chunks' + typedef std::vector ChunkVector; + + // BAM bin contains a bin ID & a vector of alignment 'chunks' + typedef std::pair BamBin; + // Vector of BAM bins + typedef std::vector BinVector; + + // Reference sequence index data + typedef std::pair RefIndex; + + // Full BAM file index data structure + typedef std::vector BamIndex; + // ---------------------------------------------------------------- + //! \endcond +} diff --git a/BamConversionMain.cpp b/BamConversionMain.cpp deleted file mode 100644 index 0e6333b..0000000 --- a/BamConversionMain.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include -#include "BamReader.h" -#include "BamWriter.h" - -using namespace std; - -int main(int argc, char* argv[]) { - - if(argc != 3) { - cout << "USAGE: " << argv[0] << " " << endl; - exit(1); - } - - // localize our arguments - const char* inputFilename = argv[1]; - const char* outputFilename = argv[2]; - - // open our BAM reader - BamReader reader; - reader.Open(inputFilename); - - // retrieve the SAM header text - string samHeader = reader.GetHeaderText(); - - // retrieve the reference sequence vector - RefVector referenceSequences = reader.GetReferenceData(); - - // open the BAM writer - BamWriter writer; - writer.Open(outputFilename, samHeader, referenceSequences); - - // copy all of the reads from the input file to the output file - BamAlignment al; - while(reader.GetNextAlignment(al)) writer.SaveAlignment(al); - - // close our files - reader.Close(); - writer.Close(); - - return 0; -} diff --git a/BamDumpMain.cpp b/BamDumpMain.cpp new file mode 100644 index 0000000..f76852a --- /dev/null +++ b/BamDumpMain.cpp @@ -0,0 +1,55 @@ +// *************************************************************************** +// BamDump.cpp (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 15 July 2009 (DB) +// --------------------------------------------------------------------------- +// Spits out all alignments in BAM file. +// +// N.B. - Could result in HUGE text file. This is mostly a debugging tool +// for small files. You have been warned. +// *************************************************************************** + +// Std C/C++ includes +#include +#include +#include +using namespace std; + +// BamTools includes +#include "BamReader.h" +#include "BamWriter.h" +using namespace BamTools; + +void PrintAlignment(const BamAlignment&); + +int main(int argc, char* argv[]) { + + // validate argument count + if( argc != 2 ) { + cerr << "USAGE: " << argv[0] << " " << endl; + exit(1); + } + + string filename = argv[1]; + cout << "Printing alignments from file: " << filename << endl; + + BamReader reader; + reader.Open(filename); + + BamAlignment bAlignment; + while (reader.GetNextAlignment(bAlignment)) { + PrintAlignment(bAlignment); + } + + reader.Close(); + return 0; +} + +// Spit out basic BamAlignment data +void PrintAlignment(const BamAlignment& alignment) { + cout << "---------------------------------" << endl; + cout << "Name: " << alignment.Name << endl; + cout << "Aligned to: " << alignment.RefID << ":" << alignment.Position << endl; +} diff --git a/BamMerge.cpp b/BamMerge.cpp deleted file mode 100644 index bfa7908..0000000 --- a/BamMerge.cpp +++ /dev/null @@ -1,206 +0,0 @@ - -#include -#include -#include "BamReader.h" -#include "BamWriter.h" - -using namespace std; - -int main(int argc, char* argv[]) { - - for (int a=0; a []" << endl; - exit(1); - } - - // localize our arguments - const char* inputFilename1 = argv[1]; - const char* inputFilename2 = argv[2]; - const char* outputFilename = argv[3]; - unsigned int Nmax=0; - if (argc>4) { - const char* cmaxRead = argv[4]; - Nmax = atoi(cmaxRead); - } - - - // open our BAM reader1 for input file 1 - BamReader reader1; - reader1.SetFilename(inputFilename1); - - // open our BAM reader2 for input file 2 - BamReader reader2; - reader2.SetFilename(inputFilename2); - - // check reader1 - if(!reader1.Open()) { - cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename1 << ")." << endl; - exit(1); - } - - // check reader2 - if(!reader2.Open()) { - cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename2 << ")." << endl; - exit(1); - } - - // retrieve the header text from both files - string samHeader1 = reader1.GetHeaderText(); - string samHeader2 = reader2.GetHeaderText(); - - // retrieve the reference sequence vectors - RefVector referenceSequences1 = reader1.GetReferenceData(); - RefVector referenceSequences2 = reader2.GetReferenceData(); - - // merged referenceSequences - RefVector referenceSequences12; - - // process reference sequences from file 1 - map > refLengthMap; - map > refIdMap; - map > refDataMap; - - cerr << "Reference list from file 1" << endl; - unsigned int refCounter = 0; - for (RefVector::const_iterator refIter = referenceSequences1.begin(); efIter != referenceSequences1.end(); refIter++) { - - // retrieve - RefData rd = *refIter; - - // get member data - string refName = rd.RefName; - unsigned int refLength = rd.RefLength; - int refId = reader1.GetRefID(refName); - - // report - cerr << " refName=" << refName << " refId=" << refId << endl; - - // store in maps - refLengthMap[refName] = refLength; - refIdMap[refName] = refCounter; - refDataMap[refName] = rd; - - // add this to merged refVector - referenceSequences12.push_back(rd); - - // increment ref count - refCounter++; - } - - // map file 2 refId to merged refID - map > mapRefId12; - - cerr << "Reference list from file 2" << endl; - for (RefVector::const_iterator refIter = referenceSequences2.begin(); refIter != referenceSequences2.end(); refIter++) { - - // retrieve - RefData rd = *refIter; - - // get member data - string refName = rd.RefName; - unsigned int refLength = rd.RefLength; - int refId = reader2.GetRefID(refName); - - // report - cerr << " refName=" << refName << " refId=" << refId << endl; - - // if refName already in map, check ref length - if (refLengthMap.count(refName) > 0) { - - // check if length is the same - if (refLengthMap[refName] != refLength) { - cerr << "Refernce name in two files with inconsistent lengths: " << refName << " ... exiting." << endl; - exit(1); - } - - // make ref id recoding entry - unsigned int refIdNew = refIdMap[refName]; - mapRefId12[refId] = refIdNew; - - } else { - // otherwise make new refId and RefData - // store in maps - refLengthMap[refName] = refLength; - refIdMap[refName] = refCounter; - refDataMap[refName] = rd; - - // make ref id recoding entry - mapRefId12[refId] = refCounter; - - // add this to merged refVector - referenceSequences12.push_back(rd); - - // increment ref count - refCounter++; - } - } - - cerr << "Length of referenceSequences vector=" << referenceSequences12.size() << endl; - - // open output bam file and write: - // empty header - string samHeader3 = ""; - - BamWriter writer; - writer.Open(outputFilename, samHeader3, referenceSequences12); - - // iterate through alignments from file 1 and write them unchanged - BamAlignment ba; - - bool keepgoing=true; - unsigned int ac1 = 0; - while(reader1.GetNextAlignment(ba) && keepgoing ) { - writer.SaveAlignment(ba); - ac1++; - keepgoing=(Nmax<1)||(ac1 - -#include -using std::cerr; -using std::endl; +using namespace BamTools; +using namespace std; // static character constants const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; @@ -31,8 +32,6 @@ BamReader::BamReader(void) // destructor BamReader::~BamReader(void) { - m_headerText.clear(); - ClearIndex(); Close(); } @@ -46,7 +45,8 @@ bool BamReader::BgzfCheckBlockHeader(char* header) { BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN && header[12] == BGZF_ID1 && header[13] == BGZF_ID2 && - BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN); + BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN + ); } // closes the BAM file @@ -94,7 +94,7 @@ void BamReader::BgzfOpen(const string& filename) { m_BGZF.Stream = fopen(filename.c_str(), "rb"); if(!m_BGZF.Stream) { - cerr << "ERROR: Unable to open the BAM file " << filename << "for reading." << endl; + printf("ERROR: Unable to open the BAM file %s for reading.\n", filename.c_str() ); exit(1); } @@ -254,23 +254,27 @@ void BamReader::ClearIndex(void) { BinVector::iterator binEnd = (aRef->first)->end(); for ( ; binIter != binEnd; ++binIter ) { ChunkVector* chunks = (*binIter).second; - if ( chunks ) { delete chunks; } + if ( chunks ) { delete chunks; chunks = NULL;} } delete aRef->first; + aRef->first = NULL; } // clear BAM linear offsets - if ( aRef->second ) { delete aRef->second; } + if ( aRef->second ) { delete aRef->second; aRef->second = NULL; } delete aRef; + aRef = NULL; } } delete m_index; + m_index = NULL; } } // closes the BAM file void BamReader::Close(void) { - if(m_BGZF.IsOpen) { BgzfClose(); } - if(m_index) { delete m_index; m_index = NULL; } + if(m_BGZF.IsOpen) { BgzfClose(); } + ClearIndex(); + m_headerText.clear(); m_isRegionSpecified = false; } @@ -382,12 +386,12 @@ bool BamReader::IsOverlap(BamAlignment& bAlignment) { return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft ); } -bool BamReader::Jump(int refID, unsigned int left) { +bool BamReader::Jump(int refID, unsigned int position) { // if index available, and region is valid - if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (left <= m_references.at(refID).RefLength) ) { + if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (position <= m_references.at(refID).RefLength) ) { m_currentRefID = refID; - m_currentLeft = left; + m_currentLeft = position; m_isRegionSpecified = true; int64_t offset = GetOffset(m_currentRefID, m_currentLeft); @@ -629,8 +633,10 @@ bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) { case ('S') : k += op.Length; // for 'S' - skip over query bases break; - case ('D') : - case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'D', 'P' - write padding; + case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character + break; + + case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character; break; case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence @@ -711,7 +717,7 @@ void BamReader::OpenIndex(const string& indexFilename) { // abort on error if(!indexStream) { - cerr << "ERROR: Unable to open the BAM index file " << indexFilename << "for reading." << endl; + printf("ERROR: Unable to open the BAM index file %s for reading.\n", indexFilename.c_str() ); exit(1); } diff --git a/BamReader.h b/BamReader.h index 17cb86a..d829891 100644 --- a/BamReader.h +++ b/BamReader.h @@ -1,259 +1,274 @@ // *************************************************************************** -// BamReader (c) 2009 Derek Barnett -// Marth Lab, Deptartment of Biology, Boston College +// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg +// Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- +// Last modified: 24 June 2009 (DB) +// --------------------------------------------------------------------------- +// The BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** -#pragma once +/*! + \file BamReader.h + \brief API for reading BAM files. +*/ -#include -#include -#include -#include +#pragma once +// C++ includes #include +#include #include #include #include -using namespace std; - -#include "BamAlignment.h" - -// our zlib constants -#define GZIP_ID1 31 -#define GZIP_ID2 139 -#define CM_DEFLATE 8 -#define FLG_FEXTRA 4 -#define OS_UNKNOWN 255 -#define BGZF_XLEN 6 -#define BGZF_ID1 66 -#define BGZF_ID2 67 -#define BGZF_LEN 2 -#define GZIP_WINDOW_BITS -15 -#define Z_DEFAULT_MEM_LEVEL 8 - -// our BZGF constants -#define BLOCK_HEADER_LENGTH 18 -#define BLOCK_FOOTER_LENGTH 8 -#define MAX_BLOCK_SIZE 65536 -#define DEFAULT_BLOCK_SIZE 65536 -// our BAM constants -#define BAM_CORE_SIZE 32 -#define BAM_CMATCH 0 -#define BAM_CINS 1 -#define BAM_CDEL 2 -#define BAM_CREF_SKIP 3 -#define BAM_CSOFT_CLIP 4 -#define BAM_CHARD_CLIP 5 -#define BAM_CPAD 6 -#define BAM_CIGAR_SHIFT 4 -#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) +// zlib includes +#include -// BAM indexing constants -#define MAX_BIN 37450 // =(8^6-1)/7+1 -#define BAM_MIN_CHUNK_GAP 32768 -#define BAM_LIDX_SHIFT 14 +// BamTools includes +#include "BamAux.h" -// our variable sizes -#define SIZEOF_INT 4 +namespace BamTools { -// define our BZGF structure -#ifndef BGZF_DATA -#define BGZF_DATA -struct BgzfData { - unsigned int UncompressedBlockSize; - unsigned int CompressedBlockSize; - unsigned int BlockLength; - unsigned int BlockOffset; - uint64_t BlockAddress; - bool IsOpen; - FILE* Stream; - char* UncompressedBlock; - char* CompressedBlock; + //! API for reading BAM files. + class BamReader { + + public: + + //! Constructor + BamReader(void); + + //! Destructor + ~BamReader(void); + + public: + + /*! + \brief Closes the BAM file. + + Also closes index file and clears index data, if provided. + + \sa Open() + */ + void Close(void); + + /*! + \brief Access SAM format header data. + + See SAM format documentation for detailed header description. + + \return Full header text (no parsing of tags) + */ + const std::string GetHeaderText(void) const; + + /*! + \brief Retrieve next alignment. + + Stores result in bAlignment. + + If reference and position are specified by a prior call to Jump(), this method stores the next aligmment that either: + a) overlaps, or + b) begins on/after that specified position. + + Otherwise this simply stores next alignment, if one exists. + + Note that this method does not specifiy a 'right bound' position. + If a range is desired, you should supply some stopping criteria. For example: + + \code + BamReader bReader; + bReader.Open(bamFile, bamIndexfile); + if ( bReader.Jump( someID, somePosition ) { + BamAlignment bAlignment; + while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= someUpperBound) ) { + // do something + } + } + \endcode + + \param bAlignment destination for alignment data + \return success/failure + \sa Jump(), Rewind() + */ + bool GetNextAlignment(BamAlignment& bAlignment); + + /*! + \brief Get number of reference sequences in BAM file. + \return Number of references + \sa GetReferenceData(), GetReferenceID() + */ + const int GetReferenceCount(void) const; + + /*! + \brief Access reference data. + \return Vector of RefData entry + \sa GetReferenceCount(), GetReferenceID() + */ + const RefVector GetReferenceData(void) const; + + /*! + \brief Get reference ID from name. + \param refName name of reference sequence + \return reference ID number + \sa GetReferenceCount(), GetReferenceData() + */ + const int GetReferenceID(const std::string& refName) const; + + /*! + \brief Random access in BAM file. + + Jump to a specified position on reference sequence. + Position is optional - defaults to beginning of specified reference. + + Reference and position are stored for use by subsequent calls to GetNextAlignment(). + + \param refID ID number of desired reference + \param position left-bound position + \return success/failure + \sa GetNextAlignment(), Rewind() + */ + bool Jump(int refID, unsigned int position = 0); + + /*! + \brief Opens a BAM file. + + Index file is optional - sequential reading through a BAM file does not require an index. + + However, the index is required to perform random-access alignment retrival (using the Jump() method ). + + See SAMtools documentation for BAM index generation. + + \param filename BAM file + \param indexFilename BAM index file + \sa Jump(), Close() + */ + void Open(const std::string& filename, const std::string& indexFilename = ""); + + /*! + \brief Moves file pointer to beginning of alignment data. + + A subsequent call to GetNextAlignment() would retrieve the first alignment in the BAM file. + Clears any reference and position set by a prior call to Jump() + + \return success/failure + \sa GetNextAlignment(), Jump() + */ + bool Rewind(void); + + // -------------------------------------------------------------------------------------- + // internal methods + private: + // checks BGZF block header + bool BgzfCheckBlockHeader(char* header); + // closes the BAM file + void BgzfClose(void); + // de-compresses the current block + int BgzfInflateBlock(int blockLength); + // opens the BAM file for reading + void BgzfOpen(const std::string& filename); + // reads BGZF data into a byte buffer + unsigned int BgzfRead(char* data, const unsigned int dataLen); + // reads BGZF block + int BgzfReadBlock(void); + // seek to position in BAM file + bool BgzfSeek(int64_t position); + // get file position in BAM file + int64_t BgzfTell(void); + // unpacks a buffer into an unsigned int + static inline unsigned int BgzfUnpackUnsignedInt(char* buffer); + // unpacks a buffer into an unsigned short + static inline unsigned short BgzfUnpackUnsignedShort(char* buffer); + // calculate bins that overlap region ( left to reference end for now ) + int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]); + // calculates alignment end position based on starting position and provided CIGAR operations + unsigned int CalculateAlignmentEnd(const unsigned int& position, const std::vector& cigarData); + // clear out (delete pointers in) index data structure + void ClearIndex(void); + // calculate file offset for first alignment chunk overlapping 'left' + int64_t GetOffset(int refID, unsigned int left); + // checks to see if alignment overlaps current region + bool IsOverlap(BamAlignment& bAlignment); + // retrieves header text from BAM file + void LoadHeaderData(void); + // builds BamIndex data structure from BAM index file + void LoadIndexData(FILE* indexStream); + // retrieves BAM alignment under file pointer + bool LoadNextAlignment(BamAlignment& bAlignment); + // builds reference data structure from BAM file + void LoadReferenceData(void); + // open BAM index file (if successful, loads index) + void OpenIndex(const std::string& indexFilename); + + // aligment file & index file data + private: + BgzfData m_BGZF; + std::string m_headerText; + BamIndex* m_index; + RefVector m_references; + bool m_isIndexLoaded; + int64_t m_alignmentsBeginOffset; + + // user-specified region values + private: + bool m_isRegionSpecified; + int m_currentRefID; + unsigned int m_currentLeft; + + // BAM character constants + private: + static const char* DNA_LOOKUP; + static const char* CIGAR_LOOKUP; + }; - // constructor - BgzfData(void) - : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) - , CompressedBlockSize(MAX_BLOCK_SIZE) - , BlockLength(0) - , BlockOffset(0) - , BlockAddress(0) - , IsOpen(false) - , Stream(NULL) - , UncompressedBlock(NULL) - , CompressedBlock(NULL) - { - try { - CompressedBlock = new char[CompressedBlockSize]; - UncompressedBlock = new char[UncompressedBlockSize]; - } catch(bad_alloc&) { - printf("ERROR: Unable to allocate memory for our BGZF object.\n"); - exit(1); - } + //! \cond + // -------------------------------------------------------------------------------------- + // static inline methods (internal - can exclude from main documentation) + + // unpacks a buffer into an unsigned int + inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) { + union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; } - // destructor - ~BgzfData(void) { - if(CompressedBlock) delete [] CompressedBlock; - if(UncompressedBlock) delete [] UncompressedBlock; + // unpacks a buffer into an unsigned short + inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) { + union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; } -}; -#endif // BGZF_DATA - - -// --------------------------- // -// BamIndex-related typedefs -// --------------------------- // - -// offset for linear indexing -typedef vector LinearOffsetVector; - -// alignment 'chunk' boundaries -typedef pair ChunkPair; -typedef vector ChunkVector; - -// BAM bins.. each contains (binID, alignment 'chunks') -typedef pair BamBin; -typedef vector BinVector; -// each reference sequence has a BinVector and LinearOffsetVector -typedef pair RefIndex; - -// full BamIndex defined as: -typedef vector BamIndex; - -class BamReader { - - // constructor/destructor - public: - BamReader(void); - ~BamReader(void); - - // public interface - public: - // closes the BAM file - void Close(void); - // retrieves header text - const string GetHeaderText(void) const; - // saves the alignment to the alignment archive - bool GetNextAlignment(BamAlignment& bAlignment); - // return number of reference sequences in BAM file - const int GetReferenceCount(void) const; - // return vector of RefData entries - const RefVector GetReferenceData(void) const; - // get refID from reference name - const int GetReferenceID(const string& refName) const; - // jumps to 'left' position on refID - bool Jump(int refID, unsigned int left = 0); - // opens the BAM file - void Open(const string& filename, const string& indexFilename = ""); - // move file pointer back to first alignment - bool Rewind(void); - - // internal methods - private: - // checks BGZF block header - bool BgzfCheckBlockHeader(char* header); - // closes the BAM file - void BgzfClose(void); - // de-compresses the current block - int BgzfInflateBlock(int blockLength); - // opens the BAM file for reading - void BgzfOpen(const string& filename); - // reads BGZF data into a byte buffer - unsigned int BgzfRead(char* data, const unsigned int dataLen); - // reads BGZF block - int BgzfReadBlock(void); - // seek to position in BAM file - bool BgzfSeek(int64_t position); - // get file position in BAM file - int64_t BgzfTell(void); - // unpacks a buffer into an unsigned int - static inline unsigned int BgzfUnpackUnsignedInt(char* buffer); - // unpacks a buffer into an unsigned short - static inline unsigned short BgzfUnpackUnsignedShort(char* buffer); - // calculate bins that overlap region ( left to reference end for now ) - int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]); - // calculates alignment end position based on starting position and provided CIGAR operations - unsigned int CalculateAlignmentEnd(const unsigned int& position, const vector& cigarData); - // clear out (delete pointers in) index data structure - void ClearIndex(void); - // calculate file offset for first alignment chunk overlapping 'left' - int64_t GetOffset(int refID, unsigned int left); - // checks to see if alignment overlaps current region - bool IsOverlap(BamAlignment& bAlignment); - // retrieves header text from BAM file - void LoadHeaderData(void); - // builds BamIndex data structure from BAM index file - void LoadIndexData(FILE* indexStream); - // retrieves BAM alignment under file pointer - bool LoadNextAlignment(BamAlignment& bAlignment); - // builds reference data structure from BAM file - void LoadReferenceData(void); - // open BAM index file (if successful, loads index) - void OpenIndex(const string& indexFilename); + // -------------------------------------------------------------------------------------- + // template classes/methods (internal - can exclude from main documentation) - // aligment file & index file data - private: - BgzfData m_BGZF; - string m_headerText; - BamIndex* m_index; - RefVector m_references; - bool m_isIndexLoaded; - int64_t m_alignmentsBeginOffset; - - // user-specified region values - private: - bool m_isRegionSpecified; - int m_currentRefID; - unsigned int m_currentLeft; - - // BAM character constants - private: - static const char* DNA_LOOKUP; - static const char* CIGAR_LOOKUP; -}; + // allows sorting/searching of a vector of pairs (instead of using maps) + template + class LookupKeyCompare { -// unpacks a buffer into an unsigned int -inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) { - union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// unpacks a buffer into an unsigned short -inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) { - union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -// allows sorting/searching of a vector of pairs (instead of using maps) -template -class LookupKeyCompare { + typedef std::pair< Key, Value > LookupData; + typedef typename LookupData::first_type Key_t; + + public: + bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); } + bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); } + bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); } + private: + bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; } + }; - typedef pair< Key, Value > LookupData; - typedef typename LookupData::first_type Key_t; + // return index of item if found, else return container.size() + template < typename InputIterator, typename EqualityComparable > + typename std::iterator_traits::difference_type + Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) { + return std::distance(begin, std::find(begin, end, item)); + } + //! \endcond - public: - bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); } - bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); } - bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); } - private: - bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; } -}; - -// return index of item if found, else return container.size() -template < typename InputIterator, typename EqualityComparable > -typename iterator_traits::difference_type -Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) { - return distance(begin, find(begin, end, item)); } - diff --git a/BamReaderMain.cpp b/BamReaderMain.cpp deleted file mode 100644 index d568fed..0000000 --- a/BamReaderMain.cpp +++ /dev/null @@ -1,106 +0,0 @@ -// BamReaderMain.cpp - -// Derek Barnett -// Marth Lab, Boston College -// Last modified: 12 May 2009 - -#include "BamReader.h" - -// C++ includes -#include -using std::cerr; -using std::cout; -using std::endl; - -#include -using std::vector; - -#include -using std::string; - -int main(int argc, char* argv[]) { - - // check command line parameters - if (argc != 3) { - cerr << endl; - cerr << "Usage: BamReaderTest " << endl; - cerr << endl; - exit(1); - } - - // get filenames from command line - const char* bamFilename = argv[1]; - const char* bamIndexFilename = argv[2]; - - int alignmentsRead = 0; - BamAlignment bAlignment; - - BamReader bReader; - cerr << endl << "Opening BAM file (and index)" << endl << endl; - bReader.Open(bamFilename, bamIndexFilename); - - string header = bReader.GetHeaderText(); - cerr << "Printing header text..." << endl; - if ( header.empty() ) { - cerr << "No header provided." << endl << endl; - } else { - cerr << "----------------------" << endl; - cerr << "Header Text: " << endl; - cerr << "----------------------" << endl; - cerr << header << endl << endl; - } - - RefVector references = bReader.GetReferenceData(); - cerr << "Printing references..." << endl; - RefVector::const_iterator refIter = references.begin(); - RefVector::const_iterator refEnd = references.end(); - for ( ; refIter != refEnd; ++refIter) { - cerr << "Reference entry: " << endl; - cerr << (*refIter).RefName << endl; - cerr << (*refIter).RefLength << endl; - cerr << "Has alignments? : " << ( ((*refIter).RefHasAlignments) ? "yes" : "no" ) << endl; - } - cerr << endl; - - alignmentsRead = 0; - while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) { - cerr << "Alignment " << alignmentsRead << endl; - cerr << bAlignment.Name << endl; - cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl; - ++alignmentsRead; - } - - cerr << "Jumping in BAM file" << endl; - if ( bReader.Jump(1, 5000) ) { - cerr << "Jumping seems ok - getting first 10 alignments that start at or after chr2:5000" << endl; - - alignmentsRead = 0; - while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) { - if ( bAlignment.Position >= 5000 ) { - cerr << "Alignment " << alignmentsRead << endl; - cerr << bAlignment.Name << endl; - cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl; - ++alignmentsRead; - } - } - } - - cerr << "Rewinding BAM file" << endl; - if ( bReader.Rewind() ) { - cerr << "Rewind seems to be ok - getting first 10 alignments" << endl; - - alignmentsRead = 0; - while ( bReader.GetNextAlignment(bAlignment) && (alignmentsRead <= 10) ) { - cerr << "Alignment " << alignmentsRead << endl; - cerr << bAlignment.Name << endl; - cerr << "Aligned to ref " << bAlignment.RefID << " at position " << bAlignment.Position << endl; - ++alignmentsRead; - } - } - - cerr << "Closing BAM file..." << endl << endl; - bReader.Close(); - - cerr << "Exiting..." << endl << endl; - return 0; -} diff --git a/BamTrimMain.cpp b/BamTrimMain.cpp new file mode 100644 index 0000000..7349115 --- /dev/null +++ b/BamTrimMain.cpp @@ -0,0 +1,91 @@ +// *************************************************************************** +// BamTrimMain.cpp (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 15 July 2009 (DB) +// --------------------------------------------------------------------------- +// Basic example of reading/writing BAM files. Pulls alignments overlapping +// the range specified by user from one BAM file and writes to a new BAM file. +// *************************************************************************** + +// Std C/C++ includes +#include +#include +#include +using namespace std; + +// BamTools includes +#include "BamReader.h" +#include "BamWriter.h" +using namespace BamTools; + +int main(int argc, char* argv[]) { + + // validate argument count + if( argc != 7 ) { + cerr << "USAGE: " << argv[0] << " " << endl; + exit(1); + } + + // store arguments + string inBamFilename = argv[1]; + string indexFilename = argv[2]; + string outBamFilename = argv[3]; + string referenceName = argv[4]; + string leftBound_str = argv[5]; + string rightBound_str = argv[6]; + + // open our BAM reader + BamReader reader; + reader.Open(inBamFilename, indexFilename); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector references = reader.GetReferenceData(); + + // open our BAM writer + BamWriter writer; + writer.Open(outBamFilename, header, references); + + // get reference ID from name + int refID = 0; + RefVector::const_iterator refIter = references.begin(); + RefVector::const_iterator refEnd = references.end(); + for ( ; refIter != refEnd; ++refIter ) { + if ( (*refIter).RefName == referenceName ) { break; } + ++refID; + } + + // validate ID + if ( refIter == refEnd ) { + cerr << "Reference: " << referenceName << " not found." << endl; + exit(1); + } + + // convert boundary arguments to numeric values + unsigned int leftBound = (unsigned int) atoi( leftBound_str.c_str() ); + unsigned int rightBound = (unsigned int) atoi( rightBound_str.c_str() ); + + // attempt jump to range of interest + if ( reader.Jump(refID, leftBound) ) { + + // while data exists and alignment begin before right bound + BamAlignment bAlignment; + while ( reader.GetNextAlignment(bAlignment) && (bAlignment.Position <= rightBound) ) { + // save alignment to archive + writer.SaveAlignment(bAlignment); + } + } + + // if jump failed + else { + cerr << "Could not jump to ref:pos " << referenceName << ":" << leftBound << endl; + exit(1); + } + + // clean up and exit + reader.Close(); + writer.Close(); + return 0; +} \ No newline at end of file diff --git a/BamWriter.cpp b/BamWriter.cpp index 4f6a37f..32e7047 100644 --- a/BamWriter.cpp +++ b/BamWriter.cpp @@ -1,8 +1,10 @@ // *************************************************************************** -// BamWriter (c) 2009 Michael Strömberg -// Marth Lab, Deptartment of Biology, Boston College +// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett +// Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- +// Last modified: 24 June 2009 (DB) +// --------------------------------------------------------------------------- // The BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. // --------------------------------------------------------------------------- @@ -10,6 +12,8 @@ // *************************************************************************** #include "BamWriter.h" +using namespace BamTools; +using namespace std; // constructor BamWriter::BamWriter(void) diff --git a/BamWriter.h b/BamWriter.h index 18f8e0a..5294dc9 100644 --- a/BamWriter.h +++ b/BamWriter.h @@ -1,151 +1,114 @@ // *************************************************************************** -// BamWriter (c) 2009 Michael Strömberg -// Marth Lab, Deptartment of Biology, Boston College +// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett +// Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- +// Last modified: 24 June 2009 (DB) +// --------------------------------------------------------------------------- // The BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** +/*! + \file BamWriter.h + \brief API for writing BAM files. +*/ + #pragma once +// C++ includes #include #include -#include -#include -#include -#include -#include "BamAlignment.h" - -using namespace std; - -// our zlib constants -#define GZIP_ID1 31 -#define GZIP_ID2 139 -#define CM_DEFLATE 8 -#define FLG_FEXTRA 4 -#define OS_UNKNOWN 255 -#define BGZF_XLEN 6 -#define BGZF_ID1 66 -#define BGZF_ID2 67 -#define BGZF_LEN 2 -#define GZIP_WINDOW_BITS -15 -#define Z_DEFAULT_MEM_LEVEL 8 - -// our BZGF constants -#define BLOCK_HEADER_LENGTH 18 -#define BLOCK_FOOTER_LENGTH 8 -#define MAX_BLOCK_SIZE 65536 -#define DEFAULT_BLOCK_SIZE 65536 - -// our BAM constants -#define BAM_CORE_SIZE 32 -#define BAM_CMATCH 0 -#define BAM_CINS 1 -#define BAM_CDEL 2 -#define BAM_CREF_SKIP 3 -#define BAM_CSOFT_CLIP 4 -#define BAM_CHARD_CLIP 5 -#define BAM_CPAD 6 -#define BAM_CIGAR_SHIFT 4 -#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) +// zlib includes +#include -// our variable sizes -#define SIZEOF_INT 4 +// BamTools includes +#include "BamAux.h" -// define our BZGF structure -#ifndef BGZF_DATA -#define BGZF_DATA -struct BgzfData { - unsigned int UncompressedBlockSize; - unsigned int CompressedBlockSize; - unsigned int BlockLength; - unsigned int BlockOffset; - uint64_t BlockAddress; - bool IsOpen; - FILE* Stream; - char* UncompressedBlock; - char* CompressedBlock; +namespace BamTools { - // constructor - BgzfData(void) - : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) - , CompressedBlockSize(MAX_BLOCK_SIZE) - , BlockLength(0) - , BlockOffset(0) - , BlockAddress(0) - , IsOpen(false) - , Stream(NULL) - , UncompressedBlock(NULL) - , CompressedBlock(NULL) - { - try { - CompressedBlock = new char[CompressedBlockSize]; - UncompressedBlock = new char[UncompressedBlockSize]; - } catch(bad_alloc&) { - printf("ERROR: Unable to allocate memory for our BGZF object.\n"); - exit(1); - } - } + //! API for writing BAM files. + class BamWriter { + + public: + + //! Constructor. + BamWriter(void); + + //! Destructor. + ~BamWriter(void); + + public: + + /*! + \brief Closes the alignment archive + \sa Open() + */ + void Close(void); + + /*! + \brief Opens the alignment archive + \param filename output BAM file + \param samHeader SAM-format header text + \param referenceSequences Reference sequence data + \sa Close() + */ + void Open(const std::string& filename, const std::string& samHeader, const RefVector& referenceSequences); + + /*! + \brief Saves an alignment to the archive + \param al The BamAlignment to be saved + */ + void SaveAlignment(const BamAlignment& al); + + // -------------------------------------------------------------------------------------- + // internal methods + private: + // closes the BAM file + void BgzfClose(void); + // compresses the current block + int BgzfDeflateBlock(void); + // flushes the data in the BGZF block + void BgzfFlushBlock(void); + // opens the BAM file for writing + void BgzfOpen(const std::string& filename); + // packs an unsigned integer into the specified buffer + static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value); + // packs an unsigned short into the specified buffer + static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value); + // writes the supplied data into the BGZF buffer + unsigned int BgzfWrite(const char* data, const unsigned int dataLen); + // calculates the minimum bin that contains a region [begin, end) + static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end); + // creates a packed cigar string from the supplied alignment + static void CreatePackedCigar(const std::vector& cigarOperations, std::string& packedCigar); + // encodes the supplied query sequence into 4-bit notation + static void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); + // our BGZF output object + BgzfData mBGZF; + }; - // destructor - ~BgzfData(void) { - if(CompressedBlock) delete [] CompressedBlock; - if(UncompressedBlock) delete [] UncompressedBlock; + //! \cond + // -------------------------------------------------------------------------------------- + // static inline methods (internal - can exclude from main documentation) + + // packs an unsigned integer into the specified buffer + inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); + buffer[2] = (char)(value >> 16); + buffer[3] = (char)(value >> 24); } -}; -#endif // BGZF_DATA -class BamWriter { -public: - // constructor - BamWriter(void); - // destructor - ~BamWriter(void); - // closes the alignment archive - void Close(void); - // opens the alignment archive - void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences); - // saves the alignment to the alignment archive - void SaveAlignment(const BamAlignment& al); -private: - // closes the BAM file - void BgzfClose(void); - // compresses the current block - int BgzfDeflateBlock(void); - // flushes the data in the BGZF block - void BgzfFlushBlock(void); - // opens the BAM file for writing - void BgzfOpen(const string& filename); - // packs an unsigned integer into the specified buffer - static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value); // packs an unsigned short into the specified buffer - static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value); - // writes the supplied data into the BGZF buffer - unsigned int BgzfWrite(const char* data, const unsigned int dataLen); - // calculates the minimum bin that contains a region [begin, end) - static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end); - // creates a packed cigar string from the supplied alignment - static void CreatePackedCigar(const vector& cigarOperations, string& packedCigar); - // encodes the supplied query sequence into 4-bit notation - static void EncodeQuerySequence(const string& query, string& encodedQuery); - // our BGZF output object - BgzfData mBGZF; -}; - -// packs an unsigned integer into the specified buffer -inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) { - buffer[0] = (char)value; - buffer[1] = (char)(value >> 8); - buffer[2] = (char)(value >> 16); - buffer[3] = (char)(value >> 24); -} - -// packs an unsigned short into the specified buffer -inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) { - buffer[0] = (char)value; - buffer[1] = (char)(value >> 8); -} + inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); + } + // -------------------------------------------------------------------------------------- + //! \endcond + +} // end BamTools namespace \ No newline at end of file diff --git a/Makefile b/Makefile index 40e3ecd..a056ae5 100644 --- a/Makefile +++ b/Makefile @@ -1,18 +1,15 @@ CXX= g++ CXXFLAGS= -Wall -O3 -PROG= BamReaderTest BamConversion +PROG= BamDump BamTrim LIBS= -lz all: $(PROG) -BamReaderTest: BamReader.o BamReaderMain.o - $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamReaderMain.o $(LIBS) +BamDump: BamReader.o BamDumpMain.o + $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamDumpMain.o $(LIBS) -BamConversion: BamReader.o BamWriter.o BamConversionMain.o - $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamConversionMain.o $(LIBS) - -BamMerge: BamReader.o BamWriter.o BamMerge.o - $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamMerge.o $(LIBS) +BamTrim: BamReader.o BamWriter.o BamTrimMain.o + $(CXX) $(CXXFLAGS) -o $@ BamReader.o BamWriter.o BamTrimMain.o $(LIBS) clean: - rm -fr gmon.out *.o *.a a.out *~ + rm -fr gmon.out *.o *.a a.out *~ diff --git a/README b/README index 9553d1a..a523b4c 100644 --- a/README +++ b/README @@ -1,28 +1,37 @@ ---------------- -README ----------------- - - -Significant re-write of BamReader API as of 5/18/2009: -------------------------------------------------------- -No longer using bgzf.h/.c -No longer using libbambc.a library -No longer using STL Utilities -Minor changes to API syntax in opening BAM files with BamReader - see BamReader.h for details - - -Compile instructions ----------------------- -Short version: -See Makefile for compilation example. - -Longer version: -Copy BamAlignment.h, BamReader.* and/or BamWriter.* to working directory. -#include "BamReader.h" and/or "BamWriter.h" -Use standard compile commands, including the "-lz" parameter to access zlib functionality. -(Note that "-L. -lbambc" are no longer necessary and WILL CAUSE PROBLEMS for compiler/linker ) - - -Feel free to email me or pop by my desk with any questions, comments, suggestions, etc. - - Derek - +------------------------------------------------------------ + README +------------------------------------------------------------ + +BamTools: a C++ API for reading/writing BAM files. + +The API consists of 2 main modules - BamReader and BamWriter. As you would expect, +BamReader provides read-access to BAM files, while BamWriter does the writing of BAM +files. BamReader provides an interface for random-access (jumping) in a BAM file. + +An additional file, BamAux.h, is included as well. +This file contains the common data structures and typedefs used throught the API. + +------------------------------------------------------------ + +To use this API, you simply need to do 3 things: + + 1 - Drop the BamTools files somewhere the compiler can find them. + (i.e. in source directory, or somewhere in include path) + + 2 - Import BamTools API with the following lines of code + + #include "BamReader.h" // as needed + #include "BamWriter.h" // as needed + using namespace BamTools; + + 3 - Compile with '-lz' ('l' as in 'lion') to access ZLIB compression library + +See any included programs and Makefile for more specific compiling/usage examples. +See documentation & comments in header files for API details. + +------------------------------------------------------------ + +Feel free to contact me with any questions, comments, suggestions, bug reports, etc. + - Derek Barnett + +http://sourceforge.net/projects/bamtools