First step in breaking up the API's monolithic classes. Should allow easier maintenance, testing, and adding features as we go forward.
namespace BamTools {
+// forward declare BamAlignment's friend classes
+namespace Internal {
+ class BamReaderPrivate;
+ class BamWriterPrivate;
+} // namespace Internal
+
// BamAlignment data structure
// explicitly labeled as 'struct' to indicate that (most of) its fields are public
struct API_EXPORT BamAlignment {
int32_t MatePosition; // Position (0-based) where alignment's mate starts
int32_t InsertSize; // Mate-pair insert size
- public:
+ // Internal data, inaccessible to client code
+ // but available BamReaderPrivate & BamWriterPrivate
+ private:
struct BamAlignmentSupportData {
// data members
, HasCoreOnly(false)
{ }
};
-
- // ** THIS IS INTERNAL DATA! DO NOT ACCESS OR EDIT FROM CLIENT CODE **
- //
- // Intended for use by BamReader & BamWriter ONLY. No, really, I mean it.
- //
- // BamTools makes some assumptions about this data being pristine, so please don't tinker with it.
- // The regular data fields above should be sufficient for client code.
- //
- // Technical/design note - Ideally, this would be a private data member with BamReader & BamWriter
- // allowed direct 'friend' access. However older compilers (especially gcc before v4.1 ) do not
- // propagate the friend access to BamReader/Writer's implementation inner classes.
- BamAlignmentSupportData SupportData;
+ BamAlignmentSupportData SupportData;
+ friend class Internal::BamReaderPrivate;
+ friend class Internal::BamWriterPrivate;
// Alignment flag query constants
// Use the get/set methods above instead
// ---------------------------------------------------------------------------
// Last modified: 19 November 2010 (DB)
// ---------------------------------------------------------------------------
-// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
-// Institute.
-// ---------------------------------------------------------------------------
// Provides the basic functionality for reading BAM files
// ***************************************************************************
#include <api/BamReader.h>
-#include <api/BGZF.h>
+#include <api/BamReader_p.h>
using namespace BamTools;
+using namespace BamTools::Internal;
#include <algorithm>
#include <iostream>
#include <vector>
using namespace std;
-struct BamReader::BamReaderPrivate {
-
- // -------------------------------
- // structs, enums, typedefs
- // -------------------------------
- enum RegionState { BEFORE_REGION = 0
- , WITHIN_REGION
- , AFTER_REGION
- };
-
- // -------------------------------
- // data members
- // -------------------------------
-
- // general file data
- BgzfData mBGZF;
- string HeaderText;
- BamIndex* Index;
- RefVector References;
- bool HasIndex;
- int64_t AlignmentsBeginOffset;
- string Filename;
- string IndexFilename;
-
- // index caching mode
- BamIndex::BamIndexCacheMode IndexCacheMode;
-
- // system data
- bool IsBigEndian;
-
- // user-specified region values
- BamRegion Region;
- bool HasAlignmentsInRegion;
-
- // parent BamReader
- BamReader* Parent;
-
- // BAM character constants
- const char* DNA_LOOKUP;
- const char* CIGAR_LOOKUP;
-
- // constructor & destructor
- BamReaderPrivate(BamReader* parent);
- ~BamReaderPrivate(void);
-
- // -------------------------------
- // "public" interface
-
- // file operations
- void Close(void);
- bool Open(const std::string& filename,
- const std::string& indexFilename,
- const bool lookForIndex,
- const bool preferStandardIndex);
- bool Rewind(void);
- bool SetRegion(const BamRegion& region);
-
- // access alignment data
- bool GetNextAlignment(BamAlignment& bAlignment);
- bool GetNextAlignmentCore(BamAlignment& bAlignment);
-
- // access auxiliary data
- int GetReferenceID(const string& refName) const;
-
- // index operations
- bool CreateIndex(bool useStandardIndex);
- void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode);
-
-
- // internal methods
- private:
-
- // ---------------------------------------
- // reading alignments and auxiliary data
-
- // adjusts requested region if necessary (depending on where data actually begins)
- void AdjustRegion(BamRegion& region);
- // fills out character data for BamAlignment data
- bool BuildCharData(BamAlignment& bAlignment);
- // checks to see if alignment overlaps current region
- RegionState IsOverlap(BamAlignment& bAlignment);
- // retrieves header text from BAM file
- void LoadHeaderData(void);
- // retrieves BAM alignment under file pointer
- bool LoadNextAlignment(BamAlignment& bAlignment);
- // builds reference data structure from BAM file
- void LoadReferenceData(void);
- // mark references with 'HasAlignments' status
- void MarkReferences(void);
-
- // ---------------------------------
- // index file handling
-
- // clear out inernal index data structure
- void ClearIndex(void);
- // loads index from BAM index file
- bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex);
-};
-
-// -----------------------------------------------------
-// BamReader implementation (wrapper around BRPrivate)
-// -----------------------------------------------------
// constructor
BamReader::BamReader(void) {
d = new BamReaderPrivate(this);
bool BamReader::IsIndexLoaded(void) const { return HasIndex(); }
bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; }
bool BamReader::Jump(int refID, int position) { return d->SetRegion( BamRegion(refID, position) ); }
-bool BamReader::Open(const std::string& filename,
- const std::string& indexFilename,
- const bool lookForIndex,
- const bool preferStandardIndex)
-{
- return d->Open(filename, indexFilename, lookForIndex, preferStandardIndex);
+bool BamReader::Open(const std::string& filename,
+ const std::string& indexFilename,
+ const bool lookForIndex,
+ const bool preferStandardIndex)
+{
+ return d->Open(filename, indexFilename, lookForIndex, preferStandardIndex);
}
bool BamReader::Rewind(void) { return d->Rewind(); }
bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); }
bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); }
// access auxiliary data
-const string BamReader::GetHeaderText(void) const { return d->HeaderText; }
+const string BamReader::GetHeaderText(void) const { return d->GetHeaderText(); }
int BamReader::GetReferenceCount(void) const { return d->References.size(); }
const RefVector& BamReader::GetReferenceData(void) const { return d->References; }
int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }
// index operations
bool BamReader::CreateIndex(bool useStandardIndex) { return d->CreateIndex(useStandardIndex); }
void BamReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { d->SetIndexCacheMode(mode); }
-
-// -----------------------------------------------------
-// BamReaderPrivate implementation
-// -----------------------------------------------------
-
-// constructor
-BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent)
- : Index(0)
- , HasIndex(false)
- , AlignmentsBeginOffset(0)
- , IndexCacheMode(BamIndex::LimitedIndexCaching)
- , HasAlignmentsInRegion(true)
- , Parent(parent)
- , DNA_LOOKUP("=ACMGRSVTWYHKDBN")
- , CIGAR_LOOKUP("MIDNSHP")
-{
- IsBigEndian = SystemIsBigEndian();
-}
-
-// destructor
-BamReader::BamReaderPrivate::~BamReaderPrivate(void) {
- Close();
-}
-
-// adjusts requested region if necessary (depending on where data actually begins)
-void BamReader::BamReaderPrivate::AdjustRegion(BamRegion& region) {
-
- // check for valid index first
- if ( Index == 0 ) return;
-
- // see if any references in region have alignments
- HasAlignmentsInRegion = false;
- int currentId = region.LeftRefID;
- while ( currentId <= region.RightRefID ) {
- HasAlignmentsInRegion = Index->HasAlignments(currentId);
- if ( HasAlignmentsInRegion ) break;
- ++currentId;
- }
-
- // if no data found on any reference in region
- if ( !HasAlignmentsInRegion ) return;
-
- // if left bound of desired region had no data, use first reference that had data
- // otherwise, leave requested region as-is
- if ( currentId != region.LeftRefID ) {
- region.LeftRefID = currentId;
- region.LeftPosition = 0;
- }
-}
-
-// fills out character data for BamAlignment data
-bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {
-
- // calculate character lengths/offsets
- const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;
- const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);
- const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;
- const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;
- const unsigned int tagDataLength = dataLength - tagDataOffset;
-
- // set up char buffers
- const char* allCharData = bAlignment.SupportData.AllCharData.data();
- const char* seqData = ((const char*)allCharData) + seqDataOffset;
- const char* qualData = ((const char*)allCharData) + qualDataOffset;
- char* tagData = ((char*)allCharData) + tagDataOffset;
-
- // store alignment name (relies on null char in name as terminator)
- bAlignment.Name.assign((const char*)(allCharData));
-
- // save query sequence
- bAlignment.QueryBases.clear();
- bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);
- for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {
- char singleBase = DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
- bAlignment.QueryBases.append(1, singleBase);
- }
-
- // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
- bAlignment.Qualities.clear();
- bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength);
- for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {
- char singleQuality = (char)(qualData[i]+33);
- bAlignment.Qualities.append(1, singleQuality);
- }
-
- // if QueryBases is empty (and this is a allowed case)
- if ( bAlignment.QueryBases.empty() )
- bAlignment.AlignedBases = bAlignment.QueryBases;
-
- // if QueryBases contains data, then build AlignedBases using CIGAR data
- else {
-
- // resize AlignedBases
- bAlignment.AlignedBases.clear();
- bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength);
-
- // iterate over CigarOps
- int k = 0;
- vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();
- vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end();
- for ( ; cigarIter != cigarEnd; ++cigarIter ) {
-
- const CigarOp& op = (*cigarIter);
- switch(op.Type) {
-
- case ('M') :
- case ('I') :
- bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases
- // fall through
-
- case ('S') :
- k += op.Length; // for 'S' - soft clip, skip over query bases
- break;
-
- case ('D') :
- bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character
- break;
-
- case ('P') :
- bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character
- break;
-
- case ('N') :
- bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence
- break;
-
- case ('H') :
- break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op
-
- default:
- fprintf(stderr, "ERROR: Invalid Cigar op type\n"); // shouldn't get here
- exit(1);
- }
- }
- }
-
- // -----------------------
- // Added: 3-25-2010 DB
- // Fixed: endian-correctness for tag data
- // -----------------------
- if ( IsBigEndian ) {
- int i = 0;
- while ( (unsigned int)i < tagDataLength ) {
-
- i += 2; // skip tag type (e.g. "RG", "NM", etc)
- uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
- ++i; // skip value type
-
- switch (type) {
-
- case('A') :
- case('C') :
- ++i;
- break;
-
- case('S') :
- SwapEndian_16p(&tagData[i]);
- i += sizeof(uint16_t);
- break;
-
- case('F') :
- case('I') :
- SwapEndian_32p(&tagData[i]);
- i += sizeof(uint32_t);
- break;
-
- case('D') :
- SwapEndian_64p(&tagData[i]);
- i += sizeof(uint64_t);
- break;
-
- case('H') :
- case('Z') :
- while (tagData[i]) { ++i; }
- ++i; // increment one more for null terminator
- break;
-
- default :
- fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
- exit(1);
- }
- }
- }
-
- // store TagData
- bAlignment.TagData.clear();
- bAlignment.TagData.resize(tagDataLength);
- memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);
-
- // clear the core-only flag
- bAlignment.SupportData.HasCoreOnly = false;
-
- // return success
- return true;
-}
-
-// clear index data structure
-void BamReader::BamReaderPrivate::ClearIndex(void) {
- delete Index;
- Index = 0;
- HasIndex = false;
-}
-
-// closes the BAM file
-void BamReader::BamReaderPrivate::Close(void) {
-
- // close BGZF file stream
- mBGZF.Close();
-
- // clear out index data
- ClearIndex();
-
- // clear out header data
- HeaderText.clear();
-
- // clear out region flags
- Region.clear();
-}
-
-// creates index for BAM file, saves to file
-// default behavior is to create the BAM standard index (".bai")
-// set flag to false to create the BamTools-specific index (".bti")
-bool BamReader::BamReaderPrivate::CreateIndex(bool useStandardIndex) {
-
- // clear out prior index data
- ClearIndex();
-
- // create index based on type requested
- if ( useStandardIndex )
- Index = new BamStandardIndex(&mBGZF, Parent);
- else
- Index = new BamToolsIndex(&mBGZF, Parent);
-
- // set index cache mode to full for writing
- Index->SetCacheMode(BamIndex::FullIndexCaching);
-
- // build new index
- bool ok = true;
- ok &= Index->Build();
- HasIndex = ok;
-
- // mark empty references
- MarkReferences();
-
- // attempt to save index data to file
- ok &= Index->Write(Filename);
-
- // set client's desired index cache mode
- Index->SetCacheMode(IndexCacheMode);
-
- // return success/fail of both building & writing index
- return ok;
-}
-
-// get next alignment (from specified region, if given)
-bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {
-
- // if valid alignment found, attempt to parse char data, and return success/failure
- if ( GetNextAlignmentCore(bAlignment) )
- return BuildCharData(bAlignment);
-
- // no valid alignment found
- else return false;
-}
-
-// retrieves next available alignment core data (returns success/fail)
-// ** DOES NOT parse any character data (read name, bases, qualities, tag data)
-// these can be accessed, if necessary, from the supportData
-// useful for operations requiring ONLY positional or other alignment-related information
-bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {
-
- // if region is set but has no alignments
- if ( !Region.isNull() && !HasAlignmentsInRegion )
- return false;
-
- // if valid alignment available
- if ( LoadNextAlignment(bAlignment) ) {
-
- // set core-only flag
- bAlignment.SupportData.HasCoreOnly = true;
-
- // if region not specified with at least a left boundary, return success
- if ( !Region.isLeftBoundSpecified() ) return true;
-
- // determine region state (before, within, after)
- BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment);
-
- // if alignment lies after region, return false
- if ( state == AFTER_REGION ) return false;
-
- while ( state != WITHIN_REGION ) {
- // if no valid alignment available (likely EOF) return failure
- if ( !LoadNextAlignment(bAlignment) ) return false;
- // if alignment lies after region, return false (no available read within region)
- state = IsOverlap(bAlignment);
- if ( state == AFTER_REGION ) return false;
- }
-
- // return success (alignment found that overlaps region)
- return true;
- }
-
- // no valid alignment
- else return false;
-}
-
-// returns RefID for given RefName (returns References.size() if not found)
-int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {
-
- // retrieve names from reference data
- vector<string> refNames;
- RefVector::const_iterator refIter = References.begin();
- RefVector::const_iterator refEnd = References.end();
- for ( ; refIter != refEnd; ++refIter)
- refNames.push_back( (*refIter).RefName );
-
- // return 'index-of' refName ( if not found, returns refNames.size() )
- return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));
-}
-
-// returns region state - whether alignment ends before, overlaps, or starts after currently specified region
-// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true
-BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {
-
- // if alignment is on any reference sequence before left bound
- if ( bAlignment.RefID < Region.LeftRefID ) return BEFORE_REGION;
-
- // if alignment starts on left bound reference
- else if ( bAlignment.RefID == Region.LeftRefID ) {
-
- // if alignment starts at or after left boundary
- if ( bAlignment.Position >= Region.LeftPosition) {
-
- // if right boundary is specified AND
- // left/right boundaries are on same reference AND
- // alignment starts past right boundary
- if ( Region.isRightBoundSpecified() &&
- Region.LeftRefID == Region.RightRefID &&
- bAlignment.Position > Region.RightPosition )
- return AFTER_REGION;
-
- // otherwise, alignment is within region
- return WITHIN_REGION;
- }
-
- // alignment starts before left boundary
- else {
- // check if alignment overlaps left boundary
- if ( bAlignment.GetEndPosition() >= Region.LeftPosition ) return WITHIN_REGION;
- else return BEFORE_REGION;
- }
- }
-
- // alignment starts on a reference after the left bound
- else {
-
- // if region has a right boundary
- if ( Region.isRightBoundSpecified() ) {
-
- // alignment is on reference between boundaries
- if ( bAlignment.RefID < Region.RightRefID ) return WITHIN_REGION;
-
- // alignment is on reference after right boundary
- else if ( bAlignment.RefID > Region.RightRefID ) return AFTER_REGION;
-
- // alignment is on right bound reference
- else {
- // check if alignment starts before or at right boundary
- if ( bAlignment.Position <= Region.RightPosition ) return WITHIN_REGION;
- else return AFTER_REGION;
- }
- }
-
- // otherwise, alignment is after left bound reference, but there is no right boundary
- else return WITHIN_REGION;
- }
-}
-
-// load BAM header data
-void BamReader::BamReaderPrivate::LoadHeaderData(void) {
-
- // check to see if proper BAM header
- char buffer[4];
- if (mBGZF.Read(buffer, 4) != 4) {
- fprintf(stderr, "Could not read header type\n");
- exit(1);
- }
-
- if (strncmp(buffer, "BAM\001", 4)) {
- fprintf(stderr, "wrong header type!\n");
- exit(1);
- }
-
- // get BAM header text length
- mBGZF.Read(buffer, 4);
- unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);
- if ( IsBigEndian ) SwapEndian_32(headerTextLength);
-
- // get BAM header text
- char* headerText = (char*)calloc(headerTextLength + 1, 1);
- mBGZF.Read(headerText, headerTextLength);
- HeaderText = (string)((const char*)headerText);
-
- // clean up calloc-ed temp variable
- free(headerText);
-}
-
-// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail
-bool BamReader::BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) {
-
- // clear out any existing index data
- ClearIndex();
-
- // if no index filename provided, so we need to look for available index files
- if ( IndexFilename.empty() ) {
-
- // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag
- const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS);
- Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, type);
-
- // if null, return failure
- if ( Index == 0 ) return false;
-
- // generate proper IndexFilename based on type of index created
- IndexFilename = Filename + Index->Extension();
- }
-
- else {
-
- // attempt to load BamIndex based on IndexFilename provided by client
- Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent);
-
- // if null, return failure
- if ( Index == 0 ) return false;
- }
-
- // set cache mode for BamIndex
- Index->SetCacheMode(IndexCacheMode);
-
- // loading the index data from file
- HasIndex = Index->Load(IndexFilename);
-
- // mark empty references
- MarkReferences();
-
- // return index status
- return HasIndex;
-}
-
-// populates BamAlignment with alignment data under file pointer, returns success/fail
-bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {
-
- // read in the 'block length' value, make sure it's not zero
- char buffer[4];
- mBGZF.Read(buffer, 4);
- bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);
- if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }
- if ( bAlignment.SupportData.BlockLength == 0 ) return false;
-
- // read in core alignment data, make sure the right size of data was read
- char x[BAM_CORE_SIZE];
- if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false;
-
- if ( IsBigEndian ) {
- for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) )
- SwapEndian_32p(&x[i]);
- }
-
- // set BamAlignment 'core' and 'support' data
- bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]);
- bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);
-
- unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);
- bAlignment.Bin = tempValue >> 16;
- bAlignment.MapQuality = tempValue >> 8 & 0xff;
- bAlignment.SupportData.QueryNameLength = tempValue & 0xff;
-
- tempValue = BgzfData::UnpackUnsignedInt(&x[12]);
- bAlignment.AlignmentFlag = tempValue >> 16;
- bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;
-
- bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);
- bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);
- bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);
- bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);
-
- // set BamAlignment length
- bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;
-
- // read in character data - make sure proper data size was read
- bool readCharDataOK = false;
- const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;
- char* allCharData = (char*)calloc(sizeof(char), dataLength);
-
- if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) {
-
- // store 'allCharData' in supportData structure
- bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);
-
- // set success flag
- readCharDataOK = true;
-
- // save CIGAR ops
- // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
- // even when BamReader::GetNextAlignmentCore() is called
- const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;
- uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
- CigarOp op;
- bAlignment.CigarData.clear();
- bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);
- for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {
-
- // swap if necessary
- if ( IsBigEndian ) SwapEndian_32(cigarData[i]);
-
- // build CigarOp structure
- op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
- op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
-
- // save CigarOp
- bAlignment.CigarData.push_back(op);
- }
- }
-
- free(allCharData);
- return readCharDataOK;
-}
-
-// loads reference data from BAM file
-void BamReader::BamReaderPrivate::LoadReferenceData(void) {
-
- // get number of reference sequences
- char buffer[4];
- mBGZF.Read(buffer, 4);
- unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);
- if ( IsBigEndian ) SwapEndian_32(numberRefSeqs);
- if ( numberRefSeqs == 0 ) return;
- References.reserve((int)numberRefSeqs);
-
- // iterate over all references in header
- for (unsigned int i = 0; i != numberRefSeqs; ++i) {
-
- // get length of reference name
- mBGZF.Read(buffer, 4);
- unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);
- if ( IsBigEndian ) SwapEndian_32(refNameLength);
- char* refName = (char*)calloc(refNameLength, 1);
-
- // get reference name and reference sequence length
- mBGZF.Read(refName, refNameLength);
- mBGZF.Read(buffer, 4);
- int refLength = BgzfData::UnpackSignedInt(buffer);
- if ( IsBigEndian ) SwapEndian_32(refLength);
-
- // store data for reference
- RefData aReference;
- aReference.RefName = (string)((const char*)refName);
- aReference.RefLength = refLength;
- References.push_back(aReference);
-
- // clean up calloc-ed temp variable
- free(refName);
- }
-}
-
-// mark references with no alignment data
-void BamReader::BamReaderPrivate::MarkReferences(void) {
-
- // ensure index is available
- if ( !HasIndex ) return;
-
- // mark empty references
- for ( int i = 0; i < (int)References.size(); ++i )
- References.at(i).RefHasAlignments = Index->HasAlignments(i);
-}
-
-// opens BAM file (and index)
-bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) {
-
- // store filenames
- Filename = filename;
- IndexFilename = indexFilename;
-
- // open the BGZF file for reading, return false on failure
- if ( !mBGZF.Open(filename, "rb") ) return false;
-
- // retrieve header text & reference data
- LoadHeaderData();
- LoadReferenceData();
-
- // store file offset of first alignment
- AlignmentsBeginOffset = mBGZF.Tell();
-
- // if no index filename provided
- if ( IndexFilename.empty() ) {
-
- // client did not specify that index SHOULD be found
- // useful for cases where sequential access is all that is required
- if ( !lookForIndex ) return true;
-
- // otherwise, look for index file, return success/fail
- return LoadIndex(lookForIndex, preferStandardIndex) ;
- }
-
- // client supplied an index filename
- // attempt to load index data, return success/fail
- return LoadIndex(lookForIndex, preferStandardIndex);
-}
-
-// returns BAM file pointer to beginning of alignment data
-bool BamReader::BamReaderPrivate::Rewind(void) {
-
- // rewind to first alignment, return false if unable to seek
- if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false;
-
- // retrieve first alignment data, return false if unable to read
- BamAlignment al;
- if ( !LoadNextAlignment(al) ) return false;
-
- // reset default region info using first alignment in file
- Region.clear();
- HasAlignmentsInRegion = true;
-
- // rewind back to beginning of first alignment
- // return success/fail of seek
- return mBGZF.Seek(AlignmentsBeginOffset);
-}
-
-// change the index caching behavior
-void BamReader::BamReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) {
- IndexCacheMode = mode;
- if ( Index == 0 ) return;
- Index->SetCacheMode(mode);
-}
-
-// asks Index to attempt a Jump() to specified region
-// returns success/failure
-bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) {
-
- // clear out any prior BamReader region data
- //
- // N.B. - this is cleared so that BamIndex now has free reign to call
- // GetNextAlignmentCore() and do overlap checking without worrying about BamReader
- // performing any overlap checking of its own and moving on to the next read... Calls
- // to GetNextAlignmentCore() with no Region set, simply return the next alignment.
- // This ensures that the Index is able to do just that. (All without exposing
- // LoadNextAlignment() to the public API, and potentially confusing clients with the nomenclature)
- Region.clear();
-
- // check for existing index
- if ( !HasIndex ) return false;
-
- // adjust region if necessary to reflect where data actually begins
- BamRegion adjustedRegion(region);
- AdjustRegion(adjustedRegion);
-
- // if no data present, return true
- // not an error, but BamReader knows that no data is there for future alignment access
- // (this is useful in a MultiBamReader setting where some BAM files may lack data in regions
- // that other BAMs have data)
- if ( !HasAlignmentsInRegion ) {
- Region = adjustedRegion;
- return true;
- }
-
- // attempt jump to user-specified region return false if jump could not be performed at all
- // (invalid index, unknown reference, etc)
- //
- // Index::Jump() is allowed to modify the HasAlignmentsInRegion flag
- // * This covers case where a region is requested that lies beyond the last alignment on a reference
- // If this occurs, any subsequent calls to GetNexAlignment[Core] simply return false
- // BamMultiReader is then able to successfully pull alignments from a region from multiple files
- // even if one or more have no data.
- if ( !Index->Jump(adjustedRegion, &HasAlignmentsInRegion) ) return false;
-
- // save region and return success
- Region = adjustedRegion;
- return true;
-}
// ---------------------------------------------------------------------------\r
// Last modified: 19 November 2010 (DB)\r
// ---------------------------------------------------------------------------\r
-// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
-// Institute.\r
-// ---------------------------------------------------------------------------\r
// Provides the basic functionality for reading BAM files\r
// ***************************************************************************\r
\r
\r
namespace BamTools {\r
\r
+namespace Internal {\r
+ class BamReaderPrivate;\r
+} // namespace Internal\r
+\r
class API_EXPORT BamReader {\r
\r
// constructor / destructor\r
\r
// private implementation\r
private:\r
- struct BamReaderPrivate;\r
- BamReaderPrivate* d;\r
+ Internal::BamReaderPrivate* d;\r
};\r
\r
} // namespace BamTools\r
--- /dev/null
+// ***************************************************************************
+// BamReader_p.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#include <api/BamReader.h>
+#include <api/BamReader_p.h>
+#include <api/BGZF.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <vector>
+using namespace std;
+
+// constructor
+BamReaderPrivate::BamReaderPrivate(BamReader* parent)
+ : HeaderText("")
+ , Index(0)
+ , HasIndex(false)
+ , AlignmentsBeginOffset(0)
+// , m_header(0)
+ , IndexCacheMode(BamIndex::LimitedIndexCaching)
+ , HasAlignmentsInRegion(true)
+ , Parent(parent)
+ , DNA_LOOKUP("=ACMGRSVTWYHKDBN")
+ , CIGAR_LOOKUP("MIDNSHP")
+{
+ IsBigEndian = SystemIsBigEndian();
+}
+
+// destructor
+BamReaderPrivate::~BamReaderPrivate(void) {
+ Close();
+}
+
+// adjusts requested region if necessary (depending on where data actually begins)
+void BamReaderPrivate::AdjustRegion(BamRegion& region) {
+
+ // check for valid index first
+ if ( Index == 0 ) return;
+
+ // see if any references in region have alignments
+ HasAlignmentsInRegion = false;
+ int currentId = region.LeftRefID;
+ while ( currentId <= region.RightRefID ) {
+ HasAlignmentsInRegion = Index->HasAlignments(currentId);
+ if ( HasAlignmentsInRegion ) break;
+ ++currentId;
+ }
+
+ // if no data found on any reference in region
+ if ( !HasAlignmentsInRegion ) return;
+
+ // if left bound of desired region had no data, use first reference that had data
+ // otherwise, leave requested region as-is
+ if ( currentId != region.LeftRefID ) {
+ region.LeftRefID = currentId;
+ region.LeftPosition = 0;
+ }
+}
+
+// fills out character data for BamAlignment data
+bool BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {
+
+ // calculate character lengths/offsets
+ const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;
+ const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);
+ const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;
+ const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;
+ const unsigned int tagDataLength = dataLength - tagDataOffset;
+
+ // set up char buffers
+ const char* allCharData = bAlignment.SupportData.AllCharData.data();
+ const char* seqData = ((const char*)allCharData) + seqDataOffset;
+ const char* qualData = ((const char*)allCharData) + qualDataOffset;
+ char* tagData = ((char*)allCharData) + tagDataOffset;
+
+ // store alignment name (relies on null char in name as terminator)
+ bAlignment.Name.assign((const char*)(allCharData));
+
+ // save query sequence
+ bAlignment.QueryBases.clear();
+ bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);
+ for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {
+ char singleBase = DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
+ bAlignment.QueryBases.append(1, singleBase);
+ }
+
+ // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
+ bAlignment.Qualities.clear();
+ bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength);
+ for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {
+ char singleQuality = (char)(qualData[i]+33);
+ bAlignment.Qualities.append(1, singleQuality);
+ }
+
+ // if QueryBases is empty (and this is a allowed case)
+ if ( bAlignment.QueryBases.empty() )
+ bAlignment.AlignedBases = bAlignment.QueryBases;
+
+ // if QueryBases contains data, then build AlignedBases using CIGAR data
+ else {
+
+ // resize AlignedBases
+ bAlignment.AlignedBases.clear();
+ bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength);
+
+ // iterate over CigarOps
+ int k = 0;
+ vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();
+ vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end();
+ for ( ; cigarIter != cigarEnd; ++cigarIter ) {
+
+ const CigarOp& op = (*cigarIter);
+ switch(op.Type) {
+
+ case ('M') :
+ case ('I') :
+ bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases
+ // fall through
+
+ case ('S') :
+ k += op.Length; // for 'S' - soft clip, skip over query bases
+ break;
+
+ case ('D') :
+ bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character
+ break;
+
+ case ('P') :
+ bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character
+ break;
+
+ case ('N') :
+ bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence
+ break;
+
+ case ('H') :
+ break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op
+
+ default:
+ fprintf(stderr, "ERROR: Invalid Cigar op type\n"); // shouldn't get here
+ exit(1);
+ }
+ }
+ }
+
+ // -----------------------
+ // Added: 3-25-2010 DB
+ // Fixed: endian-correctness for tag data
+ // -----------------------
+ if ( IsBigEndian ) {
+ int i = 0;
+ while ( (unsigned int)i < tagDataLength ) {
+
+ i += 2; // skip tag type (e.g. "RG", "NM", etc)
+ uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
+ ++i; // skip value type
+
+ switch (type) {
+
+ case('A') :
+ case('C') :
+ ++i;
+ break;
+
+ case('S') :
+ SwapEndian_16p(&tagData[i]);
+ i += sizeof(uint16_t);
+ break;
+
+ case('F') :
+ case('I') :
+ SwapEndian_32p(&tagData[i]);
+ i += sizeof(uint32_t);
+ break;
+
+ case('D') :
+ SwapEndian_64p(&tagData[i]);
+ i += sizeof(uint64_t);
+ break;
+
+ case('H') :
+ case('Z') :
+ while (tagData[i]) { ++i; }
+ ++i; // increment one more for null terminator
+ break;
+
+ default :
+ fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
+ exit(1);
+ }
+ }
+ }
+
+ // store TagData
+ bAlignment.TagData.clear();
+ bAlignment.TagData.resize(tagDataLength);
+ memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);
+
+ // clear the core-only flag
+ bAlignment.SupportData.HasCoreOnly = false;
+
+ // return success
+ return true;
+}
+
+// clear index data structure
+void BamReaderPrivate::ClearIndex(void) {
+ delete Index;
+ Index = 0;
+ HasIndex = false;
+}
+
+// closes the BAM file
+void BamReaderPrivate::Close(void) {
+
+ // close BGZF file stream
+ mBGZF.Close();
+
+ // clear out index data
+ ClearIndex();
+
+ // clear out header data
+ HeaderText.clear();
+// if ( m_header ) {
+// delete m_header;
+// m_header = 0;
+// }
+
+ // clear out region flags
+ Region.clear();
+}
+
+// creates index for BAM file, saves to file
+// default behavior is to create the BAM standard index (".bai")
+// set flag to false to create the BamTools-specific index (".bti")
+bool BamReaderPrivate::CreateIndex(bool useStandardIndex) {
+
+ // clear out prior index data
+ ClearIndex();
+
+ // create index based on type requested
+ if ( useStandardIndex )
+ Index = new BamStandardIndex(&mBGZF, Parent);
+ else
+ Index = new BamToolsIndex(&mBGZF, Parent);
+
+ // set index cache mode to full for writing
+ Index->SetCacheMode(BamIndex::FullIndexCaching);
+
+ // build new index
+ bool ok = true;
+ ok &= Index->Build();
+ HasIndex = ok;
+
+ // mark empty references
+ MarkReferences();
+
+ // attempt to save index data to file
+ ok &= Index->Write(Filename);
+
+ // set client's desired index cache mode
+ Index->SetCacheMode(IndexCacheMode);
+
+ // return success/fail of both building & writing index
+ return ok;
+}
+
+const string BamReaderPrivate::GetHeaderText(void) const {
+
+ return HeaderText;
+
+// if ( m_header )
+// return m_header->Text();
+// else
+// return string("");
+}
+
+// get next alignment (from specified region, if given)
+bool BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {
+
+ // if valid alignment found, attempt to parse char data, and return success/failure
+ if ( GetNextAlignmentCore(bAlignment) )
+ return BuildCharData(bAlignment);
+
+ // no valid alignment found
+ else return false;
+}
+
+// retrieves next available alignment core data (returns success/fail)
+// ** DOES NOT parse any character data (read name, bases, qualities, tag data)
+// these can be accessed, if necessary, from the supportData
+// useful for operations requiring ONLY positional or other alignment-related information
+bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {
+
+ // if region is set but has no alignments
+ if ( !Region.isNull() && !HasAlignmentsInRegion )
+ return false;
+
+ // if valid alignment available
+ if ( LoadNextAlignment(bAlignment) ) {
+
+ // set core-only flag
+ bAlignment.SupportData.HasCoreOnly = true;
+
+ // if region not specified with at least a left boundary, return success
+ if ( !Region.isLeftBoundSpecified() ) return true;
+
+ // determine region state (before, within, after)
+ BamReaderPrivate::RegionState state = IsOverlap(bAlignment);
+
+ // if alignment lies after region, return false
+ if ( state == AFTER_REGION ) return false;
+
+ while ( state != WITHIN_REGION ) {
+ // if no valid alignment available (likely EOF) return failure
+ if ( !LoadNextAlignment(bAlignment) ) return false;
+ // if alignment lies after region, return false (no available read within region)
+ state = IsOverlap(bAlignment);
+ if ( state == AFTER_REGION ) return false;
+ }
+
+ // return success (alignment found that overlaps region)
+ return true;
+ }
+
+ // no valid alignment
+ else return false;
+}
+
+// returns RefID for given RefName (returns References.size() if not found)
+int BamReaderPrivate::GetReferenceID(const string& refName) const {
+
+ // retrieve names from reference data
+ vector<string> refNames;
+ RefVector::const_iterator refIter = References.begin();
+ RefVector::const_iterator refEnd = References.end();
+ for ( ; refIter != refEnd; ++refIter)
+ refNames.push_back( (*refIter).RefName );
+
+ // return 'index-of' refName ( if not found, returns refNames.size() )
+ return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));
+}
+
+// returns region state - whether alignment ends before, overlaps, or starts after currently specified region
+// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true
+BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {
+
+ // if alignment is on any reference sequence before left bound
+ if ( bAlignment.RefID < Region.LeftRefID ) return BEFORE_REGION;
+
+ // if alignment starts on left bound reference
+ else if ( bAlignment.RefID == Region.LeftRefID ) {
+
+ // if alignment starts at or after left boundary
+ if ( bAlignment.Position >= Region.LeftPosition) {
+
+ // if right boundary is specified AND
+ // left/right boundaries are on same reference AND
+ // alignment starts past right boundary
+ if ( Region.isRightBoundSpecified() &&
+ Region.LeftRefID == Region.RightRefID &&
+ bAlignment.Position > Region.RightPosition )
+ return AFTER_REGION;
+
+ // otherwise, alignment is within region
+ return WITHIN_REGION;
+ }
+
+ // alignment starts before left boundary
+ else {
+ // check if alignment overlaps left boundary
+ if ( bAlignment.GetEndPosition() >= Region.LeftPosition ) return WITHIN_REGION;
+ else return BEFORE_REGION;
+ }
+ }
+
+ // alignment starts on a reference after the left bound
+ else {
+
+ // if region has a right boundary
+ if ( Region.isRightBoundSpecified() ) {
+
+ // alignment is on reference between boundaries
+ if ( bAlignment.RefID < Region.RightRefID ) return WITHIN_REGION;
+
+ // alignment is on reference after right boundary
+ else if ( bAlignment.RefID > Region.RightRefID ) return AFTER_REGION;
+
+ // alignment is on right bound reference
+ else {
+ // check if alignment starts before or at right boundary
+ if ( bAlignment.Position <= Region.RightPosition ) return WITHIN_REGION;
+ else return AFTER_REGION;
+ }
+ }
+
+ // otherwise, alignment is after left bound reference, but there is no right boundary
+ else return WITHIN_REGION;
+ }
+}
+
+// load BAM header data
+void BamReaderPrivate::LoadHeaderData(void) {
+
+// m_header = new BamHeader(&mBGZF);
+// bool headerLoadedOk = m_header->Load();
+// if ( !headerLoadedOk )
+// cerr << "BamReader could not load header" << endl;
+
+ // check to see if proper BAM header
+ char buffer[4];
+ if (mBGZF.Read(buffer, 4) != 4) {
+ fprintf(stderr, "Could not read header type\n");
+ exit(1);
+ }
+
+ if (strncmp(buffer, "BAM\001", 4)) {
+ fprintf(stderr, "wrong header type!\n");
+ exit(1);
+ }
+
+ // get BAM header text length
+ mBGZF.Read(buffer, 4);
+ unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);
+ if ( IsBigEndian ) SwapEndian_32(headerTextLength);
+
+ // get BAM header text
+ char* headerText = (char*)calloc(headerTextLength + 1, 1);
+ mBGZF.Read(headerText, headerTextLength);
+ HeaderText = (string)((const char*)headerText);
+
+ // clean up calloc-ed temp variable
+ free(headerText);
+}
+
+// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail
+bool BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) {
+
+ // clear out any existing index data
+ ClearIndex();
+
+ // if no index filename provided, so we need to look for available index files
+ if ( IndexFilename.empty() ) {
+
+ // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag
+ const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS);
+ Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, type);
+
+ // if null, return failure
+ if ( Index == 0 ) return false;
+
+ // generate proper IndexFilename based on type of index created
+ IndexFilename = Filename + Index->Extension();
+ }
+
+ else {
+
+ // attempt to load BamIndex based on IndexFilename provided by client
+ Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent);
+
+ // if null, return failure
+ if ( Index == 0 ) return false;
+ }
+
+ // set cache mode for BamIndex
+ Index->SetCacheMode(IndexCacheMode);
+
+ // loading the index data from file
+ HasIndex = Index->Load(IndexFilename);
+
+ // mark empty references
+ MarkReferences();
+
+ // return index status
+ return HasIndex;
+}
+
+// populates BamAlignment with alignment data under file pointer, returns success/fail
+bool BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {
+
+ // read in the 'block length' value, make sure it's not zero
+ char buffer[4];
+ mBGZF.Read(buffer, 4);
+ bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);
+ if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }
+ if ( bAlignment.SupportData.BlockLength == 0 ) return false;
+
+ // read in core alignment data, make sure the right size of data was read
+ char x[BAM_CORE_SIZE];
+ if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false;
+
+ if ( IsBigEndian ) {
+ for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) )
+ SwapEndian_32p(&x[i]);
+ }
+
+ // set BamAlignment 'core' and 'support' data
+ bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]);
+ bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);
+
+ unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);
+ bAlignment.Bin = tempValue >> 16;
+ bAlignment.MapQuality = tempValue >> 8 & 0xff;
+ bAlignment.SupportData.QueryNameLength = tempValue & 0xff;
+
+ tempValue = BgzfData::UnpackUnsignedInt(&x[12]);
+ bAlignment.AlignmentFlag = tempValue >> 16;
+ bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;
+
+ bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);
+ bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);
+ bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);
+ bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);
+
+ // set BamAlignment length
+ bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;
+
+ // read in character data - make sure proper data size was read
+ bool readCharDataOK = false;
+ const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;
+ char* allCharData = (char*)calloc(sizeof(char), dataLength);
+
+ if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) {
+
+ // store 'allCharData' in supportData structure
+ bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);
+
+ // set success flag
+ readCharDataOK = true;
+
+ // save CIGAR ops
+ // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
+ // even when GetNextAlignmentCore() is called
+ const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;
+ uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
+ CigarOp op;
+ bAlignment.CigarData.clear();
+ bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);
+ for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {
+
+ // swap if necessary
+ if ( IsBigEndian ) SwapEndian_32(cigarData[i]);
+
+ // build CigarOp structure
+ op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
+ op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
+
+ // save CigarOp
+ bAlignment.CigarData.push_back(op);
+ }
+ }
+
+ free(allCharData);
+ return readCharDataOK;
+}
+
+// loads reference data from BAM file
+void BamReaderPrivate::LoadReferenceData(void) {
+
+ // get number of reference sequences
+ char buffer[4];
+ mBGZF.Read(buffer, 4);
+ unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);
+ if ( IsBigEndian ) SwapEndian_32(numberRefSeqs);
+ if ( numberRefSeqs == 0 ) return;
+ References.reserve((int)numberRefSeqs);
+
+ // iterate over all references in header
+ for (unsigned int i = 0; i != numberRefSeqs; ++i) {
+
+ // get length of reference name
+ mBGZF.Read(buffer, 4);
+ unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);
+ if ( IsBigEndian ) SwapEndian_32(refNameLength);
+ char* refName = (char*)calloc(refNameLength, 1);
+
+ // get reference name and reference sequence length
+ mBGZF.Read(refName, refNameLength);
+ mBGZF.Read(buffer, 4);
+ int refLength = BgzfData::UnpackSignedInt(buffer);
+ if ( IsBigEndian ) SwapEndian_32(refLength);
+
+ // store data for reference
+ RefData aReference;
+ aReference.RefName = (string)((const char*)refName);
+ aReference.RefLength = refLength;
+ References.push_back(aReference);
+
+ // clean up calloc-ed temp variable
+ free(refName);
+ }
+}
+
+// mark references with no alignment data
+void BamReaderPrivate::MarkReferences(void) {
+
+ // ensure index is available
+ if ( !HasIndex ) return;
+
+ // mark empty references
+ for ( int i = 0; i < (int)References.size(); ++i )
+ References.at(i).RefHasAlignments = Index->HasAlignments(i);
+}
+
+// opens BAM file (and index)
+bool BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) {
+
+ // store filenames
+ Filename = filename;
+ IndexFilename = indexFilename;
+
+ // open the BGZF file for reading, return false on failure
+ if ( !mBGZF.Open(filename, "rb") ) return false;
+
+ // retrieve header text & reference data
+ LoadHeaderData();
+ LoadReferenceData();
+
+ // store file offset of first alignment
+ AlignmentsBeginOffset = mBGZF.Tell();
+
+ // if no index filename provided
+ if ( IndexFilename.empty() ) {
+
+ // client did not specify that index SHOULD be found
+ // useful for cases where sequential access is all that is required
+ if ( !lookForIndex ) return true;
+
+ // otherwise, look for index file, return success/fail
+ return LoadIndex(lookForIndex, preferStandardIndex) ;
+ }
+
+ // client supplied an index filename
+ // attempt to load index data, return success/fail
+ return LoadIndex(lookForIndex, preferStandardIndex);
+}
+
+// returns BAM file pointer to beginning of alignment data
+bool BamReaderPrivate::Rewind(void) {
+
+ // rewind to first alignment, return false if unable to seek
+ if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false;
+
+ // retrieve first alignment data, return false if unable to read
+ BamAlignment al;
+ if ( !LoadNextAlignment(al) ) return false;
+
+ // reset default region info using first alignment in file
+ Region.clear();
+ HasAlignmentsInRegion = true;
+
+ // rewind back to beginning of first alignment
+ // return success/fail of seek
+ return mBGZF.Seek(AlignmentsBeginOffset);
+}
+
+// change the index caching behavior
+void BamReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) {
+ IndexCacheMode = mode;
+ if ( Index == 0 ) return;
+ Index->SetCacheMode(mode);
+}
+
+// asks Index to attempt a Jump() to specified region
+// returns success/failure
+bool BamReaderPrivate::SetRegion(const BamRegion& region) {
+
+ // clear out any prior BamReader region data
+ //
+ // N.B. - this is cleared so that BamIndex now has free reign to call
+ // GetNextAlignmentCore() and do overlap checking without worrying about BamReader
+ // performing any overlap checking of its own and moving on to the next read... Calls
+ // to GetNextAlignmentCore() with no Region set, simply return the next alignment.
+ // This ensures that the Index is able to do just that. (All without exposing
+ // LoadNextAlignment() to the public API, and potentially confusing clients with the nomenclature)
+ Region.clear();
+
+ // check for existing index
+ if ( !HasIndex ) return false;
+
+ // adjust region if necessary to reflect where data actually begins
+ BamRegion adjustedRegion(region);
+ AdjustRegion(adjustedRegion);
+
+ // if no data present, return true
+ // not an error, but BamReader knows that no data is there for future alignment access
+ // (this is useful in a MultiBamReader setting where some BAM files may lack data in regions
+ // that other BAMs have data)
+ if ( !HasAlignmentsInRegion ) {
+ Region = adjustedRegion;
+ return true;
+ }
+
+ // attempt jump to user-specified region return false if jump could not be performed at all
+ // (invalid index, unknown reference, etc)
+ //
+ // Index::Jump() is allowed to modify the HasAlignmentsInRegion flag
+ // * This covers case where a region is requested that lies beyond the last alignment on a reference
+ // If this occurs, any subsequent calls to GetNexAlignment[Core] simply return false
+ // BamMultiReader is then able to successfully pull alignments from a region from multiple files
+ // even if one or more have no data.
+ if ( !Index->Jump(adjustedRegion, &HasAlignmentsInRegion) ) return false;
+
+ // save region and return success
+ Region = adjustedRegion;
+ return true;
+}
--- /dev/null
+// ***************************************************************************
+// BamReader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#ifndef BAMREADER_P_H
+#define BAMREADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <api/BamAlignment.h>
+#include <api/BamIndex.h>
+#include <api/BGZF.h>
+#include <string>
+
+namespace BamTools {
+
+class BamReader;
+
+namespace Internal {
+
+class BamReaderPrivate {
+
+ // enums
+ public: enum RegionState { BEFORE_REGION = 0
+ , WITHIN_REGION
+ , AFTER_REGION
+ };
+
+ // ctor & dtor
+ public:
+ BamReaderPrivate(BamReader* parent);
+ ~BamReaderPrivate(void);
+
+ // 'public' interface to BamReader
+ public:
+
+ // file operations
+ void Close(void);
+ bool Open(const std::string& filename,
+ const std::string& indexFilename,
+ const bool lookForIndex,
+ const bool preferStandardIndex);
+ bool Rewind(void);
+ bool SetRegion(const BamRegion& region);
+
+ // access alignment data
+ bool GetNextAlignment(BamAlignment& bAlignment);
+ bool GetNextAlignmentCore(BamAlignment& bAlignment);
+
+ // access auxiliary data
+ const std::string GetHeaderText(void) const;
+ int GetReferenceID(const std::string& refName) const;
+
+ // index operations
+ bool CreateIndex(bool useStandardIndex);
+ void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode);
+
+ // 'internal' methods
+ public:
+
+ // ---------------------------------------
+ // reading alignments and auxiliary data
+
+ // adjusts requested region if necessary (depending on where data actually begins)
+ void AdjustRegion(BamRegion& region);
+ // fills out character data for BamAlignment data
+ bool BuildCharData(BamAlignment& bAlignment);
+ // checks to see if alignment overlaps current region
+ RegionState IsOverlap(BamAlignment& bAlignment);
+ // retrieves header text from BAM file
+ void LoadHeaderData(void);
+ // retrieves BAM alignment under file pointer
+ bool LoadNextAlignment(BamAlignment& bAlignment);
+ // builds reference data structure from BAM file
+ void LoadReferenceData(void);
+ // mark references with 'HasAlignments' status
+ void MarkReferences(void);
+
+ // ---------------------------------
+ // index file handling
+
+ // clear out inernal index data structure
+ void ClearIndex(void);
+ // loads index from BAM index file
+ bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex);
+
+ // data members
+ public:
+
+ // general file data
+ BgzfData mBGZF;
+ std::string HeaderText;
+ BamIndex* Index;
+ RefVector References;
+ bool HasIndex;
+ int64_t AlignmentsBeginOffset;
+ std::string Filename;
+ std::string IndexFilename;
+
+// Internal::BamHeader* m_header;
+
+ // index caching mode
+ BamIndex::BamIndexCacheMode IndexCacheMode;
+
+ // system data
+ bool IsBigEndian;
+
+ // user-specified region values
+ BamRegion Region;
+ bool HasAlignmentsInRegion;
+
+ // parent BamReader
+ BamReader* Parent;
+
+ // BAM character constants
+ const char* DNA_LOOKUP;
+ const char* CIGAR_LOOKUP;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMREADER_P_H
// ---------------------------------------------------------------------------\r
// Last modified: 19 November 2010 (DB)\r
// ---------------------------------------------------------------------------\r
-// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
-// Institute.\r
-// ---------------------------------------------------------------------------\r
// Provides the basic functionality for producing BAM files\r
// ***************************************************************************\r
\r
#include <api/BamWriter.h>\r
-#include <api/BGZF.h>\r
+#include <api/BamWriter_p.h>\r
using namespace BamTools;\r
+using namespace BamTools::Internal;\r
\r
#include <iostream>\r
using namespace std;\r
\r
-struct BamWriter::BamWriterPrivate {\r
-\r
- // data members\r
- BgzfData mBGZF;\r
- bool IsBigEndian;\r
- \r
- // constructor / destructor\r
- BamWriterPrivate(void) { \r
- IsBigEndian = SystemIsBigEndian(); \r
- }\r
- \r
- ~BamWriterPrivate(void) {\r
- mBGZF.Close();\r
- }\r
-\r
- // "public" interface\r
- void Close(void);\r
- bool Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed);\r
- void SaveAlignment(const BamAlignment& al);\r
-\r
- // internal methods\r
- const unsigned int CalculateMinimumBin(const int begin, int end) const;\r
- void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);\r
- void EncodeQuerySequence(const string& query, string& encodedQuery);\r
-};\r
-\r
-// -----------------------------------------------------\r
-// BamWriter implementation\r
-// -----------------------------------------------------\r
-\r
// constructor\r
BamWriter::BamWriter(void) {\r
d = new BamWriterPrivate;\r
}\r
\r
// closes the alignment archive\r
-void BamWriter::Close(void) { \r
- d->Close(); \r
+void BamWriter::Close(void) {\r
+ d->Close();\r
}\r
\r
// opens the alignment archive\r
-bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
+bool BamWriter::Open(const string& filename,\r
+ const string& samHeader,\r
+ const RefVector& referenceSequences,\r
+ bool isWriteUncompressed)\r
+{\r
return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed);\r
}\r
\r
// saves the alignment to the alignment archive\r
-void BamWriter::SaveAlignment(const BamAlignment& al) { \r
+void BamWriter::SaveAlignment(const BamAlignment& al) {\r
d->SaveAlignment(al);\r
}\r
-\r
-// -----------------------------------------------------\r
-// BamWriterPrivate implementation\r
-// -----------------------------------------------------\r
-\r
-// closes the alignment archive\r
-void BamWriter::BamWriterPrivate::Close(void) {\r
- mBGZF.Close();\r
-}\r
-\r
-// calculates minimum bin for a BAM alignment interval\r
-const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { \r
- --end;\r
- if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);\r
- if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);\r
- if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);\r
- if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);\r
- if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);\r
- return 0;\r
-}\r
-\r
-// creates a cigar string from the supplied alignment\r
-void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
-\r
- // initialize\r
- const unsigned int numCigarOperations = cigarOperations.size();\r
- packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);\r
-\r
- // pack the cigar data into the string\r
- unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();\r
-\r
- unsigned int cigarOp;\r
- vector<CigarOp>::const_iterator coIter;\r
- for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {\r
-\r
- switch(coIter->Type) {\r
- case 'M':\r
- cigarOp = BAM_CMATCH;\r
- break;\r
- case 'I':\r
- cigarOp = BAM_CINS;\r
- break;\r
- case 'D':\r
- cigarOp = BAM_CDEL;\r
- break;\r
- case 'N':\r
- cigarOp = BAM_CREF_SKIP;\r
- break;\r
- case 'S':\r
- cigarOp = BAM_CSOFT_CLIP;\r
- break;\r
- case 'H':\r
- cigarOp = BAM_CHARD_CLIP;\r
- break;\r
- case 'P':\r
- cigarOp = BAM_CPAD;\r
- break;\r
- default:\r
- fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);\r
- exit(1);\r
- }\r
-\r
- *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;\r
- pPackedCigar++;\r
- }\r
-}\r
-\r
-// encodes the supplied query sequence into 4-bit notation\r
-void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {\r
-\r
- // prepare the encoded query string\r
- const unsigned int queryLen = query.size();\r
- const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);\r
- encodedQuery.resize(encodedQueryLen);\r
- char* pEncodedQuery = (char*)encodedQuery.data();\r
- const char* pQuery = (const char*)query.data();\r
-\r
- unsigned char nucleotideCode;\r
- bool useHighWord = true;\r
-\r
- while(*pQuery) {\r
-\r
- switch(*pQuery) {\r
- \r
- case '=':\r
- nucleotideCode = 0;\r
- break;\r
- \r
- case 'A':\r
- nucleotideCode = 1;\r
- break;\r
- \r
- case 'C':\r
- nucleotideCode = 2;\r
- break;\r
- \r
- case 'G':\r
- nucleotideCode = 4;\r
- break;\r
- \r
- case 'T':\r
- nucleotideCode = 8;\r
- break;\r
- \r
- case 'N':\r
- nucleotideCode = 15;\r
- break;\r
- \r
- default:\r
- fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
- exit(1);\r
- }\r
-\r
- // pack the nucleotide code\r
- if(useHighWord) {\r
- *pEncodedQuery = nucleotideCode << 4;\r
- useHighWord = false;\r
- } else {\r
- *pEncodedQuery |= nucleotideCode;\r
- pEncodedQuery++;\r
- useHighWord = true;\r
- }\r
-\r
- // increment the query position\r
- pQuery++;\r
- }\r
-}\r
-\r
-// opens the alignment archive\r
-bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
-\r
- // open the BGZF file for writing, return failure if error\r
- if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )\r
- return false;\r
-\r
- // ================\r
- // write the header\r
- // ================\r
-\r
- // write the BAM signature\r
- const unsigned char SIGNATURE_LENGTH = 4;\r
- const char* BAM_SIGNATURE = "BAM\1";\r
- mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
-\r
- // write the SAM header text length\r
- uint32_t samHeaderLen = samHeader.size();\r
- if (IsBigEndian) SwapEndian_32(samHeaderLen);\r
- mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
-\r
- // write the SAM header text\r
- if(samHeaderLen > 0) \r
- mBGZF.Write(samHeader.data(), samHeaderLen);\r
-\r
- // write the number of reference sequences\r
- uint32_t numReferenceSequences = referenceSequences.size();\r
- if (IsBigEndian) SwapEndian_32(numReferenceSequences);\r
- mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
-\r
- // =============================\r
- // write the sequence dictionary\r
- // =============================\r
-\r
- RefVector::const_iterator rsIter;\r
- for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
-\r
- // write the reference sequence name length\r
- uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
- if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);\r
- mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
-\r
- // write the reference sequence name\r
- mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
-\r
- // write the reference sequence length\r
- int32_t referenceLength = rsIter->RefLength;\r
- if (IsBigEndian) SwapEndian_32(referenceLength);\r
- mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
- }\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-// saves the alignment to the alignment archive\r
-void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
-\r
- // if BamAlignment contains only the core data and a raw char data buffer\r
- // (as a result of BamReader::GetNextAlignmentCore())\r
- if ( al.SupportData.HasCoreOnly ) {\r
- \r
- // write the block size\r
- unsigned int blockSize = al.SupportData.BlockLength;\r
- if (IsBigEndian) SwapEndian_32(blockSize);\r
- mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
-\r
- // assign the BAM core data\r
- uint32_t buffer[8];\r
- buffer[0] = al.RefID;\r
- buffer[1] = al.Position;\r
- buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;\r
- buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;\r
- buffer[4] = al.SupportData.QuerySequenceLength;\r
- buffer[5] = al.MateRefID;\r
- buffer[6] = al.MatePosition;\r
- buffer[7] = al.InsertSize;\r
- \r
- // swap BAM core endian-ness, if necessary\r
- if ( IsBigEndian ) { \r
- for ( int i = 0; i < 8; ++i )\r
- SwapEndian_32(buffer[i]); \r
- }\r
- \r
- // write the BAM core\r
- mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
- \r
- // write the raw char data\r
- mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); \r
- }\r
- \r
- // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc\r
- // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )\r
- else {\r
- \r
- // calculate char lengths\r
- const unsigned int nameLength = al.Name.size() + 1;\r
- const unsigned int numCigarOperations = al.CigarData.size();\r
- const unsigned int queryLength = al.QueryBases.size();\r
- const unsigned int tagDataLength = al.TagData.size();\r
- \r
- // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)\r
- // force calculation of Bin before storing\r
- const int endPosition = al.GetEndPosition();\r
- const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);\r
- \r
- // create our packed cigar string\r
- string packedCigar;\r
- CreatePackedCigar(al.CigarData, packedCigar);\r
- const unsigned int packedCigarLength = packedCigar.size();\r
-\r
- // encode the query\r
- string encodedQuery;\r
- EncodeQuerySequence(al.QueryBases, encodedQuery);\r
- const unsigned int encodedQueryLength = encodedQuery.size(); \r
- \r
- // write the block size\r
- const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;\r
- unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
- if (IsBigEndian) SwapEndian_32(blockSize);\r
- mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
-\r
- // assign the BAM core data\r
- uint32_t buffer[8];\r
- buffer[0] = al.RefID;\r
- buffer[1] = al.Position;\r
- buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;\r
- buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
- buffer[4] = queryLength;\r
- buffer[5] = al.MateRefID;\r
- buffer[6] = al.MatePosition;\r
- buffer[7] = al.InsertSize;\r
- \r
- // swap BAM core endian-ness, if necessary\r
- if ( IsBigEndian ) { \r
- for ( int i = 0; i < 8; ++i )\r
- SwapEndian_32(buffer[i]); \r
- }\r
- \r
- // write the BAM core\r
- mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
- \r
- // write the query name\r
- mBGZF.Write(al.Name.c_str(), nameLength);\r
-\r
- // write the packed cigar\r
- if ( IsBigEndian ) {\r
- \r
- char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);\r
- memcpy(cigarData, packedCigar.data(), packedCigarLength);\r
- \r
- for (unsigned int i = 0; i < packedCigarLength; ++i) {\r
- if ( IsBigEndian )\r
- SwapEndian_32p(&cigarData[i]); \r
- }\r
- \r
- mBGZF.Write(cigarData, packedCigarLength);\r
- free(cigarData); \r
- } \r
- else \r
- mBGZF.Write(packedCigar.data(), packedCigarLength);\r
-\r
- // write the encoded query sequence\r
- mBGZF.Write(encodedQuery.data(), encodedQueryLength);\r
-\r
- // write the base qualities\r
- string baseQualities(al.Qualities);\r
- char* pBaseQualities = (char*)al.Qualities.data();\r
- for(unsigned int i = 0; i < queryLength; i++) { \r
- pBaseQualities[i] -= 33; \r
- }\r
- mBGZF.Write(pBaseQualities, queryLength);\r
-\r
- // write the read group tag\r
- if ( IsBigEndian ) {\r
- \r
- char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
- memcpy(tagData, al.TagData.data(), tagDataLength);\r
- \r
- int i = 0;\r
- while ( (unsigned int)i < tagDataLength ) {\r
- \r
- i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
- uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
- ++i; // skip value type\r
- \r
- switch (type) {\r
- \r
- case('A') :\r
- case('C') : \r
- ++i;\r
- break;\r
- \r
- case('S') : \r
- SwapEndian_16p(&tagData[i]); \r
- i+=2; // sizeof(uint16_t)\r
- break;\r
- \r
- case('F') :\r
- case('I') : \r
- SwapEndian_32p(&tagData[i]);\r
- i+=4; // sizeof(uint32_t)\r
- break;\r
- \r
- case('D') : \r
- SwapEndian_64p(&tagData[i]);\r
- i+=8; // sizeof(uint64_t)\r
- break;\r
- \r
- case('H') :\r
- case('Z') : \r
- while (tagData[i]) { ++i; }\r
- ++i; // increment one more for null terminator\r
- break;\r
- \r
- default : \r
- fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here\r
- free(tagData);\r
- exit(1); \r
- }\r
- }\r
- \r
- mBGZF.Write(tagData, tagDataLength);\r
- free(tagData);\r
- } \r
- else \r
- mBGZF.Write(al.TagData.data(), tagDataLength); \r
- }\r
-}\r
// ---------------------------------------------------------------------------\r
// Last modified: 19 November 2010 (DB)\r
// ---------------------------------------------------------------------------\r
-// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
-// Institute.\r
-// ---------------------------------------------------------------------------\r
// Provides the basic functionality for producing BAM files\r
// ***************************************************************************\r
\r
\r
namespace BamTools {\r
\r
+namespace Internal {\r
+ class BamWriterPrivate;\r
+} // namespace Internal\r
+\r
class API_EXPORT BamWriter {\r
\r
// constructor/destructor\r
\r
// private implementation\r
private:\r
- struct BamWriterPrivate;\r
- BamWriterPrivate* d;\r
+ Internal::BamWriterPrivate* d;\r
};\r
\r
} // namespace BamTools\r
--- /dev/null
+// ***************************************************************************
+// BamWriter_p.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include <api/BamAlignment.h>
+#include <api/BamWriter_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+using namespace std;
+
+BamWriterPrivate::BamWriterPrivate(void) {
+ IsBigEndian = SystemIsBigEndian();
+}
+
+BamWriterPrivate::~BamWriterPrivate(void) {
+ mBGZF.Close();
+}
+
+// closes the alignment archive
+void BamWriterPrivate::Close(void) {
+ mBGZF.Close();
+}
+
+// calculates minimum bin for a BAM alignment interval
+const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
+ --end;
+ if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
+ if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);
+ if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);
+ if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);
+ if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);
+ return 0;
+}
+
+// creates a cigar string from the supplied alignment
+void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
+
+ // initialize
+ const unsigned int numCigarOperations = cigarOperations.size();
+ packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
+
+ // pack the cigar data into the string
+ unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
+
+ unsigned int cigarOp;
+ vector<CigarOp>::const_iterator coIter;
+ for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {
+
+ switch(coIter->Type) {
+ case 'M':
+ cigarOp = BAM_CMATCH;
+ break;
+ case 'I':
+ cigarOp = BAM_CINS;
+ break;
+ case 'D':
+ cigarOp = BAM_CDEL;
+ break;
+ case 'N':
+ cigarOp = BAM_CREF_SKIP;
+ break;
+ case 'S':
+ cigarOp = BAM_CSOFT_CLIP;
+ break;
+ case 'H':
+ cigarOp = BAM_CHARD_CLIP;
+ break;
+ case 'P':
+ cigarOp = BAM_CPAD;
+ break;
+ default:
+ fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
+ exit(1);
+ }
+
+ *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
+ pPackedCigar++;
+ }
+}
+
+// encodes the supplied query sequence into 4-bit notation
+void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
+
+ // prepare the encoded query string
+ const unsigned int queryLen = query.size();
+ const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
+ encodedQuery.resize(encodedQueryLen);
+ char* pEncodedQuery = (char*)encodedQuery.data();
+ const char* pQuery = (const char*)query.data();
+
+ unsigned char nucleotideCode;
+ bool useHighWord = true;
+
+ while(*pQuery) {
+
+ switch(*pQuery) {
+
+ case '=':
+ nucleotideCode = 0;
+ break;
+
+ case 'A':
+ nucleotideCode = 1;
+ break;
+
+ case 'C':
+ nucleotideCode = 2;
+ break;
+
+ case 'G':
+ nucleotideCode = 4;
+ break;
+
+ case 'T':
+ nucleotideCode = 8;
+ break;
+
+ case 'N':
+ nucleotideCode = 15;
+ break;
+
+ default:
+ fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
+ exit(1);
+ }
+
+ // pack the nucleotide code
+ if(useHighWord) {
+ *pEncodedQuery = nucleotideCode << 4;
+ useHighWord = false;
+ } else {
+ *pEncodedQuery |= nucleotideCode;
+ pEncodedQuery++;
+ useHighWord = true;
+ }
+
+ // increment the query position
+ pQuery++;
+ }
+}
+
+// opens the alignment archive
+bool BamWriterPrivate::Open(const string& filename,
+ const string& samHeader,
+ const RefVector& referenceSequences,
+ bool isWriteUncompressed)
+{
+ // open the BGZF file for writing, return failure if error
+ if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
+ return false;
+
+ // ================
+ // write the header
+ // ================
+
+ // write the BAM signature
+ const unsigned char SIGNATURE_LENGTH = 4;
+ const char* BAM_SIGNATURE = "BAM\1";
+ mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
+
+ // write the SAM header text length
+ uint32_t samHeaderLen = samHeader.size();
+ if (IsBigEndian) SwapEndian_32(samHeaderLen);
+ mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);
+
+ // write the SAM header text
+ if(samHeaderLen > 0)
+ mBGZF.Write(samHeader.data(), samHeaderLen);
+
+ // write the number of reference sequences
+ uint32_t numReferenceSequences = referenceSequences.size();
+ if (IsBigEndian) SwapEndian_32(numReferenceSequences);
+ mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
+
+ // =============================
+ // write the sequence dictionary
+ // =============================
+
+ RefVector::const_iterator rsIter;
+ for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
+
+ // write the reference sequence name length
+ uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
+ if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
+ mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
+
+ // write the reference sequence name
+ mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
+
+ // write the reference sequence length
+ int32_t referenceLength = rsIter->RefLength;
+ if (IsBigEndian) SwapEndian_32(referenceLength);
+ mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
+ }
+
+ // return success
+ return true;
+}
+
+// saves the alignment to the alignment archive
+void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
+
+ // if BamAlignment contains only the core data and a raw char data buffer
+ // (as a result of BamReader::GetNextAlignmentCore())
+ if ( al.SupportData.HasCoreOnly ) {
+
+ // write the block size
+ unsigned int blockSize = al.SupportData.BlockLength;
+ if (IsBigEndian) SwapEndian_32(blockSize);
+ mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
+
+ // assign the BAM core data
+ uint32_t buffer[8];
+ buffer[0] = al.RefID;
+ buffer[1] = al.Position;
+ buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
+ buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
+ buffer[4] = al.SupportData.QuerySequenceLength;
+ buffer[5] = al.MateRefID;
+ buffer[6] = al.MatePosition;
+ buffer[7] = al.InsertSize;
+
+ // swap BAM core endian-ness, if necessary
+ if ( IsBigEndian ) {
+ for ( int i = 0; i < 8; ++i )
+ SwapEndian_32(buffer[i]);
+ }
+
+ // write the BAM core
+ mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
+
+ // write the raw char data
+ mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
+ }
+
+ // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
+ // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
+ else {
+
+ // calculate char lengths
+ const unsigned int nameLength = al.Name.size() + 1;
+ const unsigned int numCigarOperations = al.CigarData.size();
+ const unsigned int queryLength = al.QueryBases.size();
+ const unsigned int tagDataLength = al.TagData.size();
+
+ // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
+ // force calculation of Bin before storing
+ const int endPosition = al.GetEndPosition();
+ const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
+
+ // create our packed cigar string
+ string packedCigar;
+ CreatePackedCigar(al.CigarData, packedCigar);
+ const unsigned int packedCigarLength = packedCigar.size();
+
+ // encode the query
+ string encodedQuery;
+ EncodeQuerySequence(al.QueryBases, encodedQuery);
+ const unsigned int encodedQueryLength = encodedQuery.size();
+
+ // write the block size
+ const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;
+ unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
+ if (IsBigEndian) SwapEndian_32(blockSize);
+ mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
+
+ // assign the BAM core data
+ uint32_t buffer[8];
+ buffer[0] = al.RefID;
+ buffer[1] = al.Position;
+ buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
+ buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
+ buffer[4] = queryLength;
+ buffer[5] = al.MateRefID;
+ buffer[6] = al.MatePosition;
+ buffer[7] = al.InsertSize;
+
+ // swap BAM core endian-ness, if necessary
+ if ( IsBigEndian ) {
+ for ( int i = 0; i < 8; ++i )
+ SwapEndian_32(buffer[i]);
+ }
+
+ // write the BAM core
+ mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
+
+ // write the query name
+ mBGZF.Write(al.Name.c_str(), nameLength);
+
+ // write the packed cigar
+ if ( IsBigEndian ) {
+
+ char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
+ memcpy(cigarData, packedCigar.data(), packedCigarLength);
+
+ for (unsigned int i = 0; i < packedCigarLength; ++i) {
+ if ( IsBigEndian )
+ SwapEndian_32p(&cigarData[i]);
+ }
+
+ mBGZF.Write(cigarData, packedCigarLength);
+ free(cigarData);
+ }
+ else
+ mBGZF.Write(packedCigar.data(), packedCigarLength);
+
+ // write the encoded query sequence
+ mBGZF.Write(encodedQuery.data(), encodedQueryLength);
+
+ // write the base qualities
+ string baseQualities(al.Qualities);
+ char* pBaseQualities = (char*)al.Qualities.data();
+ for(unsigned int i = 0; i < queryLength; i++) {
+ pBaseQualities[i] -= 33;
+ }
+ mBGZF.Write(pBaseQualities, queryLength);
+
+ // write the read group tag
+ if ( IsBigEndian ) {
+
+ char* tagData = (char*)calloc(sizeof(char), tagDataLength);
+ memcpy(tagData, al.TagData.data(), tagDataLength);
+
+ int i = 0;
+ while ( (unsigned int)i < tagDataLength ) {
+
+ i += 2; // skip tag type (e.g. "RG", "NM", etc)
+ uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
+ ++i; // skip value type
+
+ switch (type) {
+
+ case('A') :
+ case('C') :
+ ++i;
+ break;
+
+ case('S') :
+ SwapEndian_16p(&tagData[i]);
+ i+=2; // sizeof(uint16_t)
+ break;
+
+ case('F') :
+ case('I') :
+ SwapEndian_32p(&tagData[i]);
+ i+=4; // sizeof(uint32_t)
+ break;
+
+ case('D') :
+ SwapEndian_64p(&tagData[i]);
+ i+=8; // sizeof(uint64_t)
+ break;
+
+ case('H') :
+ case('Z') :
+ while (tagData[i]) { ++i; }
+ ++i; // increment one more for null terminator
+ break;
+
+ default :
+ fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
+ free(tagData);
+ exit(1);
+ }
+ }
+
+ mBGZF.Write(tagData, tagDataLength);
+ free(tagData);
+ }
+ else
+ mBGZF.Write(al.TagData.data(), tagDataLength);
+ }
+}
--- /dev/null
+// ***************************************************************************
+// BamWriter_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#ifndef BAMWRITER_P_H
+#define BAMWRITER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include <api/BamAux.h>
+#include <api/BGZF.h>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+namespace Internal {
+
+class BamWriterPrivate {
+
+ // ctor & dtor
+ public:
+ BamWriterPrivate(void);
+ ~BamWriterPrivate(void);
+
+ // "public" interface to BamWriter
+ public:
+ void Close(void);
+ bool Open(const std::string& filename,
+ const std::string& samHeader,
+ const BamTools::RefVector& referenceSequences,
+ bool isWriteUncompressed);
+ void SaveAlignment(const BamAlignment& al);
+
+ // internal methods
+ public:
+ const unsigned int CalculateMinimumBin(const int begin, int end) const;
+ void CreatePackedCigar(const std::vector<BamTools::CigarOp>& cigarOperations, std::string& packedCigar);
+ void EncodeQuerySequence(const std::string& query, std::string& encodedQuery);
+
+ // data members
+ public:
+ BgzfData mBGZF;
+ bool IsBigEndian;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMWRITER_P_H
BamIndex.cpp
BamMultiReader.cpp
BamReader.cpp
+ BamReader_p.cpp
BamWriter.cpp
+ BamWriter_p.cpp
BGZF.cpp
)