From: derek Date: Mon, 22 Nov 2010 18:02:38 +0000 (-0500) Subject: Moved private implementation API files to internal directory for clearer separation... X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=369e2de20a6d939d07ffc09462167a3b688bbdde;p=bamtools.git Moved private implementation API files to internal directory for clearer separation. These should not be included by client code --- diff --git a/src/api/BamIndex.cpp b/src/api/BamIndex.cpp index 053f8e9..d0230b7 100644 --- a/src/api/BamIndex.cpp +++ b/src/api/BamIndex.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 22 November 2010 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the default (standardized) BAM // index format (.bai) as well as a BamTools-specific (nonstandard) index @@ -13,8 +13,8 @@ #include #include #include -#include -#include +#include +#include using namespace BamTools; using namespace BamTools::Internal; diff --git a/src/api/BamReader.cpp b/src/api/BamReader.cpp index 2650045..bd4bff9 100644 --- a/src/api/BamReader.cpp +++ b/src/api/BamReader.cpp @@ -3,13 +3,13 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 22 November 2010 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** #include -#include +#include using namespace BamTools; using namespace BamTools::Internal; diff --git a/src/api/BamReader_p.cpp b/src/api/BamReader_p.cpp deleted file mode 100644 index 792be22..0000000 --- a/src/api/BamReader_p.cpp +++ /dev/null @@ -1,720 +0,0 @@ -// *************************************************************************** -// BamReader_p.cpp (c) 2009 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -#include -#include -#include -#include -#include - -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -#include -#include -using namespace std; - -// constructor -BamReaderPrivate::BamReaderPrivate(BamReader* parent) - : HeaderText("") - , Index(0) - , HasIndex(false) - , AlignmentsBeginOffset(0) -// , m_header(0) - , IndexCacheMode(BamIndex::LimitedIndexCaching) - , HasAlignmentsInRegion(true) - , Parent(parent) - , DNA_LOOKUP("=ACMGRSVTWYHKDBN") - , CIGAR_LOOKUP("MIDNSHP") -{ - IsBigEndian = SystemIsBigEndian(); -} - -// destructor -BamReaderPrivate::~BamReaderPrivate(void) { - Close(); -} - -// adjusts requested region if necessary (depending on where data actually begins) -void BamReaderPrivate::AdjustRegion(BamRegion& region) { - - // check for valid index first - if ( Index == 0 ) return; - - // see if any references in region have alignments - HasAlignmentsInRegion = false; - int currentId = region.LeftRefID; - while ( currentId <= region.RightRefID ) { - HasAlignmentsInRegion = Index->HasAlignments(currentId); - if ( HasAlignmentsInRegion ) break; - ++currentId; - } - - // if no data found on any reference in region - if ( !HasAlignmentsInRegion ) return; - - // if left bound of desired region had no data, use first reference that had data - // otherwise, leave requested region as-is - if ( currentId != region.LeftRefID ) { - region.LeftRefID = currentId; - region.LeftPosition = 0; - } -} - -// fills out character data for BamAlignment data -bool BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { - - // calculate character lengths/offsets - const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; - const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4); - const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; - const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; - const unsigned int tagDataLength = dataLength - tagDataOffset; - - // set up char buffers - const char* allCharData = bAlignment.SupportData.AllCharData.data(); - const char* seqData = ((const char*)allCharData) + seqDataOffset; - const char* qualData = ((const char*)allCharData) + qualDataOffset; - char* tagData = ((char*)allCharData) + tagDataOffset; - - // store alignment name (relies on null char in name as terminator) - bAlignment.Name.assign((const char*)(allCharData)); - - // save query sequence - bAlignment.QueryBases.clear(); - bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength); - for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { - char singleBase = DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; - bAlignment.QueryBases.append(1, singleBase); - } - - // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character - bAlignment.Qualities.clear(); - bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength); - for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { - char singleQuality = (char)(qualData[i]+33); - bAlignment.Qualities.append(1, singleQuality); - } - - // if QueryBases is empty (and this is a allowed case) - if ( bAlignment.QueryBases.empty() ) - bAlignment.AlignedBases = bAlignment.QueryBases; - - // if QueryBases contains data, then build AlignedBases using CIGAR data - else { - - // resize AlignedBases - bAlignment.AlignedBases.clear(); - bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength); - - // iterate over CigarOps - int k = 0; - vector::const_iterator cigarIter = bAlignment.CigarData.begin(); - vector::const_iterator cigarEnd = bAlignment.CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter ) { - - const CigarOp& op = (*cigarIter); - switch(op.Type) { - - case ('M') : - case ('I') : - bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases - // fall through - - case ('S') : - k += op.Length; // for 'S' - soft clip, skip over query bases - break; - - case ('D') : - bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character - break; - - case ('P') : - bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character - break; - - case ('N') : - bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence - break; - - case ('H') : - break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op - - default: - fprintf(stderr, "ERROR: Invalid Cigar op type\n"); // shouldn't get here - exit(1); - } - } - } - - // ----------------------- - // Added: 3-25-2010 DB - // Fixed: endian-correctness for tag data - // ----------------------- - if ( IsBigEndian ) { - int i = 0; - while ( (unsigned int)i < tagDataLength ) { - - i += 2; // skip tag type (e.g. "RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning - ++i; // skip value type - - switch (type) { - - case('A') : - case('C') : - ++i; - break; - - case('S') : - SwapEndian_16p(&tagData[i]); - i += sizeof(uint16_t); - break; - - case('F') : - case('I') : - SwapEndian_32p(&tagData[i]); - i += sizeof(uint32_t); - break; - - case('D') : - SwapEndian_64p(&tagData[i]); - i += sizeof(uint64_t); - break; - - case('H') : - case('Z') : - while (tagData[i]) { ++i; } - ++i; // increment one more for null terminator - break; - - default : - fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here - exit(1); - } - } - } - - // store TagData - bAlignment.TagData.clear(); - bAlignment.TagData.resize(tagDataLength); - memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength); - - // clear the core-only flag - bAlignment.SupportData.HasCoreOnly = false; - - // return success - return true; -} - -// clear index data structure -void BamReaderPrivate::ClearIndex(void) { - delete Index; - Index = 0; - HasIndex = false; -} - -// closes the BAM file -void BamReaderPrivate::Close(void) { - - // close BGZF file stream - mBGZF.Close(); - - // clear out index data - ClearIndex(); - - // clear out header data - HeaderText.clear(); -// if ( m_header ) { -// delete m_header; -// m_header = 0; -// } - - // clear out region flags - Region.clear(); -} - -// creates index for BAM file, saves to file -// default behavior is to create the BAM standard index (".bai") -// set flag to false to create the BamTools-specific index (".bti") -bool BamReaderPrivate::CreateIndex(bool useStandardIndex) { - - // clear out prior index data - ClearIndex(); - - // create index based on type requested - if ( useStandardIndex ) - Index = new BamStandardIndex(&mBGZF, Parent); - else - Index = new BamToolsIndex(&mBGZF, Parent); - - // set index cache mode to full for writing - Index->SetCacheMode(BamIndex::FullIndexCaching); - - // build new index - bool ok = true; - ok &= Index->Build(); - HasIndex = ok; - - // mark empty references - MarkReferences(); - - // attempt to save index data to file - ok &= Index->Write(Filename); - - // set client's desired index cache mode - Index->SetCacheMode(IndexCacheMode); - - // return success/fail of both building & writing index - return ok; -} - -const string BamReaderPrivate::GetHeaderText(void) const { - - return HeaderText; - -// if ( m_header ) -// return m_header->Text(); -// else -// return string(""); -} - -// get next alignment (from specified region, if given) -bool BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { - - // if valid alignment found, attempt to parse char data, and return success/failure - if ( GetNextAlignmentCore(bAlignment) ) - return BuildCharData(bAlignment); - - // no valid alignment found - else return false; -} - -// retrieves next available alignment core data (returns success/fail) -// ** DOES NOT parse any character data (read name, bases, qualities, tag data) -// these can be accessed, if necessary, from the supportData -// useful for operations requiring ONLY positional or other alignment-related information -bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) { - - // if region is set but has no alignments - if ( !Region.isNull() && !HasAlignmentsInRegion ) - return false; - - // if valid alignment available - if ( LoadNextAlignment(bAlignment) ) { - - // set core-only flag - bAlignment.SupportData.HasCoreOnly = true; - - // if region not specified with at least a left boundary, return success - if ( !Region.isLeftBoundSpecified() ) return true; - - // determine region state (before, within, after) - BamReaderPrivate::RegionState state = IsOverlap(bAlignment); - - // if alignment lies after region, return false - if ( state == AFTER_REGION ) return false; - - while ( state != WITHIN_REGION ) { - // if no valid alignment available (likely EOF) return failure - if ( !LoadNextAlignment(bAlignment) ) return false; - // if alignment lies after region, return false (no available read within region) - state = IsOverlap(bAlignment); - if ( state == AFTER_REGION ) return false; - } - - // return success (alignment found that overlaps region) - return true; - } - - // no valid alignment - else return false; -} - -// returns RefID for given RefName (returns References.size() if not found) -int BamReaderPrivate::GetReferenceID(const string& refName) const { - - // retrieve names from reference data - vector refNames; - RefVector::const_iterator refIter = References.begin(); - RefVector::const_iterator refEnd = References.end(); - for ( ; refIter != refEnd; ++refIter) - refNames.push_back( (*refIter).RefName ); - - // return 'index-of' refName ( if not found, returns refNames.size() ) - return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); -} - -// returns region state - whether alignment ends before, overlaps, or starts after currently specified region -// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true -BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { - - // if alignment is on any reference sequence before left bound - if ( bAlignment.RefID < Region.LeftRefID ) return BEFORE_REGION; - - // if alignment starts on left bound reference - else if ( bAlignment.RefID == Region.LeftRefID ) { - - // if alignment starts at or after left boundary - if ( bAlignment.Position >= Region.LeftPosition) { - - // if right boundary is specified AND - // left/right boundaries are on same reference AND - // alignment starts past right boundary - if ( Region.isRightBoundSpecified() && - Region.LeftRefID == Region.RightRefID && - bAlignment.Position > Region.RightPosition ) - return AFTER_REGION; - - // otherwise, alignment is within region - return WITHIN_REGION; - } - - // alignment starts before left boundary - else { - // check if alignment overlaps left boundary - if ( bAlignment.GetEndPosition() >= Region.LeftPosition ) return WITHIN_REGION; - else return BEFORE_REGION; - } - } - - // alignment starts on a reference after the left bound - else { - - // if region has a right boundary - if ( Region.isRightBoundSpecified() ) { - - // alignment is on reference between boundaries - if ( bAlignment.RefID < Region.RightRefID ) return WITHIN_REGION; - - // alignment is on reference after right boundary - else if ( bAlignment.RefID > Region.RightRefID ) return AFTER_REGION; - - // alignment is on right bound reference - else { - // check if alignment starts before or at right boundary - if ( bAlignment.Position <= Region.RightPosition ) return WITHIN_REGION; - else return AFTER_REGION; - } - } - - // otherwise, alignment is after left bound reference, but there is no right boundary - else return WITHIN_REGION; - } -} - -// load BAM header data -void BamReaderPrivate::LoadHeaderData(void) { - -// m_header = new BamHeader(&mBGZF); -// bool headerLoadedOk = m_header->Load(); -// if ( !headerLoadedOk ) -// cerr << "BamReader could not load header" << endl; - - // check to see if proper BAM header - char buffer[4]; - if (mBGZF.Read(buffer, 4) != 4) { - fprintf(stderr, "Could not read header type\n"); - exit(1); - } - - if (strncmp(buffer, "BAM\001", 4)) { - fprintf(stderr, "wrong header type!\n"); - exit(1); - } - - // get BAM header text length - mBGZF.Read(buffer, 4); - unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) SwapEndian_32(headerTextLength); - - // get BAM header text - char* headerText = (char*)calloc(headerTextLength + 1, 1); - mBGZF.Read(headerText, headerTextLength); - HeaderText = (string)((const char*)headerText); - - // clean up calloc-ed temp variable - free(headerText); -} - -// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail -bool BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) { - - // clear out any existing index data - ClearIndex(); - - // if no index filename provided, so we need to look for available index files - if ( IndexFilename.empty() ) { - - // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag - const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS); - Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, type); - - // if null, return failure - if ( Index == 0 ) return false; - - // generate proper IndexFilename based on type of index created - IndexFilename = Filename + Index->Extension(); - } - - else { - - // attempt to load BamIndex based on IndexFilename provided by client - Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent); - - // if null, return failure - if ( Index == 0 ) return false; - } - - // set cache mode for BamIndex - Index->SetCacheMode(IndexCacheMode); - - // loading the index data from file - HasIndex = Index->Load(IndexFilename); - - // mark empty references - MarkReferences(); - - // return index status - return HasIndex; -} - -// populates BamAlignment with alignment data under file pointer, returns success/fail -bool BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { - - // read in the 'block length' value, make sure it's not zero - char buffer[4]; - mBGZF.Read(buffer, 4); - bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); } - if ( bAlignment.SupportData.BlockLength == 0 ) return false; - - // read in core alignment data, make sure the right size of data was read - char x[BAM_CORE_SIZE]; - if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false; - - if ( IsBigEndian ) { - for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) - SwapEndian_32p(&x[i]); - } - - // set BamAlignment 'core' and 'support' data - bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); - bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]); - - unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]); - bAlignment.Bin = tempValue >> 16; - bAlignment.MapQuality = tempValue >> 8 & 0xff; - bAlignment.SupportData.QueryNameLength = tempValue & 0xff; - - tempValue = BgzfData::UnpackUnsignedInt(&x[12]); - bAlignment.AlignmentFlag = tempValue >> 16; - bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff; - - bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]); - bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]); - bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); - bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); - - // set BamAlignment length - bAlignment.Length = bAlignment.SupportData.QuerySequenceLength; - - // read in character data - make sure proper data size was read - bool readCharDataOK = false; - const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; - char* allCharData = (char*)calloc(sizeof(char), dataLength); - - if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { - - // store 'allCharData' in supportData structure - bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); - - // set success flag - readCharDataOK = true; - - // save CIGAR ops - // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly, - // even when GetNextAlignmentCore() is called - const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength; - uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); - CigarOp op; - bAlignment.CigarData.clear(); - bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations); - for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) { - - // swap if necessary - if ( IsBigEndian ) SwapEndian_32(cigarData[i]); - - // build CigarOp structure - op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); - op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; - - // save CigarOp - bAlignment.CigarData.push_back(op); - } - } - - free(allCharData); - return readCharDataOK; -} - -// loads reference data from BAM file -void BamReaderPrivate::LoadReferenceData(void) { - - // get number of reference sequences - char buffer[4]; - mBGZF.Read(buffer, 4); - unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) SwapEndian_32(numberRefSeqs); - if ( numberRefSeqs == 0 ) return; - References.reserve((int)numberRefSeqs); - - // iterate over all references in header - for (unsigned int i = 0; i != numberRefSeqs; ++i) { - - // get length of reference name - mBGZF.Read(buffer, 4); - unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) SwapEndian_32(refNameLength); - char* refName = (char*)calloc(refNameLength, 1); - - // get reference name and reference sequence length - mBGZF.Read(refName, refNameLength); - mBGZF.Read(buffer, 4); - int refLength = BgzfData::UnpackSignedInt(buffer); - if ( IsBigEndian ) SwapEndian_32(refLength); - - // store data for reference - RefData aReference; - aReference.RefName = (string)((const char*)refName); - aReference.RefLength = refLength; - References.push_back(aReference); - - // clean up calloc-ed temp variable - free(refName); - } -} - -// mark references with no alignment data -void BamReaderPrivate::MarkReferences(void) { - - // ensure index is available - if ( !HasIndex ) return; - - // mark empty references - for ( int i = 0; i < (int)References.size(); ++i ) - References.at(i).RefHasAlignments = Index->HasAlignments(i); -} - -// opens BAM file (and index) -bool BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) { - - // store filenames - Filename = filename; - IndexFilename = indexFilename; - - // open the BGZF file for reading, return false on failure - if ( !mBGZF.Open(filename, "rb") ) return false; - - // retrieve header text & reference data - LoadHeaderData(); - LoadReferenceData(); - - // store file offset of first alignment - AlignmentsBeginOffset = mBGZF.Tell(); - - // if no index filename provided - if ( IndexFilename.empty() ) { - - // client did not specify that index SHOULD be found - // useful for cases where sequential access is all that is required - if ( !lookForIndex ) return true; - - // otherwise, look for index file, return success/fail - return LoadIndex(lookForIndex, preferStandardIndex) ; - } - - // client supplied an index filename - // attempt to load index data, return success/fail - return LoadIndex(lookForIndex, preferStandardIndex); -} - -// returns BAM file pointer to beginning of alignment data -bool BamReaderPrivate::Rewind(void) { - - // rewind to first alignment, return false if unable to seek - if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false; - - // retrieve first alignment data, return false if unable to read - BamAlignment al; - if ( !LoadNextAlignment(al) ) return false; - - // reset default region info using first alignment in file - Region.clear(); - HasAlignmentsInRegion = true; - - // rewind back to beginning of first alignment - // return success/fail of seek - return mBGZF.Seek(AlignmentsBeginOffset); -} - -// change the index caching behavior -void BamReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { - IndexCacheMode = mode; - if ( Index == 0 ) return; - Index->SetCacheMode(mode); -} - -// asks Index to attempt a Jump() to specified region -// returns success/failure -bool BamReaderPrivate::SetRegion(const BamRegion& region) { - - // clear out any prior BamReader region data - // - // N.B. - this is cleared so that BamIndex now has free reign to call - // GetNextAlignmentCore() and do overlap checking without worrying about BamReader - // performing any overlap checking of its own and moving on to the next read... Calls - // to GetNextAlignmentCore() with no Region set, simply return the next alignment. - // This ensures that the Index is able to do just that. (All without exposing - // LoadNextAlignment() to the public API, and potentially confusing clients with the nomenclature) - Region.clear(); - - // check for existing index - if ( !HasIndex ) return false; - - // adjust region if necessary to reflect where data actually begins - BamRegion adjustedRegion(region); - AdjustRegion(adjustedRegion); - - // if no data present, return true - // not an error, but BamReader knows that no data is there for future alignment access - // (this is useful in a MultiBamReader setting where some BAM files may lack data in regions - // that other BAMs have data) - if ( !HasAlignmentsInRegion ) { - Region = adjustedRegion; - return true; - } - - // attempt jump to user-specified region return false if jump could not be performed at all - // (invalid index, unknown reference, etc) - // - // Index::Jump() is allowed to modify the HasAlignmentsInRegion flag - // * This covers case where a region is requested that lies beyond the last alignment on a reference - // If this occurs, any subsequent calls to GetNexAlignment[Core] simply return false - // BamMultiReader is then able to successfully pull alignments from a region from multiple files - // even if one or more have no data. - if ( !Index->Jump(adjustedRegion, &HasAlignmentsInRegion) ) return false; - - // save region and return success - Region = adjustedRegion; - return true; -} diff --git a/src/api/BamReader_p.h b/src/api/BamReader_p.h deleted file mode 100644 index 8c3172f..0000000 --- a/src/api/BamReader_p.h +++ /dev/null @@ -1,137 +0,0 @@ -// *************************************************************************** -// BamReader_p.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -#ifndef BAMREADER_P_H -#define BAMREADER_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. - -#include -#include -#include -#include - -namespace BamTools { - -class BamReader; - -namespace Internal { - -class BamReaderPrivate { - - // enums - public: enum RegionState { BEFORE_REGION = 0 - , WITHIN_REGION - , AFTER_REGION - }; - - // ctor & dtor - public: - BamReaderPrivate(BamReader* parent); - ~BamReaderPrivate(void); - - // 'public' interface to BamReader - public: - - // file operations - void Close(void); - bool Open(const std::string& filename, - const std::string& indexFilename, - const bool lookForIndex, - const bool preferStandardIndex); - bool Rewind(void); - bool SetRegion(const BamRegion& region); - - // access alignment data - bool GetNextAlignment(BamAlignment& bAlignment); - bool GetNextAlignmentCore(BamAlignment& bAlignment); - - // access auxiliary data - const std::string GetHeaderText(void) const; - int GetReferenceID(const std::string& refName) const; - - // index operations - bool CreateIndex(bool useStandardIndex); - void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); - - // 'internal' methods - public: - - // --------------------------------------- - // reading alignments and auxiliary data - - // adjusts requested region if necessary (depending on where data actually begins) - void AdjustRegion(BamRegion& region); - // fills out character data for BamAlignment data - bool BuildCharData(BamAlignment& bAlignment); - // checks to see if alignment overlaps current region - RegionState IsOverlap(BamAlignment& bAlignment); - // retrieves header text from BAM file - void LoadHeaderData(void); - // retrieves BAM alignment under file pointer - bool LoadNextAlignment(BamAlignment& bAlignment); - // builds reference data structure from BAM file - void LoadReferenceData(void); - // mark references with 'HasAlignments' status - void MarkReferences(void); - - // --------------------------------- - // index file handling - - // clear out inernal index data structure - void ClearIndex(void); - // loads index from BAM index file - bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex); - - // data members - public: - - // general file data - BgzfData mBGZF; - std::string HeaderText; - BamIndex* Index; - RefVector References; - bool HasIndex; - int64_t AlignmentsBeginOffset; - std::string Filename; - std::string IndexFilename; - -// Internal::BamHeader* m_header; - - // index caching mode - BamIndex::BamIndexCacheMode IndexCacheMode; - - // system data - bool IsBigEndian; - - // user-specified region values - BamRegion Region; - bool HasAlignmentsInRegion; - - // parent BamReader - BamReader* Parent; - - // BAM character constants - const char* DNA_LOOKUP; - const char* CIGAR_LOOKUP; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMREADER_P_H diff --git a/src/api/BamStandardIndex.cpp b/src/api/BamStandardIndex.cpp deleted file mode 100644 index 28db076..0000000 --- a/src/api/BamStandardIndex.cpp +++ /dev/null @@ -1,910 +0,0 @@ -// *************************************************************************** -// BamStandardIndex.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index operations for the standardized BAM index format (".bai") -// *************************************************************************** - -#include -#include -#include -#include -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -#include -#include -#include -using namespace std; - -BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader) - : BamIndex(bgzf, reader) - , m_dataBeginOffset(0) - , m_hasFullDataCache(false) -{ - m_isBigEndian = BamTools::SystemIsBigEndian(); -} - -BamStandardIndex::~BamStandardIndex(void) { - ClearAllData(); -} - -// calculate bins that overlap region -int BamStandardIndex::BinsFromRegion(const BamRegion& region, - const bool isRightBoundSpecified, - uint16_t bins[MAX_BIN]) -{ - // get region boundaries - uint32_t begin = (unsigned int)region.LeftPosition; - uint32_t end; - - // if right bound specified AND left&right bounds are on same reference - // OK to use right bound position - if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) ) - end = (unsigned int)region.RightPosition; - - // otherwise, use end of left bound reference as cutoff - else - end = (unsigned int)m_references.at(region.LeftRefID).RefLength - 1; - - // initialize list, bin '0' always a valid bin - int i = 0; - bins[i++] = 0; - - // get rest of bins that contain this region - unsigned int k; - for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; } - for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; } - for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; } - for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; } - for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; } - - // return number of bins stored - return i; -} - -// creates index data (in-memory) from current reader data -bool BamStandardIndex::Build(void) { - - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) - return false; - - // move file pointer to beginning of alignments - m_reader->Rewind(); - - // get reference count, reserve index space - const int numReferences = (int)m_references.size(); - m_indexData.clear(); - m_hasFullDataCache = false; - SetReferenceCount(numReferences); - - // sets default constant for bin, ID, offset, coordinate variables - const uint32_t defaultValue = 0xffffffffu; - - // bin data - uint32_t saveBin(defaultValue); - uint32_t lastBin(defaultValue); - - // reference ID data - int32_t saveRefID(defaultValue); - int32_t lastRefID(defaultValue); - - // offset data - uint64_t saveOffset = m_BGZF->Tell(); - uint64_t lastOffset = saveOffset; - - // coordinate data - int32_t lastCoordinate = defaultValue; - - BamAlignment bAlignment; - while ( m_reader->GetNextAlignmentCore(bAlignment) ) { - - // change of chromosome, save ID, reset bin - if ( lastRefID != bAlignment.RefID ) { - lastRefID = bAlignment.RefID; - lastBin = defaultValue; - } - - // if lastCoordinate greater than BAM position - file not sorted properly - else if ( lastCoordinate > bAlignment.Position ) { - fprintf(stderr, "BAM file not properly sorted:\n"); - fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), - lastCoordinate, bAlignment.Position, bAlignment.RefID); - exit(1); - } - - // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) - if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { - - // save linear offset entry (matched to BAM entry refID) - BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID); - if ( indexIter == m_indexData.end() ) return false; // error - ReferenceIndex& refIndex = (*indexIter).second; - LinearOffsetVector& offsets = refIndex.Offsets; - SaveLinearOffset(offsets, bAlignment, lastOffset); - } - - // if current BamAlignment bin != lastBin, "then possibly write the binning index" - if ( bAlignment.Bin != lastBin ) { - - // if not first time through - if ( saveBin != defaultValue ) { - - // save Bam bin entry - BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); - if ( indexIter == m_indexData.end() ) return false; // error - ReferenceIndex& refIndex = (*indexIter).second; - BamBinMap& binMap = refIndex.Bins; - SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); - } - - // update saveOffset - saveOffset = lastOffset; - - // update bin values - saveBin = bAlignment.Bin; - lastBin = bAlignment.Bin; - - // update saveRefID - saveRefID = bAlignment.RefID; - - // if invalid RefID, break out - if ( saveRefID < 0 ) break; - } - - // make sure that current file pointer is beyond lastOffset - if ( m_BGZF->Tell() <= (int64_t)lastOffset ) { - fprintf(stderr, "Error in BGZF offsets.\n"); - exit(1); - } - - // update lastOffset - lastOffset = m_BGZF->Tell(); - - // update lastCoordinate - lastCoordinate = bAlignment.Position; - } - - // save any leftover BAM data (as long as refID is valid) - if ( saveRefID >= 0 ) { - // save Bam bin entry - BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); - if ( indexIter == m_indexData.end() ) return false; // error - ReferenceIndex& refIndex = (*indexIter).second; - BamBinMap& binMap = refIndex.Bins; - SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); - } - - // simplify index by merging chunks - MergeChunks(); - - // iterate through references in index - // sort offsets in linear offset vector - BamStandardIndexData::iterator indexIter = m_indexData.begin(); - BamStandardIndexData::iterator indexEnd = m_indexData.end(); - for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { - - // get reference index data - ReferenceIndex& refIndex = (*indexIter).second; - LinearOffsetVector& offsets = refIndex.Offsets; - - // sort linear offsets - sort(offsets.begin(), offsets.end()); - } - - // rewind file pointer to beginning of alignments, return success/fail - return m_reader->Rewind(); -} - -// check index file magic number, return true if OK -bool BamStandardIndex::CheckMagicNumber(void) { - - // read in magic number - char magic[4]; - size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream); - - // compare to expected value - if ( strncmp(magic, "BAI\1", 4) != 0 ) { - fprintf(stderr, "Problem with index file - invalid format.\n"); - fclose(m_indexStream); - return false; - } - - // return success/failure of load - return (elementsRead == 4); -} - -// clear all current index offset data in memory -void BamStandardIndex::ClearAllData(void) { - BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); - BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - const int& refId = (*indexIter).first; - ClearReferenceOffsets(refId); - } -} - -// clear all index offset data for desired reference -void BamStandardIndex::ClearReferenceOffsets(const int& refId) { - - // look up refId, skip if not found - BamStandardIndexData::iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end() ) return ; - - // clear reference data - ReferenceIndex& refEntry = (*indexIter).second; - refEntry.Bins.clear(); - refEntry.Offsets.clear(); - - // set flag - m_hasFullDataCache = false; -} - -// return file position after header metadata -const off_t BamStandardIndex::DataBeginOffset(void) const { - return m_dataBeginOffset; -} - -// calculates offset(s) for a given region -bool BamStandardIndex::GetOffsets(const BamRegion& region, - const bool isRightBoundSpecified, - vector& offsets, - bool* hasAlignmentsInRegion) -{ - // return false if leftBound refID is not found in index data - if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) - return false; - - // load index data for region if not already cached - if ( !IsDataLoaded(region.LeftRefID) ) { - bool loadedOk = true; - loadedOk &= SkipToReference(region.LeftRefID); - loadedOk &= LoadReference(region.LeftRefID); - if ( !loadedOk ) return false; - } - - // calculate which bins overlap this region - uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); - int numBins = BinsFromRegion(region, isRightBoundSpecified, bins); - - // get bins for this reference - BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end() ) return false; // error - const ReferenceIndex& refIndex = (*indexIter).second; - const BamBinMap& binMap = refIndex.Bins; - - // get minimum offset to consider - const LinearOffsetVector& linearOffsets = refIndex.Offsets; - const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) - ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); - - // store all alignment 'chunk' starts (file offsets) for bins in this region - for ( int i = 0; i < numBins; ++i ) { - - const uint16_t binKey = bins[i]; - map::const_iterator binIter = binMap.find(binKey); - if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { - - // iterate over chunks - const ChunkVector& chunks = (*binIter).second; - std::vector::const_iterator chunksIter = chunks.begin(); - std::vector::const_iterator chunksEnd = chunks.end(); - for ( ; chunksIter != chunksEnd; ++chunksIter) { - - // if valid chunk found, store its file offset - const Chunk& chunk = (*chunksIter); - if ( chunk.Stop > minOffset ) - offsets.push_back( chunk.Start ); - } - } - } - - // clean up memory - free(bins); - - // sort the offsets before returning - sort(offsets.begin(), offsets.end()); - - // set flag & return success - *hasAlignmentsInRegion = (offsets.size() != 0 ); - - // if cache mode set to none, dump the data we just loaded - if (m_cacheMode == BamIndex::NoIndexCaching ) - ClearReferenceOffsets(region.LeftRefID); - - // return succes - return true; -} - -// returns whether reference has alignments or no -bool BamStandardIndex::HasAlignments(const int& refId) const { - BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end() ) return false; // error - const ReferenceIndex& refEntry = (*indexIter).second; - return refEntry.HasAlignments; -} - -// return true if all index data is cached -bool BamStandardIndex::HasFullDataCache(void) const { - return m_hasFullDataCache; -} - -// returns true if index cache has data for desired reference -bool BamStandardIndex::IsDataLoaded(const int& refId) const { - - // look up refId, return false if not found - BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end() ) return false; - - // see if reference has alignments - // if not, it's not a problem to have no offset data - const ReferenceIndex& refEntry = (*indexIter).second; - if ( !refEntry.HasAlignments ) return true; - - // return whether bin map contains data - return ( !refEntry.Bins.empty() ); -} - -// attempts to use index to jump to region; returns success/fail -bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { - - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) - return false; - - // make sure left-bound position is valid - if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) - return false; - - // calculate offsets for this region - // if failed, print message, set flag, and return failure - vector offsets; - if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) { - fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n"); - *hasAlignmentsInRegion = false; - return false; - } - - // iterate through offsets - BamAlignment bAlignment; - bool result = true; - for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { - - // attempt seek & load first available alignment - // set flag to true if data exists - result &= m_BGZF->Seek(*o); - *hasAlignmentsInRegion = m_reader->GetNextAlignmentCore(bAlignment); - - // if this alignment corresponds to desired position - // return success of seeking back to the offset before the 'current offset' (to cover overlaps) - if ( ((bAlignment.RefID == region.LeftRefID) && - ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) || - (bAlignment.RefID > region.LeftRefID) ) - { - if ( o != offsets.begin() ) --o; - return m_BGZF->Seek(*o); - } - } - - // if error in jumping, print message & set flag - if ( !result ) { - fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n"); - *hasAlignmentsInRegion = false; - } - - // return success/failure - return result; -} - -// clears index data from all references except the first -void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) { - BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); - KeepOnlyReferenceOffsets((*indexBegin).first); -} - -// clears index data from all references except the one specified -void BamStandardIndex::KeepOnlyReferenceOffsets(const int& refId) { - BamStandardIndexData::iterator mapIter = m_indexData.begin(); - BamStandardIndexData::iterator mapEnd = m_indexData.end(); - for ( ; mapIter != mapEnd; ++mapIter ) { - const int entryRefId = (*mapIter).first; - if ( entryRefId != refId ) - ClearReferenceOffsets(entryRefId); - } -} - -bool BamStandardIndex::LoadAllReferences(bool saveData) { - - // skip if data already loaded - if ( m_hasFullDataCache ) return true; - - // get number of reference sequences - uint32_t numReferences; - if ( !LoadReferenceCount((int&)numReferences) ) - return false; - - // iterate over reference entries - bool loadedOk = true; - for ( int i = 0; i < (int)numReferences; ++i ) - loadedOk &= LoadReference(i, saveData); - - // set flag - if ( loadedOk && saveData ) - m_hasFullDataCache = true; - - // return success/failure of loading references - return loadedOk; -} - -// load header data from index file, return true if loaded OK -bool BamStandardIndex::LoadHeader(void) { - - bool loadedOk = CheckMagicNumber(); - - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); - - // return success/failure of load - return loadedOk; -} - -// load a single index bin entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamStandardIndex::LoadBin(ReferenceIndex& refEntry, bool saveData) { - - size_t elementsRead = 0; - - // get bin ID - uint32_t binId; - elementsRead += fread(&binId, sizeof(binId), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(binId); - - // load alignment chunks for this bin - ChunkVector chunks; - bool chunksOk = LoadChunks(chunks, saveData); - - // store bin entry - if ( chunksOk && saveData ) - refEntry.Bins.insert(pair(binId, chunks)); - - // return success/failure of load - return ( (elementsRead == 1) && chunksOk ); -} - -bool BamStandardIndex::LoadBins(ReferenceIndex& refEntry, bool saveData) { - - size_t elementsRead = 0; - - // get number of bins - int32_t numBins; - elementsRead += fread(&numBins, sizeof(numBins), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numBins); - - // set flag - refEntry.HasAlignments = ( numBins != 0 ); - - // iterate over bins - bool binsOk = true; - for ( int i = 0; i < numBins; ++i ) - binsOk &= LoadBin(refEntry, saveData); - - // return success/failure of load - return ( (elementsRead == 1) && binsOk ); -} - -// load a single index bin entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamStandardIndex::LoadChunk(ChunkVector& chunks, bool saveData) { - - size_t elementsRead = 0; - - // read in chunk data - uint64_t start; - uint64_t stop; - elementsRead += fread(&start, sizeof(start), 1, m_indexStream); - elementsRead += fread(&stop, sizeof(stop), 1, m_indexStream); - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(start); - SwapEndian_64(stop); - } - - // save data if requested - if ( saveData ) chunks.push_back( Chunk(start, stop) ); - - // return success/failure of load - return ( elementsRead == 2 ); -} - -bool BamStandardIndex::LoadChunks(ChunkVector& chunks, bool saveData) { - - size_t elementsRead = 0; - - // read in number of chunks - uint32_t numChunks; - elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numChunks); - - // initialize space for chunks if we're storing this data - if ( saveData ) chunks.reserve(numChunks); - - // iterate over chunks - bool chunksOk = true; - for ( int i = 0; i < (int)numChunks; ++i ) - chunksOk &= LoadChunk(chunks, saveData); - - // sort chunk vector - sort( chunks.begin(), chunks.end(), ChunkLessThan ); - - // return success/failure of load - return ( (elementsRead == 1) && chunksOk ); -} - -// load a single index linear offset entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamStandardIndex::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) { - - size_t elementsRead = 0; - - // read in number of linear offsets - int32_t numLinearOffsets; - elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); - - // set up destination vector (if we're saving the data) - LinearOffsetVector linearOffsets; - if ( saveData ) linearOffsets.reserve(numLinearOffsets); - - // iterate over linear offsets - uint64_t linearOffset; - for ( int i = 0; i < numLinearOffsets; ++i ) { - elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_64(linearOffset); - if ( saveData ) linearOffsets.push_back(linearOffset); - } - - // sort linear offsets - sort ( linearOffsets.begin(), linearOffsets.end() ); - - // save in reference index entry if desired - if ( saveData ) refEntry.Offsets = linearOffsets; - - // return success/failure of load - return ( elementsRead == (size_t)(numLinearOffsets + 1) ); -} - -bool BamStandardIndex::LoadFirstReference(bool saveData) { - BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); - return LoadReference((*indexBegin).first, saveData); -} - -// load a single reference from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamStandardIndex::LoadReference(const int& refId, bool saveData) { - - // look up refId - BamStandardIndexData::iterator indexIter = m_indexData.find(refId); - - // if reference not previously loaded, create new entry - if ( indexIter == m_indexData.end() ) { - ReferenceIndex newEntry; - newEntry.HasAlignments = false; - m_indexData.insert( pair(refId, newEntry) ); - } - - // load reference data - indexIter = m_indexData.find(refId); - ReferenceIndex& entry = (*indexIter).second; - bool loadedOk = true; - loadedOk &= LoadBins(entry, saveData); - loadedOk &= LoadLinearOffsets(entry, saveData); - return loadedOk; -} - -// loads number of references, return true if loaded OK -bool BamStandardIndex::LoadReferenceCount(int& numReferences) { - - size_t elementsRead = 0; - - // read reference count - elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numReferences); - - // return success/failure of load - return ( elementsRead == 1 ); -} - -// merges 'alignment chunks' in BAM bin (used for index building) -void BamStandardIndex::MergeChunks(void) { - - // iterate over reference enties - BamStandardIndexData::iterator indexIter = m_indexData.begin(); - BamStandardIndexData::iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - - // get BAM bin map for this reference - ReferenceIndex& refIndex = (*indexIter).second; - BamBinMap& bamBinMap = refIndex.Bins; - - // iterate over BAM bins - BamBinMap::iterator binIter = bamBinMap.begin(); - BamBinMap::iterator binEnd = bamBinMap.end(); - for ( ; binIter != binEnd; ++binIter ) { - - // get chunk vector for this bin - ChunkVector& binChunks = (*binIter).second; - if ( binChunks.size() == 0 ) continue; - - ChunkVector mergedChunks; - mergedChunks.push_back( binChunks[0] ); - - // iterate over chunks - int i = 0; - ChunkVector::iterator chunkIter = binChunks.begin(); - ChunkVector::iterator chunkEnd = binChunks.end(); - for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { - - // get 'currentChunk' based on numeric index - Chunk& currentChunk = mergedChunks[i]; - - // get iteratorChunk based on vector iterator - Chunk& iteratorChunk = (*chunkIter); - - // if chunk ends where (iterator) chunk starts, then merge - if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) - currentChunk.Stop = iteratorChunk.Stop; - - // otherwise - else { - // set currentChunk + 1 to iteratorChunk - mergedChunks.push_back(iteratorChunk); - ++i; - } - } - - // saved merged chunk vector - (*binIter).second = mergedChunks; - } - } -} - -// saves BAM bin entry for index -void BamStandardIndex::SaveBinEntry(BamBinMap& binMap, - const uint32_t& saveBin, - const uint64_t& saveOffset, - const uint64_t& lastOffset) -{ - // look up saveBin - BamBinMap::iterator binIter = binMap.find(saveBin); - - // create new chunk - Chunk newChunk(saveOffset, lastOffset); - - // if entry doesn't exist - if ( binIter == binMap.end() ) { - ChunkVector newChunks; - newChunks.push_back(newChunk); - binMap.insert( pair(saveBin, newChunks)); - } - - // otherwise - else { - ChunkVector& binChunks = (*binIter).second; - binChunks.push_back( newChunk ); - } -} - -// saves linear offset entry for index -void BamStandardIndex::SaveLinearOffset(LinearOffsetVector& offsets, - const BamAlignment& bAlignment, - const uint64_t& lastOffset) -{ - // get converted offsets - int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; - int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; - - // resize vector if necessary - int oldSize = offsets.size(); - int newSize = endOffset + 1; - if ( oldSize < newSize ) - offsets.resize(newSize, 0); - - // store offset - for( int i = beginOffset + 1; i <= endOffset; ++i ) { - if ( offsets[i] == 0 ) - offsets[i] = lastOffset; - } -} - -// initializes index data structure to hold @count references -void BamStandardIndex::SetReferenceCount(const int& count) { - for ( int i = 0; i < count; ++i ) - m_indexData[i].HasAlignments = false; -} - -bool BamStandardIndex::SkipToFirstReference(void) { - BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); - return SkipToReference( (*indexBegin).first ); -} - -// position file pointer to desired reference begin, return true if skipped OK -bool BamStandardIndex::SkipToReference(const int& refId) { - - // attempt rewind - if ( !Rewind() ) return false; - - // read in number of references - uint32_t numReferences; - size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(numReferences); - - // iterate over reference entries - bool skippedOk = true; - int currentRefId = 0; - while (currentRefId != refId) { - skippedOk &= LoadReference(currentRefId, false); - ++currentRefId; - } - - // return success - return skippedOk; -} - -// write header to new index file -bool BamStandardIndex::WriteHeader(void) { - - size_t elementsWritten = 0; - - // write magic number - elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_indexStream); - - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); - - // return success/failure of write - return (elementsWritten == 4); -} - -// write index data for all references to new index file -bool BamStandardIndex::WriteAllReferences(void) { - - size_t elementsWritten = 0; - - // write number of reference sequences - int32_t numReferenceSeqs = m_indexData.size(); - if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs); - elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_indexStream); - - // iterate over reference sequences - bool refsOk = true; - BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); - BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++ indexIter ) - refsOk &= WriteReference( (*indexIter).second ); - - // return success/failure of write - return ( (elementsWritten == 1) && refsOk ); -} - -// write index data for bin to new index file -bool BamStandardIndex::WriteBin(const uint32_t& binId, const ChunkVector& chunks) { - - size_t elementsWritten = 0; - - // write BAM bin ID - uint32_t binKey = binId; - if ( m_isBigEndian ) SwapEndian_32(binKey); - elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_indexStream); - - // write chunks - bool chunksOk = WriteChunks(chunks); - - // return success/failure of write - return ( (elementsWritten == 1) && chunksOk ); -} - -// write index data for bins to new index file -bool BamStandardIndex::WriteBins(const BamBinMap& bins) { - - size_t elementsWritten = 0; - - // write number of bins - int32_t binCount = bins.size(); - if ( m_isBigEndian ) SwapEndian_32(binCount); - elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_indexStream); - - // iterate over bins - bool binsOk = true; - BamBinMap::const_iterator binIter = bins.begin(); - BamBinMap::const_iterator binEnd = bins.end(); - for ( ; binIter != binEnd; ++binIter ) - binsOk &= WriteBin( (*binIter).first, (*binIter).second ); - - // return success/failure of write - return ( (elementsWritten == 1) && binsOk ); -} - -// write index data for chunk entry to new index file -bool BamStandardIndex::WriteChunk(const Chunk& chunk) { - - size_t elementsWritten = 0; - - // localize alignment chunk offsets - uint64_t start = chunk.Start; - uint64_t stop = chunk.Stop; - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(start); - SwapEndian_64(stop); - } - - // write to index file - elementsWritten += fwrite(&start, sizeof(start), 1, m_indexStream); - elementsWritten += fwrite(&stop, sizeof(stop), 1, m_indexStream); - - // return success/failure of write - return ( elementsWritten == 2 ); -} - -// write index data for chunk entry to new index file -bool BamStandardIndex::WriteChunks(const ChunkVector& chunks) { - - size_t elementsWritten = 0; - - // write chunks - int32_t chunkCount = chunks.size(); - if ( m_isBigEndian ) SwapEndian_32(chunkCount); - elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_indexStream); - - // iterate over chunks - bool chunksOk = true; - ChunkVector::const_iterator chunkIter = chunks.begin(); - ChunkVector::const_iterator chunkEnd = chunks.end(); - for ( ; chunkIter != chunkEnd; ++chunkIter ) - chunksOk &= WriteChunk( (*chunkIter) ); - - // return success/failure of write - return ( (elementsWritten == 1) && chunksOk ); -} - -// write index data for linear offsets entry to new index file -bool BamStandardIndex::WriteLinearOffsets(const LinearOffsetVector& offsets) { - - size_t elementsWritten = 0; - - // write number of linear offsets - int32_t offsetCount = offsets.size(); - if ( m_isBigEndian ) SwapEndian_32(offsetCount); - elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_indexStream); - - // iterate over linear offsets - LinearOffsetVector::const_iterator offsetIter = offsets.begin(); - LinearOffsetVector::const_iterator offsetEnd = offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - - // write linear offset - uint64_t linearOffset = (*offsetIter); - if ( m_isBigEndian ) SwapEndian_64(linearOffset); - elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_indexStream); - } - - // return success/failure of write - return ( elementsWritten == (size_t)(offsetCount + 1) ); -} - -// write index data for a single reference to new index file -bool BamStandardIndex::WriteReference(const ReferenceIndex& refEntry) { - bool refOk = true; - refOk &= WriteBins(refEntry.Bins); - refOk &= WriteLinearOffsets(refEntry.Offsets); - return refOk; -} diff --git a/src/api/BamStandardIndex.h b/src/api/BamStandardIndex.h deleted file mode 100644 index da179f4..0000000 --- a/src/api/BamStandardIndex.h +++ /dev/null @@ -1,213 +0,0 @@ -// *************************************************************************** -// BamStandardIndex.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index operations for the standardized BAM index format (".bai") -// *************************************************************************** - -#ifndef BAM_STANDARD_INDEX_FORMAT_H -#define BAM_STANDARD_INDEX_FORMAT_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to -// version without notice, or even be removed. -// -// We mean it. - -#include -#include -#include -#include -#include - -namespace BamTools { - -class BamAlignment; - -namespace Internal { - -// BAM index constants -const int MAX_BIN = 37450; // =(8^6-1)/7+1 -const int BAM_LIDX_SHIFT = 14; - -// -------------------------------------------------- -// BamStandardIndex data structures & typedefs -struct Chunk { - - // data members - uint64_t Start; - uint64_t Stop; - - // constructor - Chunk(const uint64_t& start = 0, - const uint64_t& stop = 0) - : Start(start) - , Stop(stop) - { } -}; - -inline -bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) { - return lhs.Start < rhs.Start; -} - -typedef std::vector ChunkVector; -typedef std::map BamBinMap; -typedef std::vector LinearOffsetVector; - -struct ReferenceIndex { - - // data members - BamBinMap Bins; - LinearOffsetVector Offsets; - bool HasAlignments; - - // constructor - ReferenceIndex(const BamBinMap& binMap = BamBinMap(), - const LinearOffsetVector& offsets = LinearOffsetVector(), - const bool hasAlignments = false) - : Bins(binMap) - , Offsets(offsets) - , HasAlignments(hasAlignments) - { } -}; - -typedef std::map BamStandardIndexData; - -class BamStandardIndex : public BamIndex { - - // ctor & dtor - public: - BamStandardIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader); - ~BamStandardIndex(void); - - // interface (implements BamIndex virtual methods) - public: - // creates index data (in-memory) from current reader data - bool Build(void); - // returns supported file extension - const std::string Extension(void) const { return std::string(".bai"); } - // returns whether reference has alignments or no - bool HasAlignments(const int& referenceID) const; - // attempts to use index to jump to region; returns success/fail - // a "successful" jump indicates no error, but not whether this region has data - // * thus, the method sets a flag to indicate whether there are alignments - // available after the jump position - bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); - public: - // clear all current index offset data in memory - void ClearAllData(void); - // return file position after header metadata - const off_t DataBeginOffset(void) const; - // return true if all index data is cached - bool HasFullDataCache(void) const; - // clears index data from all references except the first - void KeepOnlyFirstReferenceOffsets(void); - // load index data for all references, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadAllReferences(bool saveData = true); - // load first reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadFirstReference(bool saveData = true); - // load header data from index file, return true if loaded OK - bool LoadHeader(void); - // position file pointer to first reference begin, return true if skipped OK - bool SkipToFirstReference(void); - // write index reference data - bool WriteAllReferences(void); - // write index header data - bool WriteHeader(void); - - // 'internal' methods - public: - - // ----------------------- - // index file operations - - // check index file magic number, return true if OK - bool CheckMagicNumber(void); - // check index file version, return true if OK - bool CheckVersion(void); - // load a single index bin entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadBin(ReferenceIndex& refEntry, bool saveData = true); - bool LoadBins(ReferenceIndex& refEntry, bool saveData = true); - // load a single index bin entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadChunk(ChunkVector& chunks, bool saveData = true); - bool LoadChunks(ChunkVector& chunks, bool saveData = true); - // load a single index linear offset entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true); - // load a single reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadReference(const int& refId, bool saveData = true); - // loads number of references, return true if loaded OK - bool LoadReferenceCount(int& numReferences); - // position file pointer to desired reference begin, return true if skipped OK - bool SkipToReference(const int& refId); - // write index data for bin to new index file - bool WriteBin(const uint32_t& binId, const ChunkVector& chunks); - // write index data for bins to new index file - bool WriteBins(const BamBinMap& bins); - // write index data for chunk entry to new index file - bool WriteChunk(const Chunk& chunk); - // write index data for chunk entry to new index file - bool WriteChunks(const ChunkVector& chunks); - // write index data for linear offsets entry to new index file - bool WriteLinearOffsets(const LinearOffsetVector& offsets); - // write index data single reference to new index file - bool WriteReference(const ReferenceIndex& refEntry); - - // ----------------------- - // index data operations - - // calculate bins that overlap region - int BinsFromRegion(const BamRegion& region, - const bool isRightBoundSpecified, - uint16_t bins[MAX_BIN]); - // clear all index offset data for desired reference - void ClearReferenceOffsets(const int& refId); - // calculates offset(s) for a given region - bool GetOffsets(const BamRegion& region, - const bool isRightBoundSpecified, - std::vector& offsets, - bool* hasAlignmentsInRegion); - // returns true if index cache has data for desired reference - bool IsDataLoaded(const int& refId) const; - // clears index data from all references except the one specified - void KeepOnlyReferenceOffsets(const int& refId); - // simplifies index by merging 'chunks' - void MergeChunks(void); - // saves BAM bin entry for index - void SaveBinEntry(BamBinMap& binMap, - const uint32_t& saveBin, - const uint64_t& saveOffset, - const uint64_t& lastOffset); - // saves linear offset entry for index - void SaveLinearOffset(LinearOffsetVector& offsets, - const BamAlignment& bAlignment, - const uint64_t& lastOffset); - // initializes index data structure to hold @count references - void SetReferenceCount(const int& count); - - // data members - private: - - BamStandardIndexData m_indexData; - off_t m_dataBeginOffset; - bool m_hasFullDataCache; - bool m_isBigEndian; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAM_STANDARD_INDEX_FORMAT_H diff --git a/src/api/BamToolsIndex.cpp b/src/api/BamToolsIndex.cpp deleted file mode 100644 index 3bb314b..0000000 --- a/src/api/BamToolsIndex.cpp +++ /dev/null @@ -1,577 +0,0 @@ -// *************************************************************************** -// BamToolsIndex.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index operations for the BamTools index format (".bti") -// *************************************************************************** - -#include -#include -#include -#include -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -#include -#include -#include -using namespace std; - -BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader) - : BamIndex(bgzf, reader) - , m_blockSize(1000) - , m_dataBeginOffset(0) - , m_hasFullDataCache(false) - , m_inputVersion(0) - , m_outputVersion(BTI_1_2) // latest version - used for writing new index files -{ - m_isBigEndian = BamTools::SystemIsBigEndian(); -} - -// dtor -BamToolsIndex::~BamToolsIndex(void) { - ClearAllData(); -} - -// creates index data (in-memory) from current reader data -bool BamToolsIndex::Build(void) { - - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) - return false; - - // move file pointer to beginning of alignments - if ( !m_reader->Rewind() ) return false; - - // initialize index data structure with space for all references - const int numReferences = (int)m_references.size(); - m_indexData.clear(); - m_hasFullDataCache = false; - SetReferenceCount(numReferences); - - // set up counters and markers - int32_t currentBlockCount = 0; - int64_t currentAlignmentOffset = m_BGZF->Tell(); - int32_t blockRefId = 0; - int32_t blockMaxEndPosition = 0; - int64_t blockStartOffset = currentAlignmentOffset; - int32_t blockStartPosition = -1; - - // plow through alignments, storing index entries - BamAlignment al; - while ( m_reader->GetNextAlignmentCore(al) ) { - - // if block contains data (not the first time through) AND alignment is on a new reference - if ( currentBlockCount > 0 && al.RefID != blockRefId ) { - - // store previous data - BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); - SaveOffsetEntry(blockRefId, entry); - - // intialize new block for current alignment's reference - currentBlockCount = 0; - blockMaxEndPosition = al.GetEndPosition(); - blockStartOffset = currentAlignmentOffset; - } - - // if beginning of block, save first alignment's refID & position - if ( currentBlockCount == 0 ) { - blockRefId = al.RefID; - blockStartPosition = al.Position; - } - - // increment block counter - ++currentBlockCount; - - // check end position - int32_t alignmentEndPosition = al.GetEndPosition(); - if ( alignmentEndPosition > blockMaxEndPosition ) - blockMaxEndPosition = alignmentEndPosition; - - // if block is full, get offset for next block, reset currentBlockCount - if ( currentBlockCount == m_blockSize ) { - BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); - SaveOffsetEntry(blockRefId, entry); - blockStartOffset = m_BGZF->Tell(); - currentBlockCount = 0; - } - - // not the best name, but for the next iteration, this value will be the offset of the *current* alignment - // necessary because we won't know if this next alignment is on a new reference until we actually read it - currentAlignmentOffset = m_BGZF->Tell(); - } - - // store final block with data - BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); - SaveOffsetEntry(blockRefId, entry); - - // set flag - m_hasFullDataCache = true; - - // return success/failure of rewind - return m_reader->Rewind(); -} - -// check index file magic number, return true if OK -bool BamToolsIndex::CheckMagicNumber(void) { - - // see if index is valid BAM index - char magic[4]; - size_t elementsRead = fread(magic, 1, 4, m_indexStream); - if ( elementsRead != 4 ) return false; - if ( strncmp(magic, "BTI\1", 4) != 0 ) { - fprintf(stderr, "Problem with index file - invalid format.\n"); - return false; - } - - // otherwise ok - return true; -} - -// check index file version, return true if OK -bool BamToolsIndex::CheckVersion(void) { - - // read version from file - size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(m_inputVersion); - - // if version is negative, or zero - if ( m_inputVersion <= 0 ) { - fprintf(stderr, "Problem with index file - invalid version.\n"); - return false; - } - - // if version is newer than can be supported by this version of bamtools - else if ( m_inputVersion > m_outputVersion ) { - fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n"); - fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n"); - return false; - } - - // ------------------------------------------------------------------ - // check for deprecated, unsupported versions - // (typically whose format did not accomodate a particular bug fix) - - else if ( (Version)m_inputVersion == BTI_1_0 ) { - fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n"); - fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); - return false; - } - - else if ( (Version)m_inputVersion == BTI_1_1 ) { - fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n"); - fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); - return false; - } - - // otherwise ok - else return true; -} - -// clear all current index offset data in memory -void BamToolsIndex::ClearAllData(void) { - BamToolsIndexData::const_iterator indexIter = m_indexData.begin(); - BamToolsIndexData::const_iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - const int& refId = (*indexIter).first; - ClearReferenceOffsets(refId); - } -} - -// clear all index offset data for desired reference -void BamToolsIndex::ClearReferenceOffsets(const int& refId) { - if ( m_indexData.find(refId) == m_indexData.end() ) return; - vector& offsets = m_indexData[refId].Offsets; - offsets.clear(); - m_hasFullDataCache = false; -} - -// return file position after header metadata -const off_t BamToolsIndex::DataBeginOffset(void) const { - return m_dataBeginOffset; -} - -// calculate BAM file offset for desired region -// return true if no error (*NOT* equivalent to "has alignments or valid offset") -// check @hasAlignmentsInRegion to determine this status -// @region - target region -// @offset - resulting seek target -// @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status -// N.B. - ignores isRightBoundSpecified -bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { - - // return false if leftBound refID is not found in index data - BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end()) return false; - - // load index data for region if not already cached - if ( !IsDataLoaded(region.LeftRefID) ) { - bool loadedOk = true; - loadedOk &= SkipToReference(region.LeftRefID); - loadedOk &= LoadReference(region.LeftRefID); - if ( !loadedOk ) return false; - } - - // localize index data for this reference (& sanity check that data actually exists) - indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end()) return false; - const vector& referenceOffsets = (*indexIter).second.Offsets; - if ( referenceOffsets.empty() ) return false; - - // ------------------------------------------------------- - // calculate nearest index to jump to - - // save first offset - offset = (*referenceOffsets.begin()).StartOffset; - - // iterate over offsets entries on this reference - vector::const_iterator offsetIter = referenceOffsets.begin(); - vector::const_iterator offsetEnd = referenceOffsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - const BamToolsIndexEntry& entry = (*offsetIter); - // break if alignment 'entry' overlaps region - if ( entry.MaxEndPosition >= region.LeftPosition ) break; - offset = (*offsetIter).StartOffset; - } - - // set flag based on whether an index entry was found for this region - *hasAlignmentsInRegion = ( offsetIter != offsetEnd ); - - // if cache mode set to none, dump the data we just loaded - if (m_cacheMode == BamIndex::NoIndexCaching ) - ClearReferenceOffsets(region.LeftRefID); - - // return success - return true; -} - -// returns whether reference has alignments or no -bool BamToolsIndex::HasAlignments(const int& refId) const { - - BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end()) return false; - const BamToolsReferenceEntry& refEntry = (*indexIter).second; - return refEntry.HasAlignments; -} - -// return true if all index data is cached -bool BamToolsIndex::HasFullDataCache(void) const { - return m_hasFullDataCache; -} - -// returns true if index cache has data for desired reference -bool BamToolsIndex::IsDataLoaded(const int& refId) const { - - BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end()) return false; - const BamToolsReferenceEntry& refEntry = (*indexIter).second; - - if ( !refEntry.HasAlignments ) return true; // no data period - - // return whether offsets list contains data - return !refEntry.Offsets.empty(); -} - -// attempts to use index to jump to region; returns success/fail -bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { - - // clear flag - *hasAlignmentsInRegion = false; - - // check valid BamReader state - if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) { - fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n"); - return false; - } - - // make sure left-bound position is valid - if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) - return false; - - // calculate nearest offset to jump to - int64_t offset; - if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) { - fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n"); - return false; - } - - // return success/failure of seek - return m_BGZF->Seek(offset); -} - -// clears index data from all references except the first -void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) { - BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); - KeepOnlyReferenceOffsets( (*indexBegin).first ); -} - -// clears index data from all references except the one specified -void BamToolsIndex::KeepOnlyReferenceOffsets(const int& refId) { - BamToolsIndexData::iterator mapIter = m_indexData.begin(); - BamToolsIndexData::iterator mapEnd = m_indexData.end(); - for ( ; mapIter != mapEnd; ++mapIter ) { - const int entryRefId = (*mapIter).first; - if ( entryRefId != refId ) - ClearReferenceOffsets(entryRefId); - } -} - -// load index data for all references, return true if loaded OK -bool BamToolsIndex::LoadAllReferences(bool saveData) { - - // skip if data already loaded - if ( m_hasFullDataCache ) return true; - - // read in number of references - int32_t numReferences; - if ( !LoadReferenceCount(numReferences) ) return false; - //SetReferenceCount(numReferences); - - // iterate over reference entries - bool loadedOk = true; - for ( int i = 0; i < numReferences; ++i ) - loadedOk &= LoadReference(i, saveData); - - // set flag - if ( loadedOk && saveData ) - m_hasFullDataCache = true; - - // return success/failure of load - return loadedOk; -} - -// load header data from index file, return true if loaded OK -bool BamToolsIndex::LoadHeader(void) { - - // check magic number - if ( !CheckMagicNumber() ) return false; - - // check BTI version - if ( !CheckVersion() ) return false; - - // read in block size - size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(m_blockSize); - - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); - - // return success/failure of load - return (elementsRead == 1); -} - -// load a single index entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) { - - // read in index entry data members - size_t elementsRead = 0; - BamToolsIndexEntry entry; - elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream); - elementsRead += fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_indexStream); - elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_indexStream); - if ( elementsRead != 3 ) { - cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl; - return false; - } - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_32(entry.MaxEndPosition); - SwapEndian_64(entry.StartOffset); - SwapEndian_32(entry.StartPosition); - } - - // save data - if ( saveData ) - SaveOffsetEntry(refId, entry); - - // return success/failure of load - return true; -} - -// load a single reference from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamToolsIndex::LoadFirstReference(bool saveData) { - BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); - return LoadReference( (*indexBegin).first, saveData ); -} - -// load a single reference from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamToolsIndex::LoadReference(const int& refId, bool saveData) { - - // read in number of offsets for this reference - uint32_t numOffsets; - size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(numOffsets); - - // initialize offsets container for this reference - SetOffsetCount(refId, (int)numOffsets); - - // iterate over offset entries - for ( unsigned int j = 0; j < numOffsets; ++j ) - LoadIndexEntry(refId, saveData); - - // return success/failure of load - return true; -} - -// loads number of references, return true if loaded OK -bool BamToolsIndex::LoadReferenceCount(int& numReferences) { - - size_t elementsRead = 0; - - // read reference count - elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numReferences); - - // return success/failure of load - return ( elementsRead == 1 ); -} - -// saves an index offset entry in memory -void BamToolsIndex::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) { - BamToolsReferenceEntry& refEntry = m_indexData[refId]; - refEntry.HasAlignments = true; - refEntry.Offsets.push_back(entry); -} - -// pre-allocates size for offset vector -void BamToolsIndex::SetOffsetCount(const int& refId, const int& offsetCount) { - BamToolsReferenceEntry& refEntry = m_indexData[refId]; - refEntry.Offsets.reserve(offsetCount); - refEntry.HasAlignments = ( offsetCount > 0); -} - -// initializes index data structure to hold @count references -void BamToolsIndex::SetReferenceCount(const int& count) { - for ( int i = 0; i < count; ++i ) - m_indexData[i].HasAlignments = false; -} - -// position file pointer to first reference begin, return true if skipped OK -bool BamToolsIndex::SkipToFirstReference(void) { - BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); - return SkipToReference( (*indexBegin).first ); -} - -// position file pointer to desired reference begin, return true if skipped OK -bool BamToolsIndex::SkipToReference(const int& refId) { - - // attempt rewind - if ( !Rewind() ) return false; - - // read in number of references - int32_t numReferences; - size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(numReferences); - - // iterate over reference entries - bool skippedOk = true; - int currentRefId = 0; - while (currentRefId != refId) { - skippedOk &= LoadReference(currentRefId, false); - ++currentRefId; - } - - // return success/failure of skip - return skippedOk; -} - -// write header to new index file -bool BamToolsIndex::WriteHeader(void) { - - size_t elementsWritten = 0; - - // write BTI index format 'magic number' - elementsWritten += fwrite("BTI\1", 1, 4, m_indexStream); - - // write BTI index format version - int32_t currentVersion = (int32_t)m_outputVersion; - if ( m_isBigEndian ) SwapEndian_32(currentVersion); - elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_indexStream); - - // write block size - int32_t blockSize = m_blockSize; - if ( m_isBigEndian ) SwapEndian_32(blockSize); - elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream); - - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); - - // return success/failure of write - return ( elementsWritten == 6 ); -} - -// write index data for all references to new index file -bool BamToolsIndex::WriteAllReferences(void) { - - size_t elementsWritten = 0; - - // write number of references - int32_t numReferences = (int32_t)m_indexData.size(); - if ( m_isBigEndian ) SwapEndian_32(numReferences); - elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream); - - // iterate through references in index - bool refOk = true; - BamToolsIndexData::const_iterator refIter = m_indexData.begin(); - BamToolsIndexData::const_iterator refEnd = m_indexData.end(); - for ( ; refIter != refEnd; ++refIter ) - refOk &= WriteReferenceEntry( (*refIter).second ); - - return ( (elementsWritten == 1) && refOk ); -} - -// write current reference index data to new index file -bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) { - - size_t elementsWritten = 0; - - // write number of offsets listed for this reference - uint32_t numOffsets = refEntry.Offsets.size(); - if ( m_isBigEndian ) SwapEndian_32(numOffsets); - elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_indexStream); - - // iterate over offset entries - bool entriesOk = true; - vector::const_iterator offsetIter = refEntry.Offsets.begin(); - vector::const_iterator offsetEnd = refEntry.Offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) - entriesOk &= WriteIndexEntry( (*offsetIter) ); - - return ( (elementsWritten == 1) && entriesOk ); -} - -// write current index offset entry to new index file -bool BamToolsIndex::WriteIndexEntry(const BamToolsIndexEntry& entry) { - - // copy entry data - int32_t maxEndPosition = entry.MaxEndPosition; - int64_t startOffset = entry.StartOffset; - int32_t startPosition = entry.StartPosition; - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_32(maxEndPosition); - SwapEndian_64(startOffset); - SwapEndian_32(startPosition); - } - - // write the reference index entry - size_t elementsWritten = 0; - elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream); - elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream); - elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream); - return ( elementsWritten == 3 ); -} diff --git a/src/api/BamToolsIndex.h b/src/api/BamToolsIndex.h deleted file mode 100644 index c99834d..0000000 --- a/src/api/BamToolsIndex.h +++ /dev/null @@ -1,192 +0,0 @@ -// *************************************************************************** -// BamToolsIndex.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index operations for the BamTools index format (".bti") -// *************************************************************************** - -#ifndef BAMTOOLS_INDEX_FORMAT_H -#define BAMTOOLS_INDEX_FORMAT_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to -// version without notice, or even be removed. -// -// We mean it. - -#include -#include -#include -#include -#include - -namespace BamTools { - -namespace Internal { - -// individual index offset entry -struct BamToolsIndexEntry { - - // data members - int32_t MaxEndPosition; - int64_t StartOffset; - int32_t StartPosition; - - // ctor - BamToolsIndexEntry(const int32_t& maxEndPosition = 0, - const int64_t& startOffset = 0, - const int32_t& startPosition = 0) - : MaxEndPosition(maxEndPosition) - , StartOffset(startOffset) - , StartPosition(startPosition) - { } -}; - -// reference index entry -struct BamToolsReferenceEntry { - - // data members - bool HasAlignments; - std::vector Offsets; - - // ctor - BamToolsReferenceEntry(void) - : HasAlignments(false) - { } -}; - -// the actual index data structure -typedef std::map BamToolsIndexData; - -class BamToolsIndex : public BamIndex { - - // keep a list of any supported versions here - // (might be useful later to handle any 'legacy' versions if the format changes) - // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on - // - // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by: - // - // if ( indexVersion >= BTI_1_2 ) - // do something new - // else - // do the old thing - enum Version { BTI_1_0 = 1 - , BTI_1_1 - , BTI_1_2 - }; - - - // ctor & dtor - public: - BamToolsIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader); - ~BamToolsIndex(void); - - // interface (implements BamIndex virtual methods) - public: - // creates index data (in-memory) from current reader data - bool Build(void); - // returns supported file extension - const std::string Extension(void) const { return std::string(".bti"); } - // returns whether reference has alignments or no - bool HasAlignments(const int& referenceID) const; - // attempts to use index to jump to region; returns success/fail - // a "successful" jump indicates no error, but not whether this region has data - // * thus, the method sets a flag to indicate whether there are alignments - // available after the jump position - bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); - public: - // clear all current index offset data in memory - void ClearAllData(void); - // return file position after header metadata - const off_t DataBeginOffset(void) const; - // return true if all index data is cached - bool HasFullDataCache(void) const; - // clears index data from all references except the first - void KeepOnlyFirstReferenceOffsets(void); - // load index data for all references, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadAllReferences(bool saveData = true); - // load first reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadFirstReference(bool saveData = true); - // load header data from index file, return true if loaded OK - bool LoadHeader(void); - // position file pointer to first reference begin, return true if skipped OK - bool SkipToFirstReference(void); - // write index reference data - bool WriteAllReferences(void); - // write index header data - bool WriteHeader(void); - - // 'internal' methods - public: - - // ----------------------- - // index file operations - - // check index file magic number, return true if OK - bool CheckMagicNumber(void); - // check index file version, return true if OK - bool CheckVersion(void); - // return true if FILE* is open - bool IsOpen(void) const; - // load a single index entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadIndexEntry(const int& refId, bool saveData = true); - // load a single reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadReference(const int& refId, bool saveData = true); - // loads number of references, return true if loaded OK - bool LoadReferenceCount(int& numReferences); - // position file pointer to desired reference begin, return true if skipped OK - bool SkipToReference(const int& refId); - // write current reference index data to new index file - bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry); - // write current index offset entry to new index file - bool WriteIndexEntry(const BamToolsIndexEntry& entry); - - // ----------------------- - // index data operations - - // clear all index offset data for desired reference - void ClearReferenceOffsets(const int& refId); - // calculate BAM file offset for desired region - // return true if no error (*NOT* equivalent to "has alignments or valid offset") - // check @hasAlignmentsInRegion to determine this status - // @region - target region - // @offset - resulting seek target - // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status - bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); - // returns true if index cache has data for desired reference - bool IsDataLoaded(const int& refId) const; - // clears index data from all references except the one specified - void KeepOnlyReferenceOffsets(const int& refId); - // saves an index offset entry in memory - void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry); - // pre-allocates size for offset vector - void SetOffsetCount(const int& refId, const int& offsetCount); - // initializes index data structure to hold @count references - void SetReferenceCount(const int& count); - - // data members - private: - int32_t m_blockSize; - BamToolsIndexData m_indexData; - off_t m_dataBeginOffset; - bool m_hasFullDataCache; - bool m_isBigEndian; - int32_t m_inputVersion; // Version is serialized as int - Version m_outputVersion; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMTOOLS_INDEX_FORMAT_H diff --git a/src/api/BamWriter.cpp b/src/api/BamWriter.cpp index 2af45cb..08ec3fe 100644 --- a/src/api/BamWriter.cpp +++ b/src/api/BamWriter.cpp @@ -3,13 +3,13 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 22 November 2010 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** #include -#include +#include using namespace BamTools; using namespace BamTools::Internal; diff --git a/src/api/BamWriter_p.cpp b/src/api/BamWriter_p.cpp deleted file mode 100644 index 9887d56..0000000 --- a/src/api/BamWriter_p.cpp +++ /dev/null @@ -1,379 +0,0 @@ -// *************************************************************************** -// BamWriter_p.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#include -#include -using namespace BamTools; -using namespace BamTools::Internal; -using namespace std; - -BamWriterPrivate::BamWriterPrivate(void) { - IsBigEndian = SystemIsBigEndian(); -} - -BamWriterPrivate::~BamWriterPrivate(void) { - mBGZF.Close(); -} - -// closes the alignment archive -void BamWriterPrivate::Close(void) { - mBGZF.Close(); -} - -// calculates minimum bin for a BAM alignment interval -const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { - --end; - if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); - if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); - if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); - if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); - if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); - return 0; -} - -// creates a cigar string from the supplied alignment -void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { - - // initialize - const unsigned int numCigarOperations = cigarOperations.size(); - packedCigar.resize(numCigarOperations * BT_SIZEOF_INT); - - // pack the cigar data into the string - unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); - - unsigned int cigarOp; - vector::const_iterator coIter; - for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) { - - switch(coIter->Type) { - case 'M': - cigarOp = BAM_CMATCH; - break; - case 'I': - cigarOp = BAM_CINS; - break; - case 'D': - cigarOp = BAM_CDEL; - break; - case 'N': - cigarOp = BAM_CREF_SKIP; - break; - case 'S': - cigarOp = BAM_CSOFT_CLIP; - break; - case 'H': - cigarOp = BAM_CHARD_CLIP; - break; - case 'P': - cigarOp = BAM_CPAD; - break; - default: - fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type); - exit(1); - } - - *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; - pPackedCigar++; - } -} - -// encodes the supplied query sequence into 4-bit notation -void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { - - // prepare the encoded query string - const unsigned int queryLen = query.size(); - const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); - encodedQuery.resize(encodedQueryLen); - char* pEncodedQuery = (char*)encodedQuery.data(); - const char* pQuery = (const char*)query.data(); - - unsigned char nucleotideCode; - bool useHighWord = true; - - while(*pQuery) { - - switch(*pQuery) { - - case '=': - nucleotideCode = 0; - break; - - case 'A': - nucleotideCode = 1; - break; - - case 'C': - nucleotideCode = 2; - break; - - case 'G': - nucleotideCode = 4; - break; - - case 'T': - nucleotideCode = 8; - break; - - case 'N': - nucleotideCode = 15; - break; - - default: - fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); - exit(1); - } - - // pack the nucleotide code - if(useHighWord) { - *pEncodedQuery = nucleotideCode << 4; - useHighWord = false; - } else { - *pEncodedQuery |= nucleotideCode; - pEncodedQuery++; - useHighWord = true; - } - - // increment the query position - pQuery++; - } -} - -// opens the alignment archive -bool BamWriterPrivate::Open(const string& filename, - const string& samHeader, - const RefVector& referenceSequences, - bool isWriteUncompressed) -{ - // open the BGZF file for writing, return failure if error - if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) ) - return false; - - // ================ - // write the header - // ================ - - // write the BAM signature - const unsigned char SIGNATURE_LENGTH = 4; - const char* BAM_SIGNATURE = "BAM\1"; - mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); - - // write the SAM header text length - uint32_t samHeaderLen = samHeader.size(); - if (IsBigEndian) SwapEndian_32(samHeaderLen); - mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); - - // write the SAM header text - if(samHeaderLen > 0) - mBGZF.Write(samHeader.data(), samHeaderLen); - - // write the number of reference sequences - uint32_t numReferenceSequences = referenceSequences.size(); - if (IsBigEndian) SwapEndian_32(numReferenceSequences); - mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); - - // ============================= - // write the sequence dictionary - // ============================= - - RefVector::const_iterator rsIter; - for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { - - // write the reference sequence name length - uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; - if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen); - mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); - - // write the reference sequence name - mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); - - // write the reference sequence length - int32_t referenceLength = rsIter->RefLength; - if (IsBigEndian) SwapEndian_32(referenceLength); - mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); - } - - // return success - return true; -} - -// saves the alignment to the alignment archive -void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { - - // if BamAlignment contains only the core data and a raw char data buffer - // (as a result of BamReader::GetNextAlignmentCore()) - if ( al.SupportData.HasCoreOnly ) { - - // write the block size - unsigned int blockSize = al.SupportData.BlockLength; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; - buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; - buffer[4] = al.SupportData.QuerySequenceLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); - } - - // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - - // write the raw char data - mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); - } - - // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc - // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) - else { - - // calculate char lengths - const unsigned int nameLength = al.Name.size() + 1; - const unsigned int numCigarOperations = al.CigarData.size(); - const unsigned int queryLength = al.QueryBases.size(); - const unsigned int tagDataLength = al.TagData.size(); - - // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) - // force calculation of Bin before storing - const int endPosition = al.GetEndPosition(); - const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); - - // create our packed cigar string - string packedCigar; - CreatePackedCigar(al.CigarData, packedCigar); - const unsigned int packedCigarLength = packedCigar.size(); - - // encode the query - string encodedQuery; - EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLength = encodedQuery.size(); - - // write the block size - const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength; - unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; - buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; - buffer[4] = queryLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); - } - - // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - - // write the query name - mBGZF.Write(al.Name.c_str(), nameLength); - - // write the packed cigar - if ( IsBigEndian ) { - - char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); - memcpy(cigarData, packedCigar.data(), packedCigarLength); - - for (unsigned int i = 0; i < packedCigarLength; ++i) { - if ( IsBigEndian ) - SwapEndian_32p(&cigarData[i]); - } - - mBGZF.Write(cigarData, packedCigarLength); - free(cigarData); - } - else - mBGZF.Write(packedCigar.data(), packedCigarLength); - - // write the encoded query sequence - mBGZF.Write(encodedQuery.data(), encodedQueryLength); - - // write the base qualities - string baseQualities(al.Qualities); - char* pBaseQualities = (char*)al.Qualities.data(); - for(unsigned int i = 0; i < queryLength; i++) { - pBaseQualities[i] -= 33; - } - mBGZF.Write(pBaseQualities, queryLength); - - // write the read group tag - if ( IsBigEndian ) { - - char* tagData = (char*)calloc(sizeof(char), tagDataLength); - memcpy(tagData, al.TagData.data(), tagDataLength); - - int i = 0; - while ( (unsigned int)i < tagDataLength ) { - - i += 2; // skip tag type (e.g. "RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning - ++i; // skip value type - - switch (type) { - - case('A') : - case('C') : - ++i; - break; - - case('S') : - SwapEndian_16p(&tagData[i]); - i+=2; // sizeof(uint16_t) - break; - - case('F') : - case('I') : - SwapEndian_32p(&tagData[i]); - i+=4; // sizeof(uint32_t) - break; - - case('D') : - SwapEndian_64p(&tagData[i]); - i+=8; // sizeof(uint64_t) - break; - - case('H') : - case('Z') : - while (tagData[i]) { ++i; } - ++i; // increment one more for null terminator - break; - - default : - fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here - free(tagData); - exit(1); - } - } - - mBGZF.Write(tagData, tagDataLength); - free(tagData); - } - else - mBGZF.Write(al.TagData.data(), tagDataLength); - } -} diff --git a/src/api/BamWriter_p.h b/src/api/BamWriter_p.h deleted file mode 100644 index 4fe626a..0000000 --- a/src/api/BamWriter_p.h +++ /dev/null @@ -1,63 +0,0 @@ -// *************************************************************************** -// BamWriter_p.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#ifndef BAMWRITER_P_H -#define BAMWRITER_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to -// version without notice, or even be removed. -// -// We mean it. - -#include -#include -#include -#include - -namespace BamTools { -namespace Internal { - -class BamWriterPrivate { - - // ctor & dtor - public: - BamWriterPrivate(void); - ~BamWriterPrivate(void); - - // "public" interface to BamWriter - public: - void Close(void); - bool Open(const std::string& filename, - const std::string& samHeader, - const BamTools::RefVector& referenceSequences, - bool isWriteUncompressed); - void SaveAlignment(const BamAlignment& al); - - // internal methods - public: - const unsigned int CalculateMinimumBin(const int begin, int end) const; - void CreatePackedCigar(const std::vector& cigarOperations, std::string& packedCigar); - void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); - - // data members - public: - BgzfData mBGZF; - bool IsBigEndian; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMWRITER_P_H diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index 7c1b8ff..fb2d485 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -17,12 +17,12 @@ add_library ( BamTools SHARED BamIndex.cpp BamMultiReader.cpp BamReader.cpp - BamReader_p.cpp - BamStandardIndex.cpp - BamToolsIndex.cpp BamWriter.cpp - BamWriter_p.cpp - BGZF.cpp + BGZF.cpp + internal/BamReader_p.cpp + internal/BamStandardIndex_p.cpp + internal/BamToolsIndex_p.cpp + internal/BamWriter_p.cpp ) # link BamTools library with zlib automatically @@ -33,3 +33,4 @@ set_target_properties( BamTools PROPERTIES SOVERSION 0.9.0 OUTPUT_NAME bamtools ) + diff --git a/src/api/internal/BamReader_p.cpp b/src/api/internal/BamReader_p.cpp new file mode 100644 index 0000000..bffbfba --- /dev/null +++ b/src/api/internal/BamReader_p.cpp @@ -0,0 +1,719 @@ +// *************************************************************************** +// BamReader_p.cpp (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** + +#include +#include +#include +#include +#include +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +#include +using namespace std; + +// constructor +BamReaderPrivate::BamReaderPrivate(BamReader* parent) + : HeaderText("") + , Index(0) + , HasIndex(false) + , AlignmentsBeginOffset(0) +// , m_header(0) + , IndexCacheMode(BamIndex::LimitedIndexCaching) + , HasAlignmentsInRegion(true) + , Parent(parent) + , DNA_LOOKUP("=ACMGRSVTWYHKDBN") + , CIGAR_LOOKUP("MIDNSHP") +{ + IsBigEndian = SystemIsBigEndian(); +} + +// destructor +BamReaderPrivate::~BamReaderPrivate(void) { + Close(); +} + +// adjusts requested region if necessary (depending on where data actually begins) +void BamReaderPrivate::AdjustRegion(BamRegion& region) { + + // check for valid index first + if ( Index == 0 ) return; + + // see if any references in region have alignments + HasAlignmentsInRegion = false; + int currentId = region.LeftRefID; + while ( currentId <= region.RightRefID ) { + HasAlignmentsInRegion = Index->HasAlignments(currentId); + if ( HasAlignmentsInRegion ) break; + ++currentId; + } + + // if no data found on any reference in region + if ( !HasAlignmentsInRegion ) return; + + // if left bound of desired region had no data, use first reference that had data + // otherwise, leave requested region as-is + if ( currentId != region.LeftRefID ) { + region.LeftRefID = currentId; + region.LeftPosition = 0; + } +} + +// fills out character data for BamAlignment data +bool BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { + + // calculate character lengths/offsets + const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; + const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4); + const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; + const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; + const unsigned int tagDataLength = dataLength - tagDataOffset; + + // set up char buffers + const char* allCharData = bAlignment.SupportData.AllCharData.data(); + const char* seqData = ((const char*)allCharData) + seqDataOffset; + const char* qualData = ((const char*)allCharData) + qualDataOffset; + char* tagData = ((char*)allCharData) + tagDataOffset; + + // store alignment name (relies on null char in name as terminator) + bAlignment.Name.assign((const char*)(allCharData)); + + // save query sequence + bAlignment.QueryBases.clear(); + bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength); + for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { + char singleBase = DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; + bAlignment.QueryBases.append(1, singleBase); + } + + // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character + bAlignment.Qualities.clear(); + bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength); + for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { + char singleQuality = (char)(qualData[i]+33); + bAlignment.Qualities.append(1, singleQuality); + } + + // if QueryBases is empty (and this is a allowed case) + if ( bAlignment.QueryBases.empty() ) + bAlignment.AlignedBases = bAlignment.QueryBases; + + // if QueryBases contains data, then build AlignedBases using CIGAR data + else { + + // resize AlignedBases + bAlignment.AlignedBases.clear(); + bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength); + + // iterate over CigarOps + int k = 0; + vector::const_iterator cigarIter = bAlignment.CigarData.begin(); + vector::const_iterator cigarEnd = bAlignment.CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter ) { + + const CigarOp& op = (*cigarIter); + switch(op.Type) { + + case ('M') : + case ('I') : + bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases + // fall through + + case ('S') : + k += op.Length; // for 'S' - soft clip, skip over query bases + break; + + case ('D') : + bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character + break; + + case ('P') : + bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character + break; + + case ('N') : + bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence + break; + + case ('H') : + break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op + + default: + fprintf(stderr, "ERROR: Invalid Cigar op type\n"); // shouldn't get here + exit(1); + } + } + } + + // ----------------------- + // Added: 3-25-2010 DB + // Fixed: endian-correctness for tag data + // ----------------------- + if ( IsBigEndian ) { + int i = 0; + while ( (unsigned int)i < tagDataLength ) { + + i += 2; // skip tag type (e.g. "RG", "NM", etc) + uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning + ++i; // skip value type + + switch (type) { + + case('A') : + case('C') : + ++i; + break; + + case('S') : + SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + + case('F') : + case('I') : + SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + + case('D') : + SwapEndian_64p(&tagData[i]); + i += sizeof(uint64_t); + break; + + case('H') : + case('Z') : + while (tagData[i]) { ++i; } + ++i; // increment one more for null terminator + break; + + default : + fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here + exit(1); + } + } + } + + // store TagData + bAlignment.TagData.clear(); + bAlignment.TagData.resize(tagDataLength); + memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength); + + // clear the core-only flag + bAlignment.SupportData.HasCoreOnly = false; + + // return success + return true; +} + +// clear index data structure +void BamReaderPrivate::ClearIndex(void) { + delete Index; + Index = 0; + HasIndex = false; +} + +// closes the BAM file +void BamReaderPrivate::Close(void) { + + // close BGZF file stream + mBGZF.Close(); + + // clear out index data + ClearIndex(); + + // clear out header data + HeaderText.clear(); +// if ( m_header ) { +// delete m_header; +// m_header = 0; +// } + + // clear out region flags + Region.clear(); +} + +// creates index for BAM file, saves to file +// default behavior is to create the BAM standard index (".bai") +// set flag to false to create the BamTools-specific index (".bti") +bool BamReaderPrivate::CreateIndex(bool useStandardIndex) { + + // clear out prior index data + ClearIndex(); + + // create index based on type requested + if ( useStandardIndex ) + Index = new BamStandardIndex(&mBGZF, Parent); + else + Index = new BamToolsIndex(&mBGZF, Parent); + + // set index cache mode to full for writing + Index->SetCacheMode(BamIndex::FullIndexCaching); + + // build new index + bool ok = true; + ok &= Index->Build(); + HasIndex = ok; + + // mark empty references + MarkReferences(); + + // attempt to save index data to file + ok &= Index->Write(Filename); + + // set client's desired index cache mode + Index->SetCacheMode(IndexCacheMode); + + // return success/fail of both building & writing index + return ok; +} + +const string BamReaderPrivate::GetHeaderText(void) const { + + return HeaderText; + +// if ( m_header ) +// return m_header->Text(); +// else +// return string(""); +} + +// get next alignment (from specified region, if given) +bool BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { + + // if valid alignment found, attempt to parse char data, and return success/failure + if ( GetNextAlignmentCore(bAlignment) ) + return BuildCharData(bAlignment); + + // no valid alignment found + else return false; +} + +// retrieves next available alignment core data (returns success/fail) +// ** DOES NOT parse any character data (read name, bases, qualities, tag data) +// these can be accessed, if necessary, from the supportData +// useful for operations requiring ONLY positional or other alignment-related information +bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) { + + // if region is set but has no alignments + if ( !Region.isNull() && !HasAlignmentsInRegion ) + return false; + + // if valid alignment available + if ( LoadNextAlignment(bAlignment) ) { + + // set core-only flag + bAlignment.SupportData.HasCoreOnly = true; + + // if region not specified with at least a left boundary, return success + if ( !Region.isLeftBoundSpecified() ) return true; + + // determine region state (before, within, after) + BamReaderPrivate::RegionState state = IsOverlap(bAlignment); + + // if alignment lies after region, return false + if ( state == AFTER_REGION ) return false; + + while ( state != WITHIN_REGION ) { + // if no valid alignment available (likely EOF) return failure + if ( !LoadNextAlignment(bAlignment) ) return false; + // if alignment lies after region, return false (no available read within region) + state = IsOverlap(bAlignment); + if ( state == AFTER_REGION ) return false; + } + + // return success (alignment found that overlaps region) + return true; + } + + // no valid alignment + else return false; +} + +// returns RefID for given RefName (returns References.size() if not found) +int BamReaderPrivate::GetReferenceID(const string& refName) const { + + // retrieve names from reference data + vector refNames; + RefVector::const_iterator refIter = References.begin(); + RefVector::const_iterator refEnd = References.end(); + for ( ; refIter != refEnd; ++refIter) + refNames.push_back( (*refIter).RefName ); + + // return 'index-of' refName ( if not found, returns refNames.size() ) + return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); +} + +// returns region state - whether alignment ends before, overlaps, or starts after currently specified region +// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true +BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { + + // if alignment is on any reference sequence before left bound + if ( bAlignment.RefID < Region.LeftRefID ) return BEFORE_REGION; + + // if alignment starts on left bound reference + else if ( bAlignment.RefID == Region.LeftRefID ) { + + // if alignment starts at or after left boundary + if ( bAlignment.Position >= Region.LeftPosition) { + + // if right boundary is specified AND + // left/right boundaries are on same reference AND + // alignment starts past right boundary + if ( Region.isRightBoundSpecified() && + Region.LeftRefID == Region.RightRefID && + bAlignment.Position > Region.RightPosition ) + return AFTER_REGION; + + // otherwise, alignment is within region + return WITHIN_REGION; + } + + // alignment starts before left boundary + else { + // check if alignment overlaps left boundary + if ( bAlignment.GetEndPosition() >= Region.LeftPosition ) return WITHIN_REGION; + else return BEFORE_REGION; + } + } + + // alignment starts on a reference after the left bound + else { + + // if region has a right boundary + if ( Region.isRightBoundSpecified() ) { + + // alignment is on reference between boundaries + if ( bAlignment.RefID < Region.RightRefID ) return WITHIN_REGION; + + // alignment is on reference after right boundary + else if ( bAlignment.RefID > Region.RightRefID ) return AFTER_REGION; + + // alignment is on right bound reference + else { + // check if alignment starts before or at right boundary + if ( bAlignment.Position <= Region.RightPosition ) return WITHIN_REGION; + else return AFTER_REGION; + } + } + + // otherwise, alignment is after left bound reference, but there is no right boundary + else return WITHIN_REGION; + } +} + +// load BAM header data +void BamReaderPrivate::LoadHeaderData(void) { + +// m_header = new BamHeader(&mBGZF); +// bool headerLoadedOk = m_header->Load(); +// if ( !headerLoadedOk ) +// cerr << "BamReader could not load header" << endl; + + // check to see if proper BAM header + char buffer[4]; + if (mBGZF.Read(buffer, 4) != 4) { + fprintf(stderr, "Could not read header type\n"); + exit(1); + } + + if (strncmp(buffer, "BAM\001", 4)) { + fprintf(stderr, "wrong header type!\n"); + exit(1); + } + + // get BAM header text length + mBGZF.Read(buffer, 4); + unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) SwapEndian_32(headerTextLength); + + // get BAM header text + char* headerText = (char*)calloc(headerTextLength + 1, 1); + mBGZF.Read(headerText, headerTextLength); + HeaderText = (string)((const char*)headerText); + + // clean up calloc-ed temp variable + free(headerText); +} + +// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail +bool BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) { + + // clear out any existing index data + ClearIndex(); + + // if no index filename provided, so we need to look for available index files + if ( IndexFilename.empty() ) { + + // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag + const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS); + Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, type); + + // if null, return failure + if ( Index == 0 ) return false; + + // generate proper IndexFilename based on type of index created + IndexFilename = Filename + Index->Extension(); + } + + else { + + // attempt to load BamIndex based on IndexFilename provided by client + Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent); + + // if null, return failure + if ( Index == 0 ) return false; + } + + // set cache mode for BamIndex + Index->SetCacheMode(IndexCacheMode); + + // loading the index data from file + HasIndex = Index->Load(IndexFilename); + + // mark empty references + MarkReferences(); + + // return index status + return HasIndex; +} + +// populates BamAlignment with alignment data under file pointer, returns success/fail +bool BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { + + // read in the 'block length' value, make sure it's not zero + char buffer[4]; + mBGZF.Read(buffer, 4); + bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); } + if ( bAlignment.SupportData.BlockLength == 0 ) return false; + + // read in core alignment data, make sure the right size of data was read + char x[BAM_CORE_SIZE]; + if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false; + + if ( IsBigEndian ) { + for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) + SwapEndian_32p(&x[i]); + } + + // set BamAlignment 'core' and 'support' data + bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); + bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]); + + unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]); + bAlignment.Bin = tempValue >> 16; + bAlignment.MapQuality = tempValue >> 8 & 0xff; + bAlignment.SupportData.QueryNameLength = tempValue & 0xff; + + tempValue = BgzfData::UnpackUnsignedInt(&x[12]); + bAlignment.AlignmentFlag = tempValue >> 16; + bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff; + + bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]); + bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]); + bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); + bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); + + // set BamAlignment length + bAlignment.Length = bAlignment.SupportData.QuerySequenceLength; + + // read in character data - make sure proper data size was read + bool readCharDataOK = false; + const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; + char* allCharData = (char*)calloc(sizeof(char), dataLength); + + if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { + + // store 'allCharData' in supportData structure + bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); + + // set success flag + readCharDataOK = true; + + // save CIGAR ops + // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly, + // even when GetNextAlignmentCore() is called + const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength; + uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); + CigarOp op; + bAlignment.CigarData.clear(); + bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations); + for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) { + + // swap if necessary + if ( IsBigEndian ) SwapEndian_32(cigarData[i]); + + // build CigarOp structure + op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); + op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; + + // save CigarOp + bAlignment.CigarData.push_back(op); + } + } + + free(allCharData); + return readCharDataOK; +} + +// loads reference data from BAM file +void BamReaderPrivate::LoadReferenceData(void) { + + // get number of reference sequences + char buffer[4]; + mBGZF.Read(buffer, 4); + unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) SwapEndian_32(numberRefSeqs); + if ( numberRefSeqs == 0 ) return; + References.reserve((int)numberRefSeqs); + + // iterate over all references in header + for (unsigned int i = 0; i != numberRefSeqs; ++i) { + + // get length of reference name + mBGZF.Read(buffer, 4); + unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) SwapEndian_32(refNameLength); + char* refName = (char*)calloc(refNameLength, 1); + + // get reference name and reference sequence length + mBGZF.Read(refName, refNameLength); + mBGZF.Read(buffer, 4); + int refLength = BgzfData::UnpackSignedInt(buffer); + if ( IsBigEndian ) SwapEndian_32(refLength); + + // store data for reference + RefData aReference; + aReference.RefName = (string)((const char*)refName); + aReference.RefLength = refLength; + References.push_back(aReference); + + // clean up calloc-ed temp variable + free(refName); + } +} + +// mark references with no alignment data +void BamReaderPrivate::MarkReferences(void) { + + // ensure index is available + if ( !HasIndex ) return; + + // mark empty references + for ( int i = 0; i < (int)References.size(); ++i ) + References.at(i).RefHasAlignments = Index->HasAlignments(i); +} + +// opens BAM file (and index) +bool BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) { + + // store filenames + Filename = filename; + IndexFilename = indexFilename; + + // open the BGZF file for reading, return false on failure + if ( !mBGZF.Open(filename, "rb") ) return false; + + // retrieve header text & reference data + LoadHeaderData(); + LoadReferenceData(); + + // store file offset of first alignment + AlignmentsBeginOffset = mBGZF.Tell(); + + // if no index filename provided + if ( IndexFilename.empty() ) { + + // client did not specify that index SHOULD be found + // useful for cases where sequential access is all that is required + if ( !lookForIndex ) return true; + + // otherwise, look for index file, return success/fail + return LoadIndex(lookForIndex, preferStandardIndex) ; + } + + // client supplied an index filename + // attempt to load index data, return success/fail + return LoadIndex(lookForIndex, preferStandardIndex); +} + +// returns BAM file pointer to beginning of alignment data +bool BamReaderPrivate::Rewind(void) { + + // rewind to first alignment, return false if unable to seek + if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false; + + // retrieve first alignment data, return false if unable to read + BamAlignment al; + if ( !LoadNextAlignment(al) ) return false; + + // reset default region info using first alignment in file + Region.clear(); + HasAlignmentsInRegion = true; + + // rewind back to beginning of first alignment + // return success/fail of seek + return mBGZF.Seek(AlignmentsBeginOffset); +} + +// change the index caching behavior +void BamReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { + IndexCacheMode = mode; + if ( Index == 0 ) return; + Index->SetCacheMode(mode); +} + +// asks Index to attempt a Jump() to specified region +// returns success/failure +bool BamReaderPrivate::SetRegion(const BamRegion& region) { + + // clear out any prior BamReader region data + // + // N.B. - this is cleared so that BamIndex now has free reign to call + // GetNextAlignmentCore() and do overlap checking without worrying about BamReader + // performing any overlap checking of its own and moving on to the next read... Calls + // to GetNextAlignmentCore() with no Region set, simply return the next alignment. + // This ensures that the Index is able to do just that. (All without exposing + // LoadNextAlignment() to the public API, and potentially confusing clients with the nomenclature) + Region.clear(); + + // check for existing index + if ( !HasIndex ) return false; + + // adjust region if necessary to reflect where data actually begins + BamRegion adjustedRegion(region); + AdjustRegion(adjustedRegion); + + // if no data present, return true + // not an error, but BamReader knows that no data is there for future alignment access + // (this is useful in a MultiBamReader setting where some BAM files may lack data in regions + // that other BAMs have data) + if ( !HasAlignmentsInRegion ) { + Region = adjustedRegion; + return true; + } + + // attempt jump to user-specified region return false if jump could not be performed at all + // (invalid index, unknown reference, etc) + // + // Index::Jump() is allowed to modify the HasAlignmentsInRegion flag + // * This covers case where a region is requested that lies beyond the last alignment on a reference + // If this occurs, any subsequent calls to GetNexAlignment[Core] simply return false + // BamMultiReader is then able to successfully pull alignments from a region from multiple files + // even if one or more have no data. + if ( !Index->Jump(adjustedRegion, &HasAlignmentsInRegion) ) return false; + + // save region and return success + Region = adjustedRegion; + return true; +} diff --git a/src/api/internal/BamReader_p.h b/src/api/internal/BamReader_p.h new file mode 100644 index 0000000..8c3172f --- /dev/null +++ b/src/api/internal/BamReader_p.h @@ -0,0 +1,137 @@ +// *************************************************************************** +// BamReader_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** + +#ifndef BAMREADER_P_H +#define BAMREADER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include +#include +#include +#include + +namespace BamTools { + +class BamReader; + +namespace Internal { + +class BamReaderPrivate { + + // enums + public: enum RegionState { BEFORE_REGION = 0 + , WITHIN_REGION + , AFTER_REGION + }; + + // ctor & dtor + public: + BamReaderPrivate(BamReader* parent); + ~BamReaderPrivate(void); + + // 'public' interface to BamReader + public: + + // file operations + void Close(void); + bool Open(const std::string& filename, + const std::string& indexFilename, + const bool lookForIndex, + const bool preferStandardIndex); + bool Rewind(void); + bool SetRegion(const BamRegion& region); + + // access alignment data + bool GetNextAlignment(BamAlignment& bAlignment); + bool GetNextAlignmentCore(BamAlignment& bAlignment); + + // access auxiliary data + const std::string GetHeaderText(void) const; + int GetReferenceID(const std::string& refName) const; + + // index operations + bool CreateIndex(bool useStandardIndex); + void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); + + // 'internal' methods + public: + + // --------------------------------------- + // reading alignments and auxiliary data + + // adjusts requested region if necessary (depending on where data actually begins) + void AdjustRegion(BamRegion& region); + // fills out character data for BamAlignment data + bool BuildCharData(BamAlignment& bAlignment); + // checks to see if alignment overlaps current region + RegionState IsOverlap(BamAlignment& bAlignment); + // retrieves header text from BAM file + void LoadHeaderData(void); + // retrieves BAM alignment under file pointer + bool LoadNextAlignment(BamAlignment& bAlignment); + // builds reference data structure from BAM file + void LoadReferenceData(void); + // mark references with 'HasAlignments' status + void MarkReferences(void); + + // --------------------------------- + // index file handling + + // clear out inernal index data structure + void ClearIndex(void); + // loads index from BAM index file + bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex); + + // data members + public: + + // general file data + BgzfData mBGZF; + std::string HeaderText; + BamIndex* Index; + RefVector References; + bool HasIndex; + int64_t AlignmentsBeginOffset; + std::string Filename; + std::string IndexFilename; + +// Internal::BamHeader* m_header; + + // index caching mode + BamIndex::BamIndexCacheMode IndexCacheMode; + + // system data + bool IsBigEndian; + + // user-specified region values + BamRegion Region; + bool HasAlignmentsInRegion; + + // parent BamReader + BamReader* Parent; + + // BAM character constants + const char* DNA_LOOKUP; + const char* CIGAR_LOOKUP; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMREADER_P_H diff --git a/src/api/internal/BamStandardIndex_p.cpp b/src/api/internal/BamStandardIndex_p.cpp new file mode 100644 index 0000000..c90aef6 --- /dev/null +++ b/src/api/internal/BamStandardIndex_p.cpp @@ -0,0 +1,910 @@ +// *************************************************************************** +// BamStandardIndex.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the standardized BAM index format (".bai") +// *************************************************************************** + +#include +#include +#include +#include +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +#include +#include +using namespace std; + +BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader) + : BamIndex(bgzf, reader) + , m_dataBeginOffset(0) + , m_hasFullDataCache(false) +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +BamStandardIndex::~BamStandardIndex(void) { + ClearAllData(); +} + +// calculate bins that overlap region +int BamStandardIndex::BinsFromRegion(const BamRegion& region, + const bool isRightBoundSpecified, + uint16_t bins[MAX_BIN]) +{ + // get region boundaries + uint32_t begin = (unsigned int)region.LeftPosition; + uint32_t end; + + // if right bound specified AND left&right bounds are on same reference + // OK to use right bound position + if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) ) + end = (unsigned int)region.RightPosition; + + // otherwise, use end of left bound reference as cutoff + else + end = (unsigned int)m_references.at(region.LeftRefID).RefLength - 1; + + // initialize list, bin '0' always a valid bin + int i = 0; + bins[i++] = 0; + + // get rest of bins that contain this region + unsigned int k; + for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; } + for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; } + for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; } + for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; } + for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; } + + // return number of bins stored + return i; +} + +// creates index data (in-memory) from current reader data +bool BamStandardIndex::Build(void) { + + // be sure reader & BGZF file are valid & open for reading + if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + return false; + + // move file pointer to beginning of alignments + m_reader->Rewind(); + + // get reference count, reserve index space + const int numReferences = (int)m_references.size(); + m_indexData.clear(); + m_hasFullDataCache = false; + SetReferenceCount(numReferences); + + // sets default constant for bin, ID, offset, coordinate variables + const uint32_t defaultValue = 0xffffffffu; + + // bin data + uint32_t saveBin(defaultValue); + uint32_t lastBin(defaultValue); + + // reference ID data + int32_t saveRefID(defaultValue); + int32_t lastRefID(defaultValue); + + // offset data + uint64_t saveOffset = m_BGZF->Tell(); + uint64_t lastOffset = saveOffset; + + // coordinate data + int32_t lastCoordinate = defaultValue; + + BamAlignment bAlignment; + while ( m_reader->GetNextAlignmentCore(bAlignment) ) { + + // change of chromosome, save ID, reset bin + if ( lastRefID != bAlignment.RefID ) { + lastRefID = bAlignment.RefID; + lastBin = defaultValue; + } + + // if lastCoordinate greater than BAM position - file not sorted properly + else if ( lastCoordinate > bAlignment.Position ) { + fprintf(stderr, "BAM file not properly sorted:\n"); + fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), + lastCoordinate, bAlignment.Position, bAlignment.RefID); + exit(1); + } + + // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) + if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { + + // save linear offset entry (matched to BAM entry refID) + BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID); + if ( indexIter == m_indexData.end() ) return false; // error + ReferenceIndex& refIndex = (*indexIter).second; + LinearOffsetVector& offsets = refIndex.Offsets; + SaveLinearOffset(offsets, bAlignment, lastOffset); + } + + // if current BamAlignment bin != lastBin, "then possibly write the binning index" + if ( bAlignment.Bin != lastBin ) { + + // if not first time through + if ( saveBin != defaultValue ) { + + // save Bam bin entry + BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); + if ( indexIter == m_indexData.end() ) return false; // error + ReferenceIndex& refIndex = (*indexIter).second; + BamBinMap& binMap = refIndex.Bins; + SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); + } + + // update saveOffset + saveOffset = lastOffset; + + // update bin values + saveBin = bAlignment.Bin; + lastBin = bAlignment.Bin; + + // update saveRefID + saveRefID = bAlignment.RefID; + + // if invalid RefID, break out + if ( saveRefID < 0 ) break; + } + + // make sure that current file pointer is beyond lastOffset + if ( m_BGZF->Tell() <= (int64_t)lastOffset ) { + fprintf(stderr, "Error in BGZF offsets.\n"); + exit(1); + } + + // update lastOffset + lastOffset = m_BGZF->Tell(); + + // update lastCoordinate + lastCoordinate = bAlignment.Position; + } + + // save any leftover BAM data (as long as refID is valid) + if ( saveRefID >= 0 ) { + // save Bam bin entry + BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); + if ( indexIter == m_indexData.end() ) return false; // error + ReferenceIndex& refIndex = (*indexIter).second; + BamBinMap& binMap = refIndex.Bins; + SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); + } + + // simplify index by merging chunks + MergeChunks(); + + // iterate through references in index + // sort offsets in linear offset vector + BamStandardIndexData::iterator indexIter = m_indexData.begin(); + BamStandardIndexData::iterator indexEnd = m_indexData.end(); + for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { + + // get reference index data + ReferenceIndex& refIndex = (*indexIter).second; + LinearOffsetVector& offsets = refIndex.Offsets; + + // sort linear offsets + sort(offsets.begin(), offsets.end()); + } + + // rewind file pointer to beginning of alignments, return success/fail + return m_reader->Rewind(); +} + +// check index file magic number, return true if OK +bool BamStandardIndex::CheckMagicNumber(void) { + + // read in magic number + char magic[4]; + size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream); + + // compare to expected value + if ( strncmp(magic, "BAI\1", 4) != 0 ) { + fprintf(stderr, "Problem with index file - invalid format.\n"); + fclose(m_indexStream); + return false; + } + + // return success/failure of load + return (elementsRead == 4); +} + +// clear all current index offset data in memory +void BamStandardIndex::ClearAllData(void) { + BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); + BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + const int& refId = (*indexIter).first; + ClearReferenceOffsets(refId); + } +} + +// clear all index offset data for desired reference +void BamStandardIndex::ClearReferenceOffsets(const int& refId) { + + // look up refId, skip if not found + BamStandardIndexData::iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end() ) return ; + + // clear reference data + ReferenceIndex& refEntry = (*indexIter).second; + refEntry.Bins.clear(); + refEntry.Offsets.clear(); + + // set flag + m_hasFullDataCache = false; +} + +// return file position after header metadata +const off_t BamStandardIndex::DataBeginOffset(void) const { + return m_dataBeginOffset; +} + +// calculates offset(s) for a given region +bool BamStandardIndex::GetOffsets(const BamRegion& region, + const bool isRightBoundSpecified, + vector& offsets, + bool* hasAlignmentsInRegion) +{ + // return false if leftBound refID is not found in index data + if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) + return false; + + // load index data for region if not already cached + if ( !IsDataLoaded(region.LeftRefID) ) { + bool loadedOk = true; + loadedOk &= SkipToReference(region.LeftRefID); + loadedOk &= LoadReference(region.LeftRefID); + if ( !loadedOk ) return false; + } + + // calculate which bins overlap this region + uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); + int numBins = BinsFromRegion(region, isRightBoundSpecified, bins); + + // get bins for this reference + BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); + if ( indexIter == m_indexData.end() ) return false; // error + const ReferenceIndex& refIndex = (*indexIter).second; + const BamBinMap& binMap = refIndex.Bins; + + // get minimum offset to consider + const LinearOffsetVector& linearOffsets = refIndex.Offsets; + const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) + ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); + + // store all alignment 'chunk' starts (file offsets) for bins in this region + for ( int i = 0; i < numBins; ++i ) { + + const uint16_t binKey = bins[i]; + map::const_iterator binIter = binMap.find(binKey); + if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { + + // iterate over chunks + const ChunkVector& chunks = (*binIter).second; + std::vector::const_iterator chunksIter = chunks.begin(); + std::vector::const_iterator chunksEnd = chunks.end(); + for ( ; chunksIter != chunksEnd; ++chunksIter) { + + // if valid chunk found, store its file offset + const Chunk& chunk = (*chunksIter); + if ( chunk.Stop > minOffset ) + offsets.push_back( chunk.Start ); + } + } + } + + // clean up memory + free(bins); + + // sort the offsets before returning + sort(offsets.begin(), offsets.end()); + + // set flag & return success + *hasAlignmentsInRegion = (offsets.size() != 0 ); + + // if cache mode set to none, dump the data we just loaded + if (m_cacheMode == BamIndex::NoIndexCaching ) + ClearReferenceOffsets(region.LeftRefID); + + // return succes + return true; +} + +// returns whether reference has alignments or no +bool BamStandardIndex::HasAlignments(const int& refId) const { + BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end() ) return false; // error + const ReferenceIndex& refEntry = (*indexIter).second; + return refEntry.HasAlignments; +} + +// return true if all index data is cached +bool BamStandardIndex::HasFullDataCache(void) const { + return m_hasFullDataCache; +} + +// returns true if index cache has data for desired reference +bool BamStandardIndex::IsDataLoaded(const int& refId) const { + + // look up refId, return false if not found + BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end() ) return false; + + // see if reference has alignments + // if not, it's not a problem to have no offset data + const ReferenceIndex& refEntry = (*indexIter).second; + if ( !refEntry.HasAlignments ) return true; + + // return whether bin map contains data + return ( !refEntry.Bins.empty() ); +} + +// attempts to use index to jump to region; returns success/fail +bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { + + // be sure reader & BGZF file are valid & open for reading + if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + return false; + + // make sure left-bound position is valid + if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) + return false; + + // calculate offsets for this region + // if failed, print message, set flag, and return failure + vector offsets; + if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) { + fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n"); + *hasAlignmentsInRegion = false; + return false; + } + + // iterate through offsets + BamAlignment bAlignment; + bool result = true; + for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { + + // attempt seek & load first available alignment + // set flag to true if data exists + result &= m_BGZF->Seek(*o); + *hasAlignmentsInRegion = m_reader->GetNextAlignmentCore(bAlignment); + + // if this alignment corresponds to desired position + // return success of seeking back to the offset before the 'current offset' (to cover overlaps) + if ( ((bAlignment.RefID == region.LeftRefID) && + ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) || + (bAlignment.RefID > region.LeftRefID) ) + { + if ( o != offsets.begin() ) --o; + return m_BGZF->Seek(*o); + } + } + + // if error in jumping, print message & set flag + if ( !result ) { + fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n"); + *hasAlignmentsInRegion = false; + } + + // return success/failure + return result; +} + +// clears index data from all references except the first +void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) { + BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); + KeepOnlyReferenceOffsets((*indexBegin).first); +} + +// clears index data from all references except the one specified +void BamStandardIndex::KeepOnlyReferenceOffsets(const int& refId) { + BamStandardIndexData::iterator mapIter = m_indexData.begin(); + BamStandardIndexData::iterator mapEnd = m_indexData.end(); + for ( ; mapIter != mapEnd; ++mapIter ) { + const int entryRefId = (*mapIter).first; + if ( entryRefId != refId ) + ClearReferenceOffsets(entryRefId); + } +} + +bool BamStandardIndex::LoadAllReferences(bool saveData) { + + // skip if data already loaded + if ( m_hasFullDataCache ) return true; + + // get number of reference sequences + uint32_t numReferences; + if ( !LoadReferenceCount((int&)numReferences) ) + return false; + + // iterate over reference entries + bool loadedOk = true; + for ( int i = 0; i < (int)numReferences; ++i ) + loadedOk &= LoadReference(i, saveData); + + // set flag + if ( loadedOk && saveData ) + m_hasFullDataCache = true; + + // return success/failure of loading references + return loadedOk; +} + +// load header data from index file, return true if loaded OK +bool BamStandardIndex::LoadHeader(void) { + + bool loadedOk = CheckMagicNumber(); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_indexStream); + + // return success/failure of load + return loadedOk; +} + +// load a single index bin entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::LoadBin(ReferenceIndex& refEntry, bool saveData) { + + size_t elementsRead = 0; + + // get bin ID + uint32_t binId; + elementsRead += fread(&binId, sizeof(binId), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(binId); + + // load alignment chunks for this bin + ChunkVector chunks; + bool chunksOk = LoadChunks(chunks, saveData); + + // store bin entry + if ( chunksOk && saveData ) + refEntry.Bins.insert(pair(binId, chunks)); + + // return success/failure of load + return ( (elementsRead == 1) && chunksOk ); +} + +bool BamStandardIndex::LoadBins(ReferenceIndex& refEntry, bool saveData) { + + size_t elementsRead = 0; + + // get number of bins + int32_t numBins; + elementsRead += fread(&numBins, sizeof(numBins), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numBins); + + // set flag + refEntry.HasAlignments = ( numBins != 0 ); + + // iterate over bins + bool binsOk = true; + for ( int i = 0; i < numBins; ++i ) + binsOk &= LoadBin(refEntry, saveData); + + // return success/failure of load + return ( (elementsRead == 1) && binsOk ); +} + +// load a single index bin entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::LoadChunk(ChunkVector& chunks, bool saveData) { + + size_t elementsRead = 0; + + // read in chunk data + uint64_t start; + uint64_t stop; + elementsRead += fread(&start, sizeof(start), 1, m_indexStream); + elementsRead += fread(&stop, sizeof(stop), 1, m_indexStream); + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + + // save data if requested + if ( saveData ) chunks.push_back( Chunk(start, stop) ); + + // return success/failure of load + return ( elementsRead == 2 ); +} + +bool BamStandardIndex::LoadChunks(ChunkVector& chunks, bool saveData) { + + size_t elementsRead = 0; + + // read in number of chunks + uint32_t numChunks; + elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numChunks); + + // initialize space for chunks if we're storing this data + if ( saveData ) chunks.reserve(numChunks); + + // iterate over chunks + bool chunksOk = true; + for ( int i = 0; i < (int)numChunks; ++i ) + chunksOk &= LoadChunk(chunks, saveData); + + // sort chunk vector + sort( chunks.begin(), chunks.end(), ChunkLessThan ); + + // return success/failure of load + return ( (elementsRead == 1) && chunksOk ); +} + +// load a single index linear offset entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) { + + size_t elementsRead = 0; + + // read in number of linear offsets + int32_t numLinearOffsets; + elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); + + // set up destination vector (if we're saving the data) + LinearOffsetVector linearOffsets; + if ( saveData ) linearOffsets.reserve(numLinearOffsets); + + // iterate over linear offsets + uint64_t linearOffset; + for ( int i = 0; i < numLinearOffsets; ++i ) { + elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_64(linearOffset); + if ( saveData ) linearOffsets.push_back(linearOffset); + } + + // sort linear offsets + sort ( linearOffsets.begin(), linearOffsets.end() ); + + // save in reference index entry if desired + if ( saveData ) refEntry.Offsets = linearOffsets; + + // return success/failure of load + return ( elementsRead == (size_t)(numLinearOffsets + 1) ); +} + +bool BamStandardIndex::LoadFirstReference(bool saveData) { + BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); + return LoadReference((*indexBegin).first, saveData); +} + +// load a single reference from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::LoadReference(const int& refId, bool saveData) { + + // look up refId + BamStandardIndexData::iterator indexIter = m_indexData.find(refId); + + // if reference not previously loaded, create new entry + if ( indexIter == m_indexData.end() ) { + ReferenceIndex newEntry; + newEntry.HasAlignments = false; + m_indexData.insert( pair(refId, newEntry) ); + } + + // load reference data + indexIter = m_indexData.find(refId); + ReferenceIndex& entry = (*indexIter).second; + bool loadedOk = true; + loadedOk &= LoadBins(entry, saveData); + loadedOk &= LoadLinearOffsets(entry, saveData); + return loadedOk; +} + +// loads number of references, return true if loaded OK +bool BamStandardIndex::LoadReferenceCount(int& numReferences) { + + size_t elementsRead = 0; + + // read reference count + elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + + // return success/failure of load + return ( elementsRead == 1 ); +} + +// merges 'alignment chunks' in BAM bin (used for index building) +void BamStandardIndex::MergeChunks(void) { + + // iterate over reference enties + BamStandardIndexData::iterator indexIter = m_indexData.begin(); + BamStandardIndexData::iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + + // get BAM bin map for this reference + ReferenceIndex& refIndex = (*indexIter).second; + BamBinMap& bamBinMap = refIndex.Bins; + + // iterate over BAM bins + BamBinMap::iterator binIter = bamBinMap.begin(); + BamBinMap::iterator binEnd = bamBinMap.end(); + for ( ; binIter != binEnd; ++binIter ) { + + // get chunk vector for this bin + ChunkVector& binChunks = (*binIter).second; + if ( binChunks.size() == 0 ) continue; + + ChunkVector mergedChunks; + mergedChunks.push_back( binChunks[0] ); + + // iterate over chunks + int i = 0; + ChunkVector::iterator chunkIter = binChunks.begin(); + ChunkVector::iterator chunkEnd = binChunks.end(); + for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { + + // get 'currentChunk' based on numeric index + Chunk& currentChunk = mergedChunks[i]; + + // get iteratorChunk based on vector iterator + Chunk& iteratorChunk = (*chunkIter); + + // if chunk ends where (iterator) chunk starts, then merge + if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) + currentChunk.Stop = iteratorChunk.Stop; + + // otherwise + else { + // set currentChunk + 1 to iteratorChunk + mergedChunks.push_back(iteratorChunk); + ++i; + } + } + + // saved merged chunk vector + (*binIter).second = mergedChunks; + } + } +} + +// saves BAM bin entry for index +void BamStandardIndex::SaveBinEntry(BamBinMap& binMap, + const uint32_t& saveBin, + const uint64_t& saveOffset, + const uint64_t& lastOffset) +{ + // look up saveBin + BamBinMap::iterator binIter = binMap.find(saveBin); + + // create new chunk + Chunk newChunk(saveOffset, lastOffset); + + // if entry doesn't exist + if ( binIter == binMap.end() ) { + ChunkVector newChunks; + newChunks.push_back(newChunk); + binMap.insert( pair(saveBin, newChunks)); + } + + // otherwise + else { + ChunkVector& binChunks = (*binIter).second; + binChunks.push_back( newChunk ); + } +} + +// saves linear offset entry for index +void BamStandardIndex::SaveLinearOffset(LinearOffsetVector& offsets, + const BamAlignment& bAlignment, + const uint64_t& lastOffset) +{ + // get converted offsets + int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; + int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; + + // resize vector if necessary + int oldSize = offsets.size(); + int newSize = endOffset + 1; + if ( oldSize < newSize ) + offsets.resize(newSize, 0); + + // store offset + for( int i = beginOffset + 1; i <= endOffset; ++i ) { + if ( offsets[i] == 0 ) + offsets[i] = lastOffset; + } +} + +// initializes index data structure to hold @count references +void BamStandardIndex::SetReferenceCount(const int& count) { + for ( int i = 0; i < count; ++i ) + m_indexData[i].HasAlignments = false; +} + +bool BamStandardIndex::SkipToFirstReference(void) { + BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); + return SkipToReference( (*indexBegin).first ); +} + +// position file pointer to desired reference begin, return true if skipped OK +bool BamStandardIndex::SkipToReference(const int& refId) { + + // attempt rewind + if ( !Rewind() ) return false; + + // read in number of references + uint32_t numReferences; + size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(numReferences); + + // iterate over reference entries + bool skippedOk = true; + int currentRefId = 0; + while (currentRefId != refId) { + skippedOk &= LoadReference(currentRefId, false); + ++currentRefId; + } + + // return success + return skippedOk; +} + +// write header to new index file +bool BamStandardIndex::WriteHeader(void) { + + size_t elementsWritten = 0; + + // write magic number + elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_indexStream); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_indexStream); + + // return success/failure of write + return (elementsWritten == 4); +} + +// write index data for all references to new index file +bool BamStandardIndex::WriteAllReferences(void) { + + size_t elementsWritten = 0; + + // write number of reference sequences + int32_t numReferenceSeqs = m_indexData.size(); + if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs); + elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_indexStream); + + // iterate over reference sequences + bool refsOk = true; + BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); + BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++ indexIter ) + refsOk &= WriteReference( (*indexIter).second ); + + // return success/failure of write + return ( (elementsWritten == 1) && refsOk ); +} + +// write index data for bin to new index file +bool BamStandardIndex::WriteBin(const uint32_t& binId, const ChunkVector& chunks) { + + size_t elementsWritten = 0; + + // write BAM bin ID + uint32_t binKey = binId; + if ( m_isBigEndian ) SwapEndian_32(binKey); + elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_indexStream); + + // write chunks + bool chunksOk = WriteChunks(chunks); + + // return success/failure of write + return ( (elementsWritten == 1) && chunksOk ); +} + +// write index data for bins to new index file +bool BamStandardIndex::WriteBins(const BamBinMap& bins) { + + size_t elementsWritten = 0; + + // write number of bins + int32_t binCount = bins.size(); + if ( m_isBigEndian ) SwapEndian_32(binCount); + elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_indexStream); + + // iterate over bins + bool binsOk = true; + BamBinMap::const_iterator binIter = bins.begin(); + BamBinMap::const_iterator binEnd = bins.end(); + for ( ; binIter != binEnd; ++binIter ) + binsOk &= WriteBin( (*binIter).first, (*binIter).second ); + + // return success/failure of write + return ( (elementsWritten == 1) && binsOk ); +} + +// write index data for chunk entry to new index file +bool BamStandardIndex::WriteChunk(const Chunk& chunk) { + + size_t elementsWritten = 0; + + // localize alignment chunk offsets + uint64_t start = chunk.Start; + uint64_t stop = chunk.Stop; + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + + // write to index file + elementsWritten += fwrite(&start, sizeof(start), 1, m_indexStream); + elementsWritten += fwrite(&stop, sizeof(stop), 1, m_indexStream); + + // return success/failure of write + return ( elementsWritten == 2 ); +} + +// write index data for chunk entry to new index file +bool BamStandardIndex::WriteChunks(const ChunkVector& chunks) { + + size_t elementsWritten = 0; + + // write chunks + int32_t chunkCount = chunks.size(); + if ( m_isBigEndian ) SwapEndian_32(chunkCount); + elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_indexStream); + + // iterate over chunks + bool chunksOk = true; + ChunkVector::const_iterator chunkIter = chunks.begin(); + ChunkVector::const_iterator chunkEnd = chunks.end(); + for ( ; chunkIter != chunkEnd; ++chunkIter ) + chunksOk &= WriteChunk( (*chunkIter) ); + + // return success/failure of write + return ( (elementsWritten == 1) && chunksOk ); +} + +// write index data for linear offsets entry to new index file +bool BamStandardIndex::WriteLinearOffsets(const LinearOffsetVector& offsets) { + + size_t elementsWritten = 0; + + // write number of linear offsets + int32_t offsetCount = offsets.size(); + if ( m_isBigEndian ) SwapEndian_32(offsetCount); + elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_indexStream); + + // iterate over linear offsets + LinearOffsetVector::const_iterator offsetIter = offsets.begin(); + LinearOffsetVector::const_iterator offsetEnd = offsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) { + + // write linear offset + uint64_t linearOffset = (*offsetIter); + if ( m_isBigEndian ) SwapEndian_64(linearOffset); + elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_indexStream); + } + + // return success/failure of write + return ( elementsWritten == (size_t)(offsetCount + 1) ); +} + +// write index data for a single reference to new index file +bool BamStandardIndex::WriteReference(const ReferenceIndex& refEntry) { + bool refOk = true; + refOk &= WriteBins(refEntry.Bins); + refOk &= WriteLinearOffsets(refEntry.Offsets); + return refOk; +} diff --git a/src/api/internal/BamStandardIndex_p.h b/src/api/internal/BamStandardIndex_p.h new file mode 100644 index 0000000..da179f4 --- /dev/null +++ b/src/api/internal/BamStandardIndex_p.h @@ -0,0 +1,213 @@ +// *************************************************************************** +// BamStandardIndex.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the standardized BAM index format (".bai") +// *************************************************************************** + +#ifndef BAM_STANDARD_INDEX_FORMAT_H +#define BAM_STANDARD_INDEX_FORMAT_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. + +#include +#include +#include +#include +#include + +namespace BamTools { + +class BamAlignment; + +namespace Internal { + +// BAM index constants +const int MAX_BIN = 37450; // =(8^6-1)/7+1 +const int BAM_LIDX_SHIFT = 14; + +// -------------------------------------------------- +// BamStandardIndex data structures & typedefs +struct Chunk { + + // data members + uint64_t Start; + uint64_t Stop; + + // constructor + Chunk(const uint64_t& start = 0, + const uint64_t& stop = 0) + : Start(start) + , Stop(stop) + { } +}; + +inline +bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) { + return lhs.Start < rhs.Start; +} + +typedef std::vector ChunkVector; +typedef std::map BamBinMap; +typedef std::vector LinearOffsetVector; + +struct ReferenceIndex { + + // data members + BamBinMap Bins; + LinearOffsetVector Offsets; + bool HasAlignments; + + // constructor + ReferenceIndex(const BamBinMap& binMap = BamBinMap(), + const LinearOffsetVector& offsets = LinearOffsetVector(), + const bool hasAlignments = false) + : Bins(binMap) + , Offsets(offsets) + , HasAlignments(hasAlignments) + { } +}; + +typedef std::map BamStandardIndexData; + +class BamStandardIndex : public BamIndex { + + // ctor & dtor + public: + BamStandardIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader); + ~BamStandardIndex(void); + + // interface (implements BamIndex virtual methods) + public: + // creates index data (in-memory) from current reader data + bool Build(void); + // returns supported file extension + const std::string Extension(void) const { return std::string(".bai"); } + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // attempts to use index to jump to region; returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + public: + // clear all current index offset data in memory + void ClearAllData(void); + // return file position after header metadata + const off_t DataBeginOffset(void) const; + // return true if all index data is cached + bool HasFullDataCache(void) const; + // clears index data from all references except the first + void KeepOnlyFirstReferenceOffsets(void); + // load index data for all references, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadAllReferences(bool saveData = true); + // load first reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadFirstReference(bool saveData = true); + // load header data from index file, return true if loaded OK + bool LoadHeader(void); + // position file pointer to first reference begin, return true if skipped OK + bool SkipToFirstReference(void); + // write index reference data + bool WriteAllReferences(void); + // write index header data + bool WriteHeader(void); + + // 'internal' methods + public: + + // ----------------------- + // index file operations + + // check index file magic number, return true if OK + bool CheckMagicNumber(void); + // check index file version, return true if OK + bool CheckVersion(void); + // load a single index bin entry from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadBin(ReferenceIndex& refEntry, bool saveData = true); + bool LoadBins(ReferenceIndex& refEntry, bool saveData = true); + // load a single index bin entry from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadChunk(ChunkVector& chunks, bool saveData = true); + bool LoadChunks(ChunkVector& chunks, bool saveData = true); + // load a single index linear offset entry from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true); + // load a single reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadReference(const int& refId, bool saveData = true); + // loads number of references, return true if loaded OK + bool LoadReferenceCount(int& numReferences); + // position file pointer to desired reference begin, return true if skipped OK + bool SkipToReference(const int& refId); + // write index data for bin to new index file + bool WriteBin(const uint32_t& binId, const ChunkVector& chunks); + // write index data for bins to new index file + bool WriteBins(const BamBinMap& bins); + // write index data for chunk entry to new index file + bool WriteChunk(const Chunk& chunk); + // write index data for chunk entry to new index file + bool WriteChunks(const ChunkVector& chunks); + // write index data for linear offsets entry to new index file + bool WriteLinearOffsets(const LinearOffsetVector& offsets); + // write index data single reference to new index file + bool WriteReference(const ReferenceIndex& refEntry); + + // ----------------------- + // index data operations + + // calculate bins that overlap region + int BinsFromRegion(const BamRegion& region, + const bool isRightBoundSpecified, + uint16_t bins[MAX_BIN]); + // clear all index offset data for desired reference + void ClearReferenceOffsets(const int& refId); + // calculates offset(s) for a given region + bool GetOffsets(const BamRegion& region, + const bool isRightBoundSpecified, + std::vector& offsets, + bool* hasAlignmentsInRegion); + // returns true if index cache has data for desired reference + bool IsDataLoaded(const int& refId) const; + // clears index data from all references except the one specified + void KeepOnlyReferenceOffsets(const int& refId); + // simplifies index by merging 'chunks' + void MergeChunks(void); + // saves BAM bin entry for index + void SaveBinEntry(BamBinMap& binMap, + const uint32_t& saveBin, + const uint64_t& saveOffset, + const uint64_t& lastOffset); + // saves linear offset entry for index + void SaveLinearOffset(LinearOffsetVector& offsets, + const BamAlignment& bAlignment, + const uint64_t& lastOffset); + // initializes index data structure to hold @count references + void SetReferenceCount(const int& count); + + // data members + private: + + BamStandardIndexData m_indexData; + off_t m_dataBeginOffset; + bool m_hasFullDataCache; + bool m_isBigEndian; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAM_STANDARD_INDEX_FORMAT_H diff --git a/src/api/internal/BamToolsIndex_p.cpp b/src/api/internal/BamToolsIndex_p.cpp new file mode 100644 index 0000000..5590e8e --- /dev/null +++ b/src/api/internal/BamToolsIndex_p.cpp @@ -0,0 +1,577 @@ +// *************************************************************************** +// BamToolsIndex.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the BamTools index format (".bti") +// *************************************************************************** + +#include +#include +#include +#include +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +#include +#include +using namespace std; + +BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader) + : BamIndex(bgzf, reader) + , m_blockSize(1000) + , m_dataBeginOffset(0) + , m_hasFullDataCache(false) + , m_inputVersion(0) + , m_outputVersion(BTI_1_2) // latest version - used for writing new index files +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +// dtor +BamToolsIndex::~BamToolsIndex(void) { + ClearAllData(); +} + +// creates index data (in-memory) from current reader data +bool BamToolsIndex::Build(void) { + + // be sure reader & BGZF file are valid & open for reading + if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + return false; + + // move file pointer to beginning of alignments + if ( !m_reader->Rewind() ) return false; + + // initialize index data structure with space for all references + const int numReferences = (int)m_references.size(); + m_indexData.clear(); + m_hasFullDataCache = false; + SetReferenceCount(numReferences); + + // set up counters and markers + int32_t currentBlockCount = 0; + int64_t currentAlignmentOffset = m_BGZF->Tell(); + int32_t blockRefId = 0; + int32_t blockMaxEndPosition = 0; + int64_t blockStartOffset = currentAlignmentOffset; + int32_t blockStartPosition = -1; + + // plow through alignments, storing index entries + BamAlignment al; + while ( m_reader->GetNextAlignmentCore(al) ) { + + // if block contains data (not the first time through) AND alignment is on a new reference + if ( currentBlockCount > 0 && al.RefID != blockRefId ) { + + // store previous data + BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); + SaveOffsetEntry(blockRefId, entry); + + // intialize new block for current alignment's reference + currentBlockCount = 0; + blockMaxEndPosition = al.GetEndPosition(); + blockStartOffset = currentAlignmentOffset; + } + + // if beginning of block, save first alignment's refID & position + if ( currentBlockCount == 0 ) { + blockRefId = al.RefID; + blockStartPosition = al.Position; + } + + // increment block counter + ++currentBlockCount; + + // check end position + int32_t alignmentEndPosition = al.GetEndPosition(); + if ( alignmentEndPosition > blockMaxEndPosition ) + blockMaxEndPosition = alignmentEndPosition; + + // if block is full, get offset for next block, reset currentBlockCount + if ( currentBlockCount == m_blockSize ) { + BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); + SaveOffsetEntry(blockRefId, entry); + blockStartOffset = m_BGZF->Tell(); + currentBlockCount = 0; + } + + // not the best name, but for the next iteration, this value will be the offset of the *current* alignment + // necessary because we won't know if this next alignment is on a new reference until we actually read it + currentAlignmentOffset = m_BGZF->Tell(); + } + + // store final block with data + BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); + SaveOffsetEntry(blockRefId, entry); + + // set flag + m_hasFullDataCache = true; + + // return success/failure of rewind + return m_reader->Rewind(); +} + +// check index file magic number, return true if OK +bool BamToolsIndex::CheckMagicNumber(void) { + + // see if index is valid BAM index + char magic[4]; + size_t elementsRead = fread(magic, 1, 4, m_indexStream); + if ( elementsRead != 4 ) return false; + if ( strncmp(magic, "BTI\1", 4) != 0 ) { + fprintf(stderr, "Problem with index file - invalid format.\n"); + return false; + } + + // otherwise ok + return true; +} + +// check index file version, return true if OK +bool BamToolsIndex::CheckVersion(void) { + + // read version from file + size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(m_inputVersion); + + // if version is negative, or zero + if ( m_inputVersion <= 0 ) { + fprintf(stderr, "Problem with index file - invalid version.\n"); + return false; + } + + // if version is newer than can be supported by this version of bamtools + else if ( m_inputVersion > m_outputVersion ) { + fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n"); + fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n"); + return false; + } + + // ------------------------------------------------------------------ + // check for deprecated, unsupported versions + // (typically whose format did not accomodate a particular bug fix) + + else if ( (Version)m_inputVersion == BTI_1_0 ) { + fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n"); + fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); + return false; + } + + else if ( (Version)m_inputVersion == BTI_1_1 ) { + fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n"); + fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); + return false; + } + + // otherwise ok + else return true; +} + +// clear all current index offset data in memory +void BamToolsIndex::ClearAllData(void) { + BamToolsIndexData::const_iterator indexIter = m_indexData.begin(); + BamToolsIndexData::const_iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + const int& refId = (*indexIter).first; + ClearReferenceOffsets(refId); + } +} + +// clear all index offset data for desired reference +void BamToolsIndex::ClearReferenceOffsets(const int& refId) { + if ( m_indexData.find(refId) == m_indexData.end() ) return; + vector& offsets = m_indexData[refId].Offsets; + offsets.clear(); + m_hasFullDataCache = false; +} + +// return file position after header metadata +const off_t BamToolsIndex::DataBeginOffset(void) const { + return m_dataBeginOffset; +} + +// calculate BAM file offset for desired region +// return true if no error (*NOT* equivalent to "has alignments or valid offset") +// check @hasAlignmentsInRegion to determine this status +// @region - target region +// @offset - resulting seek target +// @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status +// N.B. - ignores isRightBoundSpecified +bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { + + // return false if leftBound refID is not found in index data + BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); + if ( indexIter == m_indexData.end()) return false; + + // load index data for region if not already cached + if ( !IsDataLoaded(region.LeftRefID) ) { + bool loadedOk = true; + loadedOk &= SkipToReference(region.LeftRefID); + loadedOk &= LoadReference(region.LeftRefID); + if ( !loadedOk ) return false; + } + + // localize index data for this reference (& sanity check that data actually exists) + indexIter = m_indexData.find(region.LeftRefID); + if ( indexIter == m_indexData.end()) return false; + const vector& referenceOffsets = (*indexIter).second.Offsets; + if ( referenceOffsets.empty() ) return false; + + // ------------------------------------------------------- + // calculate nearest index to jump to + + // save first offset + offset = (*referenceOffsets.begin()).StartOffset; + + // iterate over offsets entries on this reference + vector::const_iterator offsetIter = referenceOffsets.begin(); + vector::const_iterator offsetEnd = referenceOffsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) { + const BamToolsIndexEntry& entry = (*offsetIter); + // break if alignment 'entry' overlaps region + if ( entry.MaxEndPosition >= region.LeftPosition ) break; + offset = (*offsetIter).StartOffset; + } + + // set flag based on whether an index entry was found for this region + *hasAlignmentsInRegion = ( offsetIter != offsetEnd ); + + // if cache mode set to none, dump the data we just loaded + if (m_cacheMode == BamIndex::NoIndexCaching ) + ClearReferenceOffsets(region.LeftRefID); + + // return success + return true; +} + +// returns whether reference has alignments or no +bool BamToolsIndex::HasAlignments(const int& refId) const { + + BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end()) return false; + const BamToolsReferenceEntry& refEntry = (*indexIter).second; + return refEntry.HasAlignments; +} + +// return true if all index data is cached +bool BamToolsIndex::HasFullDataCache(void) const { + return m_hasFullDataCache; +} + +// returns true if index cache has data for desired reference +bool BamToolsIndex::IsDataLoaded(const int& refId) const { + + BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end()) return false; + const BamToolsReferenceEntry& refEntry = (*indexIter).second; + + if ( !refEntry.HasAlignments ) return true; // no data period + + // return whether offsets list contains data + return !refEntry.Offsets.empty(); +} + +// attempts to use index to jump to region; returns success/fail +bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { + + // clear flag + *hasAlignmentsInRegion = false; + + // check valid BamReader state + if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) { + fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n"); + return false; + } + + // make sure left-bound position is valid + if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) + return false; + + // calculate nearest offset to jump to + int64_t offset; + if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) { + fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n"); + return false; + } + + // return success/failure of seek + return m_BGZF->Seek(offset); +} + +// clears index data from all references except the first +void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) { + BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); + KeepOnlyReferenceOffsets( (*indexBegin).first ); +} + +// clears index data from all references except the one specified +void BamToolsIndex::KeepOnlyReferenceOffsets(const int& refId) { + BamToolsIndexData::iterator mapIter = m_indexData.begin(); + BamToolsIndexData::iterator mapEnd = m_indexData.end(); + for ( ; mapIter != mapEnd; ++mapIter ) { + const int entryRefId = (*mapIter).first; + if ( entryRefId != refId ) + ClearReferenceOffsets(entryRefId); + } +} + +// load index data for all references, return true if loaded OK +bool BamToolsIndex::LoadAllReferences(bool saveData) { + + // skip if data already loaded + if ( m_hasFullDataCache ) return true; + + // read in number of references + int32_t numReferences; + if ( !LoadReferenceCount(numReferences) ) return false; + //SetReferenceCount(numReferences); + + // iterate over reference entries + bool loadedOk = true; + for ( int i = 0; i < numReferences; ++i ) + loadedOk &= LoadReference(i, saveData); + + // set flag + if ( loadedOk && saveData ) + m_hasFullDataCache = true; + + // return success/failure of load + return loadedOk; +} + +// load header data from index file, return true if loaded OK +bool BamToolsIndex::LoadHeader(void) { + + // check magic number + if ( !CheckMagicNumber() ) return false; + + // check BTI version + if ( !CheckVersion() ) return false; + + // read in block size + size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(m_blockSize); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_indexStream); + + // return success/failure of load + return (elementsRead == 1); +} + +// load a single index entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) { + + // read in index entry data members + size_t elementsRead = 0; + BamToolsIndexEntry entry; + elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream); + elementsRead += fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_indexStream); + elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_indexStream); + if ( elementsRead != 3 ) { + cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl; + return false; + } + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_32(entry.MaxEndPosition); + SwapEndian_64(entry.StartOffset); + SwapEndian_32(entry.StartPosition); + } + + // save data + if ( saveData ) + SaveOffsetEntry(refId, entry); + + // return success/failure of load + return true; +} + +// load a single reference from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamToolsIndex::LoadFirstReference(bool saveData) { + BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); + return LoadReference( (*indexBegin).first, saveData ); +} + +// load a single reference from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamToolsIndex::LoadReference(const int& refId, bool saveData) { + + // read in number of offsets for this reference + uint32_t numOffsets; + size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(numOffsets); + + // initialize offsets container for this reference + SetOffsetCount(refId, (int)numOffsets); + + // iterate over offset entries + for ( unsigned int j = 0; j < numOffsets; ++j ) + LoadIndexEntry(refId, saveData); + + // return success/failure of load + return true; +} + +// loads number of references, return true if loaded OK +bool BamToolsIndex::LoadReferenceCount(int& numReferences) { + + size_t elementsRead = 0; + + // read reference count + elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + + // return success/failure of load + return ( elementsRead == 1 ); +} + +// saves an index offset entry in memory +void BamToolsIndex::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) { + BamToolsReferenceEntry& refEntry = m_indexData[refId]; + refEntry.HasAlignments = true; + refEntry.Offsets.push_back(entry); +} + +// pre-allocates size for offset vector +void BamToolsIndex::SetOffsetCount(const int& refId, const int& offsetCount) { + BamToolsReferenceEntry& refEntry = m_indexData[refId]; + refEntry.Offsets.reserve(offsetCount); + refEntry.HasAlignments = ( offsetCount > 0); +} + +// initializes index data structure to hold @count references +void BamToolsIndex::SetReferenceCount(const int& count) { + for ( int i = 0; i < count; ++i ) + m_indexData[i].HasAlignments = false; +} + +// position file pointer to first reference begin, return true if skipped OK +bool BamToolsIndex::SkipToFirstReference(void) { + BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); + return SkipToReference( (*indexBegin).first ); +} + +// position file pointer to desired reference begin, return true if skipped OK +bool BamToolsIndex::SkipToReference(const int& refId) { + + // attempt rewind + if ( !Rewind() ) return false; + + // read in number of references + int32_t numReferences; + size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(numReferences); + + // iterate over reference entries + bool skippedOk = true; + int currentRefId = 0; + while (currentRefId != refId) { + skippedOk &= LoadReference(currentRefId, false); + ++currentRefId; + } + + // return success/failure of skip + return skippedOk; +} + +// write header to new index file +bool BamToolsIndex::WriteHeader(void) { + + size_t elementsWritten = 0; + + // write BTI index format 'magic number' + elementsWritten += fwrite("BTI\1", 1, 4, m_indexStream); + + // write BTI index format version + int32_t currentVersion = (int32_t)m_outputVersion; + if ( m_isBigEndian ) SwapEndian_32(currentVersion); + elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_indexStream); + + // write block size + int32_t blockSize = m_blockSize; + if ( m_isBigEndian ) SwapEndian_32(blockSize); + elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_indexStream); + + // return success/failure of write + return ( elementsWritten == 6 ); +} + +// write index data for all references to new index file +bool BamToolsIndex::WriteAllReferences(void) { + + size_t elementsWritten = 0; + + // write number of references + int32_t numReferences = (int32_t)m_indexData.size(); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream); + + // iterate through references in index + bool refOk = true; + BamToolsIndexData::const_iterator refIter = m_indexData.begin(); + BamToolsIndexData::const_iterator refEnd = m_indexData.end(); + for ( ; refIter != refEnd; ++refIter ) + refOk &= WriteReferenceEntry( (*refIter).second ); + + return ( (elementsWritten == 1) && refOk ); +} + +// write current reference index data to new index file +bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) { + + size_t elementsWritten = 0; + + // write number of offsets listed for this reference + uint32_t numOffsets = refEntry.Offsets.size(); + if ( m_isBigEndian ) SwapEndian_32(numOffsets); + elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_indexStream); + + // iterate over offset entries + bool entriesOk = true; + vector::const_iterator offsetIter = refEntry.Offsets.begin(); + vector::const_iterator offsetEnd = refEntry.Offsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) + entriesOk &= WriteIndexEntry( (*offsetIter) ); + + return ( (elementsWritten == 1) && entriesOk ); +} + +// write current index offset entry to new index file +bool BamToolsIndex::WriteIndexEntry(const BamToolsIndexEntry& entry) { + + // copy entry data + int32_t maxEndPosition = entry.MaxEndPosition; + int64_t startOffset = entry.StartOffset; + int32_t startPosition = entry.StartPosition; + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_32(maxEndPosition); + SwapEndian_64(startOffset); + SwapEndian_32(startPosition); + } + + // write the reference index entry + size_t elementsWritten = 0; + elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream); + elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream); + elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream); + return ( elementsWritten == 3 ); +} diff --git a/src/api/internal/BamToolsIndex_p.h b/src/api/internal/BamToolsIndex_p.h new file mode 100644 index 0000000..c99834d --- /dev/null +++ b/src/api/internal/BamToolsIndex_p.h @@ -0,0 +1,192 @@ +// *************************************************************************** +// BamToolsIndex.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the BamTools index format (".bti") +// *************************************************************************** + +#ifndef BAMTOOLS_INDEX_FORMAT_H +#define BAMTOOLS_INDEX_FORMAT_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. + +#include +#include +#include +#include +#include + +namespace BamTools { + +namespace Internal { + +// individual index offset entry +struct BamToolsIndexEntry { + + // data members + int32_t MaxEndPosition; + int64_t StartOffset; + int32_t StartPosition; + + // ctor + BamToolsIndexEntry(const int32_t& maxEndPosition = 0, + const int64_t& startOffset = 0, + const int32_t& startPosition = 0) + : MaxEndPosition(maxEndPosition) + , StartOffset(startOffset) + , StartPosition(startPosition) + { } +}; + +// reference index entry +struct BamToolsReferenceEntry { + + // data members + bool HasAlignments; + std::vector Offsets; + + // ctor + BamToolsReferenceEntry(void) + : HasAlignments(false) + { } +}; + +// the actual index data structure +typedef std::map BamToolsIndexData; + +class BamToolsIndex : public BamIndex { + + // keep a list of any supported versions here + // (might be useful later to handle any 'legacy' versions if the format changes) + // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on + // + // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by: + // + // if ( indexVersion >= BTI_1_2 ) + // do something new + // else + // do the old thing + enum Version { BTI_1_0 = 1 + , BTI_1_1 + , BTI_1_2 + }; + + + // ctor & dtor + public: + BamToolsIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader); + ~BamToolsIndex(void); + + // interface (implements BamIndex virtual methods) + public: + // creates index data (in-memory) from current reader data + bool Build(void); + // returns supported file extension + const std::string Extension(void) const { return std::string(".bti"); } + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // attempts to use index to jump to region; returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + public: + // clear all current index offset data in memory + void ClearAllData(void); + // return file position after header metadata + const off_t DataBeginOffset(void) const; + // return true if all index data is cached + bool HasFullDataCache(void) const; + // clears index data from all references except the first + void KeepOnlyFirstReferenceOffsets(void); + // load index data for all references, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadAllReferences(bool saveData = true); + // load first reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadFirstReference(bool saveData = true); + // load header data from index file, return true if loaded OK + bool LoadHeader(void); + // position file pointer to first reference begin, return true if skipped OK + bool SkipToFirstReference(void); + // write index reference data + bool WriteAllReferences(void); + // write index header data + bool WriteHeader(void); + + // 'internal' methods + public: + + // ----------------------- + // index file operations + + // check index file magic number, return true if OK + bool CheckMagicNumber(void); + // check index file version, return true if OK + bool CheckVersion(void); + // return true if FILE* is open + bool IsOpen(void) const; + // load a single index entry from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadIndexEntry(const int& refId, bool saveData = true); + // load a single reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadReference(const int& refId, bool saveData = true); + // loads number of references, return true if loaded OK + bool LoadReferenceCount(int& numReferences); + // position file pointer to desired reference begin, return true if skipped OK + bool SkipToReference(const int& refId); + // write current reference index data to new index file + bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry); + // write current index offset entry to new index file + bool WriteIndexEntry(const BamToolsIndexEntry& entry); + + // ----------------------- + // index data operations + + // clear all index offset data for desired reference + void ClearReferenceOffsets(const int& refId); + // calculate BAM file offset for desired region + // return true if no error (*NOT* equivalent to "has alignments or valid offset") + // check @hasAlignmentsInRegion to determine this status + // @region - target region + // @offset - resulting seek target + // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status + bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + // returns true if index cache has data for desired reference + bool IsDataLoaded(const int& refId) const; + // clears index data from all references except the one specified + void KeepOnlyReferenceOffsets(const int& refId); + // saves an index offset entry in memory + void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry); + // pre-allocates size for offset vector + void SetOffsetCount(const int& refId, const int& offsetCount); + // initializes index data structure to hold @count references + void SetReferenceCount(const int& count); + + // data members + private: + int32_t m_blockSize; + BamToolsIndexData m_indexData; + off_t m_dataBeginOffset; + bool m_hasFullDataCache; + bool m_isBigEndian; + int32_t m_inputVersion; // Version is serialized as int + Version m_outputVersion; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMTOOLS_INDEX_FORMAT_H diff --git a/src/api/internal/BamWriter_p.cpp b/src/api/internal/BamWriter_p.cpp new file mode 100644 index 0000000..a75eb44 --- /dev/null +++ b/src/api/internal/BamWriter_p.cpp @@ -0,0 +1,379 @@ +// *************************************************************************** +// BamWriter_p.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#include +#include +using namespace BamTools; +using namespace BamTools::Internal; +using namespace std; + +BamWriterPrivate::BamWriterPrivate(void) { + IsBigEndian = SystemIsBigEndian(); +} + +BamWriterPrivate::~BamWriterPrivate(void) { + mBGZF.Close(); +} + +// closes the alignment archive +void BamWriterPrivate::Close(void) { + mBGZF.Close(); +} + +// calculates minimum bin for a BAM alignment interval +const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { + --end; + if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); + if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); + if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); + if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); + if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); + return 0; +} + +// creates a cigar string from the supplied alignment +void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { + + // initialize + const unsigned int numCigarOperations = cigarOperations.size(); + packedCigar.resize(numCigarOperations * BT_SIZEOF_INT); + + // pack the cigar data into the string + unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); + + unsigned int cigarOp; + vector::const_iterator coIter; + for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) { + + switch(coIter->Type) { + case 'M': + cigarOp = BAM_CMATCH; + break; + case 'I': + cigarOp = BAM_CINS; + break; + case 'D': + cigarOp = BAM_CDEL; + break; + case 'N': + cigarOp = BAM_CREF_SKIP; + break; + case 'S': + cigarOp = BAM_CSOFT_CLIP; + break; + case 'H': + cigarOp = BAM_CHARD_CLIP; + break; + case 'P': + cigarOp = BAM_CPAD; + break; + default: + fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type); + exit(1); + } + + *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; + pPackedCigar++; + } +} + +// encodes the supplied query sequence into 4-bit notation +void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { + + // prepare the encoded query string + const unsigned int queryLen = query.size(); + const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); + encodedQuery.resize(encodedQueryLen); + char* pEncodedQuery = (char*)encodedQuery.data(); + const char* pQuery = (const char*)query.data(); + + unsigned char nucleotideCode; + bool useHighWord = true; + + while(*pQuery) { + + switch(*pQuery) { + + case '=': + nucleotideCode = 0; + break; + + case 'A': + nucleotideCode = 1; + break; + + case 'C': + nucleotideCode = 2; + break; + + case 'G': + nucleotideCode = 4; + break; + + case 'T': + nucleotideCode = 8; + break; + + case 'N': + nucleotideCode = 15; + break; + + default: + fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); + exit(1); + } + + // pack the nucleotide code + if(useHighWord) { + *pEncodedQuery = nucleotideCode << 4; + useHighWord = false; + } else { + *pEncodedQuery |= nucleotideCode; + pEncodedQuery++; + useHighWord = true; + } + + // increment the query position + pQuery++; + } +} + +// opens the alignment archive +bool BamWriterPrivate::Open(const string& filename, + const string& samHeader, + const RefVector& referenceSequences, + bool isWriteUncompressed) +{ + // open the BGZF file for writing, return failure if error + if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) ) + return false; + + // ================ + // write the header + // ================ + + // write the BAM signature + const unsigned char SIGNATURE_LENGTH = 4; + const char* BAM_SIGNATURE = "BAM\1"; + mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); + + // write the SAM header text length + uint32_t samHeaderLen = samHeader.size(); + if (IsBigEndian) SwapEndian_32(samHeaderLen); + mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); + + // write the SAM header text + if(samHeaderLen > 0) + mBGZF.Write(samHeader.data(), samHeaderLen); + + // write the number of reference sequences + uint32_t numReferenceSequences = referenceSequences.size(); + if (IsBigEndian) SwapEndian_32(numReferenceSequences); + mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); + + // ============================= + // write the sequence dictionary + // ============================= + + RefVector::const_iterator rsIter; + for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { + + // write the reference sequence name length + uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; + if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen); + mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); + + // write the reference sequence name + mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); + + // write the reference sequence length + int32_t referenceLength = rsIter->RefLength; + if (IsBigEndian) SwapEndian_32(referenceLength); + mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); + } + + // return success + return true; +} + +// saves the alignment to the alignment archive +void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { + + // if BamAlignment contains only the core data and a raw char data buffer + // (as a result of BamReader::GetNextAlignmentCore()) + if ( al.SupportData.HasCoreOnly ) { + + // write the block size + unsigned int blockSize = al.SupportData.BlockLength; + if (IsBigEndian) SwapEndian_32(blockSize); + mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; + buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; + buffer[4] = al.SupportData.QuerySequenceLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) + SwapEndian_32(buffer[i]); + } + + // write the BAM core + mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + + // write the raw char data + mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); + } + + // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc + // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) + else { + + // calculate char lengths + const unsigned int nameLength = al.Name.size() + 1; + const unsigned int numCigarOperations = al.CigarData.size(); + const unsigned int queryLength = al.QueryBases.size(); + const unsigned int tagDataLength = al.TagData.size(); + + // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) + // force calculation of Bin before storing + const int endPosition = al.GetEndPosition(); + const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); + + // create our packed cigar string + string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLength = packedCigar.size(); + + // encode the query + string encodedQuery; + EncodeQuerySequence(al.QueryBases, encodedQuery); + const unsigned int encodedQueryLength = encodedQuery.size(); + + // write the block size + const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength; + unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; + if (IsBigEndian) SwapEndian_32(blockSize); + mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) + SwapEndian_32(buffer[i]); + } + + // write the BAM core + mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + + // write the query name + mBGZF.Write(al.Name.c_str(), nameLength); + + // write the packed cigar + if ( IsBigEndian ) { + + char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + + for (unsigned int i = 0; i < packedCigarLength; ++i) { + if ( IsBigEndian ) + SwapEndian_32p(&cigarData[i]); + } + + mBGZF.Write(cigarData, packedCigarLength); + free(cigarData); + } + else + mBGZF.Write(packedCigar.data(), packedCigarLength); + + // write the encoded query sequence + mBGZF.Write(encodedQuery.data(), encodedQueryLength); + + // write the base qualities + string baseQualities(al.Qualities); + char* pBaseQualities = (char*)al.Qualities.data(); + for(unsigned int i = 0; i < queryLength; i++) { + pBaseQualities[i] -= 33; + } + mBGZF.Write(pBaseQualities, queryLength); + + // write the read group tag + if ( IsBigEndian ) { + + char* tagData = (char*)calloc(sizeof(char), tagDataLength); + memcpy(tagData, al.TagData.data(), tagDataLength); + + int i = 0; + while ( (unsigned int)i < tagDataLength ) { + + i += 2; // skip tag type (e.g. "RG", "NM", etc) + uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning + ++i; // skip value type + + switch (type) { + + case('A') : + case('C') : + ++i; + break; + + case('S') : + SwapEndian_16p(&tagData[i]); + i+=2; // sizeof(uint16_t) + break; + + case('F') : + case('I') : + SwapEndian_32p(&tagData[i]); + i+=4; // sizeof(uint32_t) + break; + + case('D') : + SwapEndian_64p(&tagData[i]); + i+=8; // sizeof(uint64_t) + break; + + case('H') : + case('Z') : + while (tagData[i]) { ++i; } + ++i; // increment one more for null terminator + break; + + default : + fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here + free(tagData); + exit(1); + } + } + + mBGZF.Write(tagData, tagDataLength); + free(tagData); + } + else + mBGZF.Write(al.TagData.data(), tagDataLength); + } +} diff --git a/src/api/internal/BamWriter_p.h b/src/api/internal/BamWriter_p.h new file mode 100644 index 0000000..4fe626a --- /dev/null +++ b/src/api/internal/BamWriter_p.h @@ -0,0 +1,63 @@ +// *************************************************************************** +// BamWriter_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#ifndef BAMWRITER_P_H +#define BAMWRITER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. + +#include +#include +#include +#include + +namespace BamTools { +namespace Internal { + +class BamWriterPrivate { + + // ctor & dtor + public: + BamWriterPrivate(void); + ~BamWriterPrivate(void); + + // "public" interface to BamWriter + public: + void Close(void); + bool Open(const std::string& filename, + const std::string& samHeader, + const BamTools::RefVector& referenceSequences, + bool isWriteUncompressed); + void SaveAlignment(const BamAlignment& al); + + // internal methods + public: + const unsigned int CalculateMinimumBin(const int begin, int end) const; + void CreatePackedCigar(const std::vector& cigarOperations, std::string& packedCigar); + void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); + + // data members + public: + BgzfData mBGZF; + bool IsBigEndian; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMWRITER_P_H