From: derek Date: Mon, 28 Nov 2011 23:55:31 +0000 (-0500) Subject: merge with remoteio branch X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=8077f86ef52bfb08c17430b797c737d217d41cf3;p=bamtools.git merge with remoteio branch --- 8077f86ef52bfb08c17430b797c737d217d41cf3 diff --cc src/api/CMakeLists.txt index c8504ed,4500564..539feca --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@@ -49,41 -33,49 +33,51 @@@ set( BamToolsAPISource # create main BamTools API shared library add_library( BamTools SHARED ${BamToolsAPISources} ) -set_target_properties( BamTools PROPERTIES SOVERSION "2.0.5" ) -set_target_properties( BamTools PROPERTIES OUTPUT_NAME "bamtools" ) +set_target_properties( BamTools PROPERTIES + SOVERSION "2.0.5" + OUTPUT_NAME "bamtools" ) - target_link_libraries( BamTools z ) - install( TARGETS BamTools LIBRARY DESTINATION "lib/bamtools" RUNTIME DESTINATION "bin" ) # create main BamTools API static library add_library( BamTools-static STATIC ${BamToolsAPISources} ) - set_target_properties( BamTools-static PROPERTIES - OUTPUT_NAME "bamtools" -set_target_properties( BamTools-static PROPERTIES OUTPUT_NAME "bamtools" ) -set_target_properties( BamTools-static PROPERTIES PREFIX "lib" ) ++set_target_properties( BamTools-static PROPERTIES ++ OUTPUT_NAME "bamtools" + PREFIX "lib" ) - target_link_libraries( BamTools-static z ) - install( TARGETS BamTools-static ARCHIVE DESTINATION "lib/bamtools" ) + + # link libraries with zlib automatically + if ( _WIN32 ) + set( APILibs z ws2_32 ) + else ( _WIN32 ) + set( APILibs z ) + endif ( _WIN32 ) + + target_link_libraries( BamTools ${APILibs} ) + target_link_libraries( BamTools-static ${APILibs} ) + + # set library install destinations + install( TARGETS BamTools LIBRARY DESTINATION "lib/bamtools" RUNTIME DESTINATION "bin") + install( TARGETS BamTools-static ARCHIVE DESTINATION "lib/bamtools") # export API headers - include( ../ExportHeader.cmake ) - set( ApiIncludeDir "api" ) - ExportHeader( APIHeaders api_global.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders BamAlgorithms.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders BamAlignment.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders BamAux.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders BamConstants.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders BamIndex.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders BamMultiReader.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders BamReader.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders BamWriter.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders IBamIODevice.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders SamConstants.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders SamHeader.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders SamProgram.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders SamProgramChain.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders SamReadGroup.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders SamReadGroupDictionary.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders SamSequence.h ${ApiIncludeDir} ) - ExportHeader( APIHeaders SamSequenceDictionary.h ${ApiIncludeDir} ) + include(../ExportHeader.cmake) + set(ApiIncludeDir "api") + ExportHeader(APIHeaders api_global.h ${ApiIncludeDir}) + ExportHeader(APIHeaders BamAlgorithms.h ${ApiIncludeDir}) + ExportHeader(APIHeaders BamAlignment.h ${ApiIncludeDir}) + ExportHeader(APIHeaders BamAux.h ${ApiIncludeDir}) + ExportHeader(APIHeaders BamConstants.h ${ApiIncludeDir}) + ExportHeader(APIHeaders BamIndex.h ${ApiIncludeDir}) + ExportHeader(APIHeaders BamMultiReader.h ${ApiIncludeDir}) + ExportHeader(APIHeaders BamReader.h ${ApiIncludeDir}) + ExportHeader(APIHeaders BamWriter.h ${ApiIncludeDir}) + ExportHeader(APIHeaders IBamIODevice.h ${ApiIncludeDir}) + ExportHeader(APIHeaders SamConstants.h ${ApiIncludeDir}) + ExportHeader(APIHeaders SamHeader.h ${ApiIncludeDir}) + ExportHeader(APIHeaders SamProgram.h ${ApiIncludeDir}) + ExportHeader(APIHeaders SamProgramChain.h ${ApiIncludeDir}) + ExportHeader(APIHeaders SamReadGroup.h ${ApiIncludeDir}) + ExportHeader(APIHeaders SamReadGroupDictionary.h ${ApiIncludeDir}) + ExportHeader(APIHeaders SamSequence.h ${ApiIncludeDir}) + ExportHeader(APIHeaders SamSequenceDictionary.h ${ApiIncludeDir}) -set(AlgorithmsIncludeDir "api/algorithms") -ExportHeader(AlgorithmsHeaders algorithms/Sort.h ${AlgorithmsIncludeDir}) +set( AlgorithmsIncludeDir "api/algorithms" ) +ExportHeader( AlgorithmsHeaders algorithms/Sort.h ${AlgorithmsIncludeDir} ) diff --cc src/api/internal/bam/BamReader_p.cpp index 0000000,6484a10..6904da7 mode 000000,100644..100644 --- a/src/api/internal/bam/BamReader_p.cpp +++ b/src/api/internal/bam/BamReader_p.cpp @@@ -1,0 -1,466 +1,469 @@@ + // *************************************************************************** + // BamReader_p.cpp (c) 2009 Derek Barnett + // Marth Lab, Department of Biology, Boston College + // --------------------------------------------------------------------------- ++<<<<<<< HEAD:src/api/internal/BamReader_p.cpp ++// Last modified: 14 November 2011 (DB) ++======= + // Last modified: 25 October 2011 (DB) ++>>>>>>> remoteio:src/api/internal/bam/BamReader_p.cpp + // --------------------------------------------------------------------------- + // Provides the basic functionality for reading BAM files + // *************************************************************************** + + #include "api/BamConstants.h" + #include "api/BamReader.h" + #include "api/IBamIODevice.h" + #include "api/internal/bam/BamHeader_p.h" + #include "api/internal/bam/BamRandomAccessController_p.h" + #include "api/internal/bam/BamReader_p.h" + #include "api/internal/index/BamStandardIndex_p.h" + #include "api/internal/index/BamToolsIndex_p.h" + #include "api/internal/io/BamDeviceFactory_p.h" + #include "api/internal/utils/BamException_p.h" + using namespace BamTools; + using namespace BamTools::Internal; + + #include + #include + #include + #include + #include + using namespace std; + + // constructor + BamReaderPrivate::BamReaderPrivate(BamReader* parent) + : m_alignmentsBeginOffset(0) + , m_parent(parent) + { + m_isBigEndian = BamTools::SystemIsBigEndian(); + } + + // destructor + BamReaderPrivate::~BamReaderPrivate(void) { + Close(); + } + + // closes the BAM file + bool BamReaderPrivate::Close(void) { + + // clear BAM metadata + m_references.clear(); + m_header.Clear(); + + // clear filename + m_filename.clear(); + + // close random access controller + m_randomAccessController.Close(); + + // if stream is open, attempt close + if ( IsOpen() ) { + try { + m_stream.Close(); + } catch ( BamException& e ) { + const string streamError = e.what(); + const string message = string("encountered error closing BAM file: \n\t") + streamError; + SetErrorString("BamReader::Close", message); + return false; + } + } + + // return success + return true; + } + + // creates an index file of requested type on current BAM file + bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) { + + // skip if BAM file not open + if ( !IsOpen() ) { + SetErrorString("BamReader::CreateIndex", "cannot create index on unopened BAM file"); + return false; + } + + // attempt to create index + if ( m_randomAccessController.CreateIndex(this, type) ) + return true; + else { + const string bracError = m_randomAccessController.GetErrorString(); + const string message = string("could not create index: \n\t") + bracError; + SetErrorString("BamReader::CreateIndex", message); + return false; + } + } + + // return path & filename of current BAM file + const string BamReaderPrivate::Filename(void) const { + return m_filename; + } + + string BamReaderPrivate::GetErrorString(void) const { + return m_errorString; + } + + // return header data as std::string + string BamReaderPrivate::GetHeaderText(void) const { + return m_header.ToString(); + } + + // return header data as SamHeader object + SamHeader BamReaderPrivate::GetSamHeader(void) const { + return m_header.ToSamHeader(); + } + + // get next alignment (with character data fully parsed) + bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) { + + // if valid alignment found + if ( GetNextAlignmentCore(alignment) ) { + + // store alignment's "source" filename + alignment.Filename = m_filename; + + // return success/failure of parsing char data + if ( alignment.BuildCharData() ) + return true; + else { + const string alError = alignment.GetErrorString(); + const string message = string("could not populate alignment data: \n\t") + alError; + SetErrorString("BamReader::GetNextAlignment", message); + return false; + } + } + + // no valid alignment found + return false; + } + + // retrieves next available alignment core data (returns success/fail) + // ** DOES NOT populate any character data fields (read name, bases, qualities, tag data, filename) + // these can be accessed, if necessary, from the supportData + // useful for operations requiring ONLY positional or other alignment-related information + bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) { + + // skip if stream not opened + if ( !m_stream.IsOpen() ) + return false; + + try { + + // skip if region is set but has no alignments + if ( m_randomAccessController.HasRegion() && + !m_randomAccessController.RegionHasAlignments() ) + { + return false; + } + + // if can't read next alignment + if ( !LoadNextAlignment(alignment) ) + return false; + + // check alignment's region-overlap state + BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment); + + // if alignment starts after region, no need to keep reading + if ( state == BamRandomAccessController::AfterRegion ) + return false; + + // read until overlap is found + while ( state != BamRandomAccessController::OverlapsRegion ) { + + // if can't read next alignment + if ( !LoadNextAlignment(alignment) ) + return false; + + // check alignment's region-overlap state + state = m_randomAccessController.AlignmentState(alignment); + + // if alignment starts after region, no need to keep reading + if ( state == BamRandomAccessController::AfterRegion ) + return false; + } + + // if we get here, we found the next 'valid' alignment + // (e.g. overlaps current region if one was set, simply the next alignment if not) + alignment.SupportData.HasCoreOnly = true; + return true; + + } catch ( BamException& e ) { + const string streamError = e.what(); + const string message = string("encountered error reading BAM alignment: \n\t") + streamError; + SetErrorString("BamReader::GetNextAlignmentCore", message); + return false; + } + } + + int BamReaderPrivate::GetReferenceCount(void) const { + return m_references.size(); + } + + const RefVector& BamReaderPrivate::GetReferenceData(void) const { + return m_references; + } + + // returns RefID for given RefName (returns References.size() if not found) + int BamReaderPrivate::GetReferenceID(const string& refName) const { + + // retrieve names from reference data + vector refNames; + RefVector::const_iterator refIter = m_references.begin(); + RefVector::const_iterator refEnd = m_references.end(); + for ( ; refIter != refEnd; ++refIter) + refNames.push_back( (*refIter).RefName ); + + // return 'index-of' refName (or -1 if not found) + int index = distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); + if ( index == (int)m_references.size() ) return -1; + else return index; + } + + bool BamReaderPrivate::HasIndex(void) const { + return m_randomAccessController.HasIndex(); + } + + bool BamReaderPrivate::IsOpen(void) const { + return m_stream.IsOpen(); + } + + // load BAM header data + void BamReaderPrivate::LoadHeaderData(void) { + m_header.Load(&m_stream); + } + + // populates BamAlignment with alignment data under file pointer, returns success/fail + bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) { + + // read in the 'block length' value, make sure it's not zero + char buffer[sizeof(uint32_t)]; + m_stream.Read(buffer, sizeof(uint32_t)); + alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer); + if ( m_isBigEndian ) BamTools::SwapEndian_32(alignment.SupportData.BlockLength); + if ( alignment.SupportData.BlockLength == 0 ) + return false; + + // read in core alignment data, make sure the right size of data was read + char x[Constants::BAM_CORE_SIZE]; + if ( m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE ) + return false; + + // swap core endian-ness if necessary + if ( m_isBigEndian ) { + for ( unsigned int i = 0; i < Constants::BAM_CORE_SIZE; i+=sizeof(uint32_t) ) + BamTools::SwapEndian_32p(&x[i]); + } + + // set BamAlignment 'core' and 'support' data + alignment.RefID = BamTools::UnpackSignedInt(&x[0]); + alignment.Position = BamTools::UnpackSignedInt(&x[4]); + + unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]); + alignment.Bin = tempValue >> 16; + alignment.MapQuality = tempValue >> 8 & 0xff; + alignment.SupportData.QueryNameLength = tempValue & 0xff; + + tempValue = BamTools::UnpackUnsignedInt(&x[12]); + alignment.AlignmentFlag = tempValue >> 16; + alignment.SupportData.NumCigarOperations = tempValue & 0xffff; + + alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]); + alignment.MateRefID = BamTools::UnpackSignedInt(&x[20]); + alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]); + alignment.InsertSize = BamTools::UnpackSignedInt(&x[28]); + + // set BamAlignment length + alignment.Length = alignment.SupportData.QuerySequenceLength; + + // read in character data - make sure proper data size was read + bool readCharDataOK = false; + const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE; + RaiiBuffer allCharData(dataLength); + + if ( m_stream.Read(allCharData.Buffer, dataLength) == dataLength ) { + + // store 'allCharData' in supportData structure + alignment.SupportData.AllCharData.assign((const char*)allCharData.Buffer, dataLength); + + // set success flag + readCharDataOK = true; + + // save CIGAR ops + // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly, + // even when GetNextAlignmentCore() is called + const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength; + uint32_t* cigarData = (uint32_t*)(allCharData.Buffer + cigarDataOffset); + CigarOp op; + alignment.CigarData.clear(); + alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations); + for ( unsigned int i = 0; i < alignment.SupportData.NumCigarOperations; ++i ) { + + // swap endian-ness if necessary + if ( m_isBigEndian ) BamTools::SwapEndian_32(cigarData[i]); + + // build CigarOp structure + op.Length = (cigarData[i] >> Constants::BAM_CIGAR_SHIFT); + op.Type = Constants::BAM_CIGAR_LOOKUP[ (cigarData[i] & Constants::BAM_CIGAR_MASK) ]; + + // save CigarOp + alignment.CigarData.push_back(op); + } + } + + // return success/failure + return readCharDataOK; + } + + // loads reference data from BAM file + bool BamReaderPrivate::LoadReferenceData(void) { + + // get number of reference sequences + char buffer[sizeof(uint32_t)]; + m_stream.Read(buffer, sizeof(uint32_t)); + uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer); + if ( m_isBigEndian ) BamTools::SwapEndian_32(numberRefSeqs); + m_references.reserve((int)numberRefSeqs); + + // iterate over all references in header + for ( unsigned int i = 0; i != numberRefSeqs; ++i ) { + + // get length of reference name + m_stream.Read(buffer, sizeof(uint32_t)); + uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer); + if ( m_isBigEndian ) BamTools::SwapEndian_32(refNameLength); + RaiiBuffer refName(refNameLength); + + // get reference name and reference sequence length + m_stream.Read(refName.Buffer, refNameLength); + m_stream.Read(buffer, sizeof(int32_t)); + int32_t refLength = BamTools::UnpackSignedInt(buffer); + if ( m_isBigEndian ) BamTools::SwapEndian_32(refLength); + + // store data for reference + RefData aReference; + aReference.RefName = (string)((const char*)refName.Buffer); + aReference.RefLength = refLength; + m_references.push_back(aReference); + } + + // return success + return true; + } + + bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) { + + if ( m_randomAccessController.LocateIndex(this, preferredType) ) + return true; + else { + const string bracError = m_randomAccessController.GetErrorString(); + const string message = string("could not locate index: \n\t") + bracError; + SetErrorString("BamReader::LocateIndex", message); + return false; + } + } + + // opens BAM file (and index) + bool BamReaderPrivate::Open(const string& filename) { + + try { + + // make sure we're starting with fresh state + Close(); + + // open BgzfStream + m_stream.Open(filename, IBamIODevice::ReadOnly); - assert(m_stream); + + // load BAM metadata + LoadHeaderData(); + LoadReferenceData(); + + // store filename & offset of first alignment + m_filename = filename; + m_alignmentsBeginOffset = m_stream.Tell(); + + // return success + return true; + + } catch ( BamException& e ) { + const string error = e.what(); + const string message = string("could not open file: ") + filename + + "\n\t" + error; + SetErrorString("BamReader::Open", message); + return false; + } + } + + bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) { + + if ( m_randomAccessController.OpenIndex(indexFilename, this) ) + return true; + else { + const string bracError = m_randomAccessController.GetErrorString(); + const string message = string("could not open index: \n\t") + bracError; + SetErrorString("BamReader::OpenIndex", message); + return false; + } + } + + // returns BAM file pointer to beginning of alignment data + bool BamReaderPrivate::Rewind(void) { + + // reset region + m_randomAccessController.ClearRegion(); + + // return status of seeking back to first alignment + if ( Seek(m_alignmentsBeginOffset) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("could not rewind: \n\t") + currentError; + SetErrorString("BamReader::Rewind", message); + return false; + } + } + + bool BamReaderPrivate::Seek(const int64_t& position) { + + // skip if BAM file not open + if ( !IsOpen() ) { + SetErrorString("BamReader::Seek", "cannot seek on unopened BAM file"); + return false; + } + + try { + m_stream.Seek(position); + return true; + } + catch ( BamException& e ) { + const string streamError = e.what(); + const string message = string("could not seek in BAM file: \n\t") + streamError; + SetErrorString("BamReader::Seek", message); + return false; + } + } + + void BamReaderPrivate::SetErrorString(const string& where, const string& what) { + static const string SEPARATOR = ": "; + m_errorString = where + SEPARATOR + what; + } + + void BamReaderPrivate::SetIndex(BamIndex* index) { + m_randomAccessController.SetIndex(index); + } + + // sets current region & attempts to jump to it + // returns success/failure + bool BamReaderPrivate::SetRegion(const BamRegion& region) { + + if ( m_randomAccessController.SetRegion(region, m_references.size()) ) + return true; + else { + const string bracError = m_randomAccessController.GetErrorString(); + const string message = string("could not set region: \n\t") + bracError; + SetErrorString("BamReader::SetRegion", message); + return false; + } + } + + int64_t BamReaderPrivate::Tell(void) const { + return m_stream.Tell(); + } diff --cc src/api/internal/index/BamStandardIndex_p.h index 0000000,8322e5f..273d56e mode 000000,100644..100644 --- a/src/api/internal/index/BamStandardIndex_p.h +++ b/src/api/internal/index/BamStandardIndex_p.h @@@ -1,0 -1,236 +1,237 @@@ + // *************************************************************************** + // BamStandardIndex.h (c) 2010 Derek Barnett + // Marth Lab, Department of Biology, Boston College + // --------------------------------------------------------------------------- + // Last modified: 10 November 2011 (DB) + // --------------------------------------------------------------------------- + // Provides index operations for the standardized BAM index format (".bai") + // *************************************************************************** + + #ifndef BAM_STANDARD_INDEX_FORMAT_H + #define BAM_STANDARD_INDEX_FORMAT_H + + // ------------- + // W A R N I N G + // ------------- + // + // This file is not part of the BamTools API. It exists purely as an + // implementation detail. This header file may change from version to + // version without notice, or even be removed. + // + // We mean it. + + #include "api/BamAux.h" + #include "api/BamIndex.h" + #include "api/IBamIODevice.h" + #include + #include + #include + #include + + namespace BamTools { + namespace Internal { + + // ----------------------------------------------------------------------------- + // BamStandardIndex data structures + + // defines start and end of a contiguous run of alignments + struct BaiAlignmentChunk { + + // data members + uint64_t Start; + uint64_t Stop; + + // constructor + BaiAlignmentChunk(const uint64_t& start = 0, + const uint64_t& stop = 0) + : Start(start) + , Stop(stop) + { } + }; + + // comparison operator (for sorting) + inline + bool operator<(const BaiAlignmentChunk& lhs, const BaiAlignmentChunk& rhs) { + return lhs.Start < rhs.Start; + } + + // convenience typedef for a list of all alignment 'chunks' in a BAI bin + typedef std::vector BaiAlignmentChunkVector; + + // convenience typedef for a map of all BAI bins in a reference (ID => chunks) + typedef std::map BaiBinMap; + + // convenience typedef for a list of all 'linear offsets' in a reference + typedef std::vector BaiLinearOffsetVector; + + // contains all fields necessary for building, loading, & writing + // full BAI index data for a single reference + struct BaiReferenceEntry { + + // data members + int32_t ID; + BaiBinMap Bins; + BaiLinearOffsetVector LinearOffsets; + + // ctor + BaiReferenceEntry(const int32_t& id = -1) + : ID(id) + { } + }; + + // provides (persistent) summary of BaiReferenceEntry's index data + struct BaiReferenceSummary { + + // data members + int NumBins; + int NumLinearOffsets; + uint64_t FirstBinFilePosition; + uint64_t FirstLinearOffsetFilePosition; + + // ctor + BaiReferenceSummary(void) + : NumBins(0) + , NumLinearOffsets(0) + , FirstBinFilePosition(0) + , FirstLinearOffsetFilePosition(0) + { } + }; + + // convenience typedef for describing a full BAI index file summary + typedef std::vector BaiFileSummary; + + // end BamStandardIndex data structures + // ----------------------------------------------------------------------------- + + class BamStandardIndex : public BamIndex { + + // ctor & dtor + public: + BamStandardIndex(Internal::BamReaderPrivate* reader); + ~BamStandardIndex(void); + + // BamIndex implementation + public: + // builds index from associated BAM file & writes out to index file + bool Create(void); + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // attempts to use index data to jump to @region, returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // loads existing data from file into memory + bool Load(const std::string& filename); ++ BamIndex::IndexType Type(void) const { return BamIndex::STANDARD; } + public: + // returns format's file extension + static const std::string Extension(void); + + // internal methods + private: + + // index file ops + void CheckMagicNumber(void); + void CloseFile(void); + bool IsDeviceOpen(void) const; + void OpenFile(const std::string& filename, IBamIODevice::OpenMode mode); + void Seek(const int64_t& position, const int origin); + int64_t Tell(void) const; + + // BAI index building methods + void ClearReferenceEntry(BaiReferenceEntry& refEntry); + void SaveAlignmentChunkToBin(BaiBinMap& binMap, + const uint32_t& currentBin, + const uint64_t& currentOffset, + const uint64_t& lastOffset); + void SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets, + const int& alignmentStartPosition, + const int& alignmentStopPosition, + const uint64_t& lastOffset); + + // random-access methods + void AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end); + void CalculateCandidateBins(const uint32_t& begin, + const uint32_t& end, + std::set& candidateBins); + void CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, + const uint64_t& minOffset, + std::set& candidateBins, + std::vector& offsets); + uint64_t CalculateMinOffset(const BaiReferenceSummary& refSummary, const uint32_t& begin); + void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + uint64_t LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index); + + // BAI summary (create/load) methods + void ReserveForSummary(const int& numReferences); + void SaveBinsSummary(const int& refId, const int& numBins); + void SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets); + void SkipBins(const int& numBins); + void SkipLinearOffsets(const int& numLinearOffsets); + void SummarizeBins(BaiReferenceSummary& refSummary); + void SummarizeIndexFile(void); + void SummarizeLinearOffsets(BaiReferenceSummary& refSummary); + void SummarizeReference(BaiReferenceSummary& refSummary); + + // BAI full index input methods + void ReadBinID(uint32_t& binId); + void ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks); + void ReadIntoBuffer(const unsigned int& bytesRequested); + void ReadLinearOffset(uint64_t& linearOffset); + void ReadNumAlignmentChunks(int& numAlignmentChunks); + void ReadNumBins(int& numBins); + void ReadNumLinearOffsets(int& numLinearOffsets); + void ReadNumReferences(int& numReferences); + + // BAI full index output methods + void MergeAlignmentChunks(BaiAlignmentChunkVector& chunks); + void SortLinearOffsets(BaiLinearOffsetVector& linearOffsets); + void WriteAlignmentChunk(const BaiAlignmentChunk& chunk); + void WriteAlignmentChunks(BaiAlignmentChunkVector& chunks); + void WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks); + void WriteBins(const int& refId, BaiBinMap& bins); + void WriteHeader(void); + void WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets); + void WriteReferenceEntry(BaiReferenceEntry& refEntry); + + // data members + private: + bool m_isBigEndian; + BaiFileSummary m_indexFileSummary; + + // our input buffer + unsigned int m_bufferLength; + struct RaiiWrapper { + IBamIODevice* Device; + char* Buffer; + RaiiWrapper(void); + ~RaiiWrapper(void); + }; + RaiiWrapper m_resources; + + // static methods + private: + // checks if the buffer is large enough to accomodate the requested size + static void CheckBufferSize(char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes); + // checks if the buffer is large enough to accomodate the requested size + static void CheckBufferSize(unsigned char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes); + // static constants + private: + static const int MAX_BIN; + static const int BAM_LIDX_SHIFT; + static const std::string BAI_EXTENSION; + static const char* const BAI_MAGIC; + static const int SIZEOF_ALIGNMENTCHUNK; + static const int SIZEOF_BINCORE; + static const int SIZEOF_LINEAROFFSET; + }; + + } // namespace Internal + } // namespace BamTools + + #endif // BAM_STANDARD_INDEX_FORMAT_H diff --cc src/api/internal/index/BamToolsIndex_p.h index 0000000,7a66f39..c1e1aa0 mode 000000,100644..100644 --- a/src/api/internal/index/BamToolsIndex_p.h +++ b/src/api/internal/index/BamToolsIndex_p.h @@@ -1,0 -1,185 +1,186 @@@ + // *************************************************************************** + // BamToolsIndex.h (c) 2010 Derek Barnett + // Marth Lab, Department of Biology, Boston College + // --------------------------------------------------------------------------- + // Last modified: 10 November 2011 (DB) + // --------------------------------------------------------------------------- + // Provides index operations for the BamTools index format (".bti") + // *************************************************************************** + + #ifndef BAMTOOLS_INDEX_FORMAT_H + #define BAMTOOLS_INDEX_FORMAT_H + + // ------------- + // W A R N I N G + // ------------- + // + // This file is not part of the BamTools API. It exists purely as an + // implementation detail. This header file may change from version to + // version without notice, or even be removed. + // + // We mean it. + + #include "api/BamAux.h" + #include "api/BamIndex.h" + #include "api/IBamIODevice.h" + #include + #include + #include + + namespace BamTools { + namespace Internal { + + // contains data for each 'block' in a BTI index + struct BtiBlock { + + // data members + int32_t MaxEndPosition; + int64_t StartOffset; + int32_t StartPosition; + + // ctor + BtiBlock(const int32_t& maxEndPosition = 0, + const int64_t& startOffset = 0, + const int32_t& startPosition = 0) + : MaxEndPosition(maxEndPosition) + , StartOffset(startOffset) + , StartPosition(startPosition) + { } + }; + + // convenience typedef for describing a a list of BTI blocks on a reference + typedef std::vector BtiBlockVector; + + // contains all fields necessary for building, loading, & writing + // full BTI index data for a single reference + struct BtiReferenceEntry { + + // data members + int32_t ID; + BtiBlockVector Blocks; + + // ctor + BtiReferenceEntry(const int& id = -1) + : ID(id) + { } + }; + + // provides (persistent) summary of BtiReferenceEntry's index data + struct BtiReferenceSummary { + + // data members + int NumBlocks; + uint64_t FirstBlockFilePosition; + + // ctor + BtiReferenceSummary(void) + : NumBlocks(0) + , FirstBlockFilePosition(0) + { } + }; + + // convenience typedef for describing a full BTI index file summary + typedef std::vector BtiFileSummary; + + class BamToolsIndex : public BamIndex { + + // keep a list of any supported versions here + // (might be useful later to handle any 'legacy' versions if the format changes) + // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on + // + // so a change introduced in BTI_1_2 may be handled from then on by: + // + // if ( indexVersion >= BTI_1_2 ) + // do something new + // else + // do the old thing + enum Version { BTI_1_0 = 1 + , BTI_1_1 + , BTI_1_2 + , BTI_2_0 + }; + + // ctor & dtor + public: + BamToolsIndex(Internal::BamReaderPrivate* reader); + ~BamToolsIndex(void); + + // BamIndex implementation + public: + // builds index from associated BAM file & writes out to index file + bool Create(void); + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // attempts to use index data to jump to @region, returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // loads existing data from file into memory + bool Load(const std::string& filename); ++ BamIndex::IndexType Type(void) const { return BamIndex::BAMTOOLS; } + public: + // returns format's file extension + static const std::string Extension(void); + + // internal methods + private: + + // index file ops + void CheckMagicNumber(void); + void CheckVersion(void); + void CloseFile(void); + bool IsDeviceOpen(void) const; + void OpenFile(const std::string& filename, IBamIODevice::OpenMode mode); + void Seek(const int64_t& position, const int origin); + int64_t Tell(void) const; + + // index-creation methods + void ClearReferenceEntry(BtiReferenceEntry& refEntry); + void WriteBlock(const BtiBlock& block); + void WriteBlocks(const BtiBlockVector& blocks); + void WriteHeader(void); + void WriteReferenceEntry(const BtiReferenceEntry& refEntry); + + // random-access methods + void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + void ReadBlock(BtiBlock& block); + void ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks); + void ReadReferenceEntry(BtiReferenceEntry& refEntry); + + // BTI summary data methods + void InitializeFileSummary(const int& numReferences); + void LoadFileSummary(void); + void LoadHeader(void); + void LoadNumBlocks(int& numBlocks); + void LoadNumReferences(int& numReferences); + void LoadReferenceSummary(BtiReferenceSummary& refSummary); + void SkipBlocks(const int& numBlocks); + + // data members + private: + bool m_isBigEndian; + BtiFileSummary m_indexFileSummary; + uint32_t m_blockSize; + int32_t m_inputVersion; // Version is serialized as int + Version m_outputVersion; + + struct RaiiWrapper { + IBamIODevice* Device; + RaiiWrapper(void); + ~RaiiWrapper(void); + }; + RaiiWrapper m_resources; + + // static constants + private: + static const uint32_t DEFAULT_BLOCK_LENGTH; + static const std::string BTI_EXTENSION; + static const char* const BTI_MAGIC; + static const int SIZEOF_BLOCK; + }; + + } // namespace Internal + } // namespace BamTools + + #endif // BAMTOOLS_INDEX_FORMAT_H