// BamConstants.h (c) 2011 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
- // Last modified: 5 October 2011 (DB)
-// Last modified: 9 September 2011 (DB)
++// Last modified: 7 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides basic constants for handling BAM files.
// ***************************************************************************
SamReadGroupDictionary.cpp
SamSequence.cpp
SamSequenceDictionary.cpp
+ internal/BamDeviceFactory_p.cpp
+ internal/BamException_p.cpp
+ internal/BamFile_p.cpp
+ internal/BamFtp_p.cpp
internal/BamHeader_p.cpp
+ internal/BamHttp_p.cpp
internal/BamIndexFactory_p.cpp
internal/BamMultiReader_p.cpp
+ internal/BamPipe_p.cpp
internal/BamRandomAccessController_p.cpp
internal/BamReader_p.cpp
internal/BamStandardIndex_p.cpp
// BamHeader_p.cpp (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
- // Last modified: 6 October 2011 (DB)
-// Last modified: 21 March 2011 (DB)
++// Last modified: 7 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides the basic functionality for handling BAM headers.
// ***************************************************************************
// BamRandomAccessController_p.cpp (c) 2011 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
- // Last modified: 6 October 2011(DB)
-// Last modified: 5 April 2011(DB)
++// Last modified: 7 October 2011(DB)
// ---------------------------------------------------------------------------
// Manages random access operations in a BAM file
// **************************************************************************
#include <api/BamConstants.h>
#include <api/BamReader.h>
+ #include <api/IBamIODevice.h>
+ #include <api/internal/BamDeviceFactory_p.h>
+#include <api/internal/BamException_p.h>
#include <api/internal/BamHeader_p.h>
#include <api/internal/BamRandomAccessController_p.h>
#include <api/internal/BamReader_p.h>
// useful for operations requiring ONLY positional or other alignment-related information
bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) {
- // skip if region is set but has no alignments
- if ( m_randomAccessController.HasRegion() &&
- !m_randomAccessController.RegionHasAlignments() )
- {
- return false;
- }
-
- // if can't read next alignment
- if ( !LoadNextAlignment(alignment) )
- return false;
-
- // check alignment's region-overlap state
- BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment);
-
- // if alignment starts after region, no need to keep reading
- if ( state == BamRandomAccessController::AfterRegion )
- return false;
++ // skip if stream not opened
+ if ( !m_stream.IsOpen() )
+ return false;
+
+ try {
- // read until overlap is found
- while ( state != BamRandomAccessController::OverlapsRegion ) {
+ // skip if region is set but has no alignments
+ if ( m_randomAccessController.HasRegion() &&
+ !m_randomAccessController.RegionHasAlignments() )
+ {
+ return false;
+ }
// if can't read next alignment
if ( !LoadNextAlignment(alignment) )
// opens BAM file (and index)
// Added side of this hunk: resets previous state with Close(), opens m_stream on
// filename in rb mode, loads the header and the reference data, then stores the
// filename and the offset reported by m_stream.Tell() as the start of the alignments.
// Returns true on success. A BamException raised on the way is caught here: its
// what() text is appended to a could-not-open-file message handed to SetErrorString(),
// and false is returned.
// NOTE(review): no index is opened in the body visible here, despite the summary line - confirm.
bool BamReaderPrivate::Open(const string& filename) {
- bool result;
- // make sure we're starting with a fresh slate
- Close();
--
- // attempt to open BgzfStream for reading
- if ( !m_stream.Open(filename, IBamIODevice::ReadOnly) ) {
- cerr << "BamReader ERROR: Could not open BGZF stream for " << filename << endl;
- return false;
- }
+ try {
- // attempt to load header data
- if ( !LoadHeaderData() ) {
- cerr << "BamReader ERROR: Could not load header data for " << filename << endl;
+ // make sure we're starting with fresh state
Close();
- return false;
- }
- // attempt to load reference data
- if ( !LoadReferenceData() ) {
- cerr << "BamReader ERROR: Could not load reference data for " << filename << endl;
- Close();
- return false;
- }
+ // open BgzfStream
+ m_stream.Open(filename, "rb");
// NOTE(review): m_stream is used as an object in this function (m_stream.Open, m_stream.Tell);
// confirm it converts to bool, otherwise this assert should test m_stream.IsOpen() instead.
+ assert(m_stream);
- // if all OK, store filename & offset of first alignment
- m_filename = filename;
- m_alignmentsBeginOffset = m_stream.Tell();
+ // load BAM metadata
+ LoadHeaderData();
+ LoadReferenceData();
- // return success
- return true;
+ // store filename & offset of first alignment
+ m_filename = filename;
+ m_alignmentsBeginOffset = m_stream.Tell();
+
- // set flag
- result = true;
++ // return success
++ return true;
+
+ } catch ( BamException& e ) {
+ const string error = e.what();
+ const string message = string("could not open file: ") + filename +
+ "\n\t" + error;
+ SetErrorString("BamReader::Open", message);
+ return false;
+ }
-
- // return success/failure
- return result;
}
bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) {
#include <api/BamAlignment.h>
#include <api/BamConstants.h>
+ #include <api/IBamIODevice.h>
+#include <api/internal/BamException_p.h>
#include <api/internal/BamWriter_p.h>
using namespace BamTools;
using namespace BamTools::Internal;
}
}
+// returns a description of the last error that occurred
// The text is whatever was last stored in m_errorString; the catch handlers in
// this file assign it from BamException::what().
+std::string BamWriterPrivate::GetErrorString(void) const {
+ return m_errorString;
+}
+
// returns whether BAM file is open for writing or not
// Pure delegation to m_stream: the removed line read an IsOpen member directly,
// the added line calls the IsOpen() accessor instead.
bool BamWriterPrivate::IsOpen(void) const {
- return m_stream.IsOpen;
+ return m_stream.IsOpen();
}
// opens the alignment archive
const string& samHeaderText,
const RefVector& referenceSequences)
{
++<<<<<<< HEAD
+ try {
++=======
+ // open the BGZF file for writing, return failure if error
+ if ( !m_stream.Open(filename, IBamIODevice::WriteOnly) )
+ return false;
++>>>>>>> iodevice
- // write BAM file 'metadata' components
- WriteMagicNumber();
- WriteSamHeaderText(samHeaderText);
- WriteReferences(referenceSequences);
- return true;
-}
+ // open the BGZF file for writing, return failure if error
+ m_stream.Open(filename, "wb");
-// saves the alignment to the alignment archive
-void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
-
- // if BamAlignment contains only the core data and a raw char data buffer
- // (as a result of BamReader::GetNextAlignmentCore())
- if ( al.SupportData.HasCoreOnly ) {
-
- // write the block size
- unsigned int blockSize = al.SupportData.BlockLength;
- if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
- m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
-
- // re-calculate bin (in case BamAlignment's position has been previously modified)
- const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
-
- // assign the BAM core data
- uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
- buffer[0] = al.RefID;
- buffer[1] = al.Position;
- buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
- buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
- buffer[4] = al.SupportData.QuerySequenceLength;
- buffer[5] = al.MateRefID;
- buffer[6] = al.MatePosition;
- buffer[7] = al.InsertSize;
-
- // swap BAM core endian-ness, if necessary
- if ( m_isBigEndian ) {
- for ( int i = 0; i < 8; ++i )
- BamTools::SwapEndian_32(buffer[i]);
- }
+ // write BAM file 'metadata' components
+ WriteMagicNumber();
+ WriteSamHeaderText(samHeaderText);
+ WriteReferences(referenceSequences);
- // write the BAM core
- m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
+ // return success
+ return true;
- // write the raw char data
- m_stream.Write((char*)al.SupportData.AllCharData.data(),
- al.SupportData.BlockLength-Constants::BAM_CORE_SIZE);
+ } catch ( BamException& e ) {
+ m_errorString = e.what();
+ return false;
}
+}
- // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
- // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
- else {
-
- // calculate char lengths
- const unsigned int nameLength = al.Name.size() + 1;
- const unsigned int numCigarOperations = al.CigarData.size();
- const unsigned int queryLength = al.QueryBases.size();
- const unsigned int tagDataLength = al.TagData.size();
-
- // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
- // force calculation of Bin before storing
- const int endPosition = al.GetEndPosition();
- const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
-
- // create our packed cigar string
- string packedCigar;
- CreatePackedCigar(al.CigarData, packedCigar);
- const unsigned int packedCigarLength = packedCigar.size();
-
- // encode the query
- string encodedQuery;
- EncodeQuerySequence(al.QueryBases, encodedQuery);
- const unsigned int encodedQueryLength = encodedQuery.size();
-
- // write the block size
- const unsigned int dataBlockSize = nameLength +
- packedCigarLength +
- encodedQueryLength +
- queryLength +
- tagDataLength;
- unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
- if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
- m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
-
- // assign the BAM core data
- uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
- buffer[0] = al.RefID;
- buffer[1] = al.Position;
- buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
- buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
- buffer[4] = queryLength;
- buffer[5] = al.MateRefID;
- buffer[6] = al.MatePosition;
- buffer[7] = al.InsertSize;
-
- // swap BAM core endian-ness, if necessary
- if ( m_isBigEndian ) {
- for ( int i = 0; i < 8; ++i )
- BamTools::SwapEndian_32(buffer[i]);
- }
-
- // write the BAM core
- m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
-
- // write the query name
- m_stream.Write(al.Name.c_str(), nameLength);
-
- // write the packed cigar
- if ( m_isBigEndian ) {
- char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
- memcpy(cigarData, packedCigar.data(), packedCigarLength);
- if ( m_isBigEndian ) {
- for ( unsigned int i = 0; i < packedCigarLength; ++i )
- BamTools::SwapEndian_32p(&cigarData[i]);
- }
- m_stream.Write(cigarData, packedCigarLength);
- free(cigarData);
- }
- else
- m_stream.Write(packedCigar.data(), packedCigarLength);
+// saves the alignment to the alignment archive
// Added side of this hunk: dispatches to WriteCoreAlignment() when
// al.SupportData.HasCoreOnly is set and to WriteAlignment() otherwise.
// Returns true when the chosen writer finishes; a BamException is caught here,
// its what() text is stored in m_errorString, and false is returned.
+bool BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
- // write the encoded query sequence
- m_stream.Write(encodedQuery.data(), encodedQueryLength);
+ try {
- // write the base qualities
- char* pBaseQualities = (char*)al.Qualities.data();
- for ( unsigned int i = 0; i < queryLength; ++i )
- pBaseQualities[i] -= 33; // FASTQ conversion
- m_stream.Write(pBaseQualities, queryLength);
+ // if BamAlignment contains only the core data and a raw char data buffer
+ // (as a result of BamReader::GetNextAlignmentCore())
+ if ( al.SupportData.HasCoreOnly )
+ WriteCoreAlignment(al);
- // write the read group tag
- if ( m_isBigEndian ) {
+ // otherwise, BamAlignment should contain character data in the standard fields: Name, QueryBases, etc
+ // (resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code)
+ else WriteAlignment(al);
- char* tagData = (char*)calloc(sizeof(char), tagDataLength);
- memcpy(tagData, al.TagData.data(), tagDataLength);
+ // if we get here, everything OK
+ return true;
- int i = 0;
- while ( (unsigned int)i < tagDataLength ) {
+ } catch ( BamException& e ) {
+ m_errorString = e.what();
+ return false;
+ }
+}
- i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
- const char type = tagData[i]; // get tag type at position i
- ++i;
// Forwards the requested compression flag to m_stream, but only while the writer
// is not open. When IsOpen() is true the call does nothing and reports nothing.
+void BamWriterPrivate::SetWriteCompressed(bool ok) {
+ // modifying compression is not allowed if BAM file is open
+ if ( !IsOpen() )
+ m_stream.SetWriteCompressed(ok);
+}
- switch ( type ) {
// Writes one alignment from its standard fields (Name, CigarData, QueryBases,
// Qualities, TagData) to m_stream. In order: the block size, the eight 32-bit core
// words, the null-terminated name, the packed CIGAR, the encoded query sequence,
// the base qualities and finally the raw tag data. When m_isBigEndian is set the
// block size, the core words, the CIGAR buffer and the numeric tag values are
// byte-swapped before being written.
// Throws BamException when a tag type or a binary array element type is not recognised.
// The lines prefixed with a minus sign in this hunk belong to the removed version
// of SaveAlignment and are interleaved with the added function.
+void BamWriterPrivate::WriteAlignment(const BamAlignment& al) {
+
+ // calculate char lengths
+ const unsigned int nameLength = al.Name.size() + 1;
+ const unsigned int numCigarOperations = al.CigarData.size();
+ const unsigned int queryLength = al.QueryBases.size();
+ const unsigned int tagDataLength = al.TagData.size();
+
+ // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
+ // force calculation of Bin before storing
+ const int endPosition = al.GetEndPosition();
+ const uint32_t alignmentBin = CalculateMinimumBin(al.Position, endPosition);
+
+ // create our packed cigar string
+ string packedCigar;
+ CreatePackedCigar(al.CigarData, packedCigar);
+ const unsigned int packedCigarLength = packedCigar.size();
+
+ // encode the query
+ string encodedQuery;
+ EncodeQuerySequence(al.QueryBases, encodedQuery);
+ const unsigned int encodedQueryLength = encodedQuery.size();
+
+ // write the block size
+ const unsigned int dataBlockSize = nameLength +
+ packedCigarLength +
+ encodedQueryLength +
+ queryLength +
+ tagDataLength;
+ unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
+ if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
+ m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
+
+ // assign the BAM core data
+ uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
+ buffer[0] = al.RefID;
+ buffer[1] = al.Position;
+ buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
+ buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
+ buffer[4] = queryLength;
+ buffer[5] = al.MateRefID;
+ buffer[6] = al.MatePosition;
+ buffer[7] = al.InsertSize;
+
+ // swap BAM core endian-ness, if necessary
+ if ( m_isBigEndian ) {
+ for ( int i = 0; i < 8; ++i )
+ BamTools::SwapEndian_32(buffer[i]);
+ }
- case(Constants::BAM_TAG_TYPE_ASCII) :
- case(Constants::BAM_TAG_TYPE_INT8) :
- case(Constants::BAM_TAG_TYPE_UINT8) :
- ++i;
- break;
-
- case(Constants::BAM_TAG_TYPE_INT16) :
- case(Constants::BAM_TAG_TYPE_UINT16) :
- BamTools::SwapEndian_16p(&tagData[i]);
- i += sizeof(uint16_t);
- break;
-
- case(Constants::BAM_TAG_TYPE_FLOAT) :
- case(Constants::BAM_TAG_TYPE_INT32) :
- case(Constants::BAM_TAG_TYPE_UINT32) :
- BamTools::SwapEndian_32p(&tagData[i]);
- i += sizeof(uint32_t);
- break;
-
- case(Constants::BAM_TAG_TYPE_HEX) :
- case(Constants::BAM_TAG_TYPE_STRING) :
- // no endian swapping necessary for hex-string/string data
- while ( tagData[i] )
- ++i;
- // increment one more for null terminator
- ++i;
- break;
+ // write the BAM core
+ m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
- case(Constants::BAM_TAG_TYPE_ARRAY) :
+ // write the query name
+ m_stream.Write(al.Name.c_str(), nameLength);
- {
- // read array type
- const char arrayType = tagData[i];
+ // write the packed cigar
+ if ( m_isBigEndian ) {
+ char* cigarData = new char[packedCigarLength]();
+ memcpy(cigarData, packedCigar.data(), packedCigarLength);
+ if ( m_isBigEndian ) {
// NOTE(review): the index advances one byte per iteration while SwapEndian_32p is
// handed every byte address; presumably the swap should step one 32-bit word at a
// time, and the last iterations look like they reach past the buffer - confirm
// against the SwapEndian_32p implementation.
+ for ( size_t i = 0; i < packedCigarLength; ++i )
+ BamTools::SwapEndian_32p(&cigarData[i]);
+ }
+ m_stream.Write(cigarData, packedCigarLength);
+ delete[] cigarData; // TODO: cleanup on Write exception thrown?
+ }
+ else
+ m_stream.Write(packedCigar.data(), packedCigarLength);
+
+ // write the encoded query sequence
+ m_stream.Write(encodedQuery.data(), encodedQueryLength);
+
+ // write the base qualities
// NOTE(review): the cast drops the const of al.Qualities, and the loop subtracts 33
// from that storage in place, so the alignment passed by const reference is
// modified by this call.
+ char* pBaseQualities = (char*)al.Qualities.data();
+ for ( size_t i = 0; i < queryLength; ++i )
+ pBaseQualities[i] -= 33; // FASTQ conversion
+ m_stream.Write(pBaseQualities, queryLength);
+
+ // write the tag data (the whole TagData buffer, not only a read group tag)
+ if ( m_isBigEndian ) {
+
+ char* tagData = new char[tagDataLength]();
+ memcpy(tagData, al.TagData.data(), tagDataLength);
+
// walk the copied buffer tag by tag, swapping numeric payloads in place
+ size_t i = 0;
+ while ( i < tagDataLength ) {
+
+ i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
+ const char type = tagData[i]; // get tag type at position i
+ ++i;
+
+ switch ( type ) {
+
+ case(Constants::BAM_TAG_TYPE_ASCII) :
+ case(Constants::BAM_TAG_TYPE_INT8) :
+ case(Constants::BAM_TAG_TYPE_UINT8) :
+ ++i;
+ break;
+
+ case(Constants::BAM_TAG_TYPE_INT16) :
+ case(Constants::BAM_TAG_TYPE_UINT16) :
+ BamTools::SwapEndian_16p(&tagData[i]);
+ i += sizeof(uint16_t);
+ break;
+
+ case(Constants::BAM_TAG_TYPE_FLOAT) :
+ case(Constants::BAM_TAG_TYPE_INT32) :
+ case(Constants::BAM_TAG_TYPE_UINT32) :
+ BamTools::SwapEndian_32p(&tagData[i]);
+ i += sizeof(uint32_t);
+ break;
+
+ case(Constants::BAM_TAG_TYPE_HEX) :
+ case(Constants::BAM_TAG_TYPE_STRING) :
+ // no endian swapping necessary for hex-string/string data
+ while ( tagData[i] )
++i;
-
- // swap endian-ness of number of elements in place, then retrieve for loop
- BamTools::SwapEndian_32p(&tagData[i]);
- int32_t numElements;
- memcpy(&numElements, &tagData[i], sizeof(uint32_t));
- i += sizeof(uint32_t);
-
- // swap endian-ness of array elements
- for ( int j = 0; j < numElements; ++j ) {
- switch (arrayType) {
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- // no endian-swapping necessary
- ++i;
- break;
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- BamTools::SwapEndian_16p(&tagData[i]);
- i += sizeof(uint16_t);
- break;
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- BamTools::SwapEndian_32p(&tagData[i]);
- i += sizeof(uint32_t);
- break;
- default:
- // error case
- fprintf(stderr,
- "BamWriter ERROR: unknown binary array type encountered: [%c]\n",
- arrayType);
- exit(1);
- }
+ // increment one more for null terminator
+ ++i;
+ break;
+
+ case(Constants::BAM_TAG_TYPE_ARRAY) :
+
+ {
+ // read array type
+ const char arrayType = tagData[i];
+ ++i;
+
+ // swap endian-ness of number of elements in place, then retrieve for loop
+ BamTools::SwapEndian_32p(&tagData[i]);
+ int32_t numElements;
+ memcpy(&numElements, &tagData[i], sizeof(uint32_t));
+ i += sizeof(uint32_t);
+
+ // swap endian-ness of array elements
+ for ( int j = 0; j < numElements; ++j ) {
+ switch (arrayType) {
+ case (Constants::BAM_TAG_TYPE_INT8) :
+ case (Constants::BAM_TAG_TYPE_UINT8) :
+ // no endian-swapping necessary
+ ++i;
+ break;
+ case (Constants::BAM_TAG_TYPE_INT16) :
+ case (Constants::BAM_TAG_TYPE_UINT16) :
+ BamTools::SwapEndian_16p(&tagData[i]);
+ i += sizeof(uint16_t);
+ break;
+ case (Constants::BAM_TAG_TYPE_FLOAT) :
+ case (Constants::BAM_TAG_TYPE_INT32) :
+ case (Constants::BAM_TAG_TYPE_UINT32) :
+ BamTools::SwapEndian_32p(&tagData[i]);
+ i += sizeof(uint32_t);
+ break;
+ default:
// unknown element type: release the scratch copy before throwing
+ delete[] tagData;
+ const string message = string("invalid binary array type: ") + arrayType;
+ throw BamException("BamWriter::SaveAlignment", message);
}
-
- break;
}
- default :
- fprintf(stderr, "BamWriter ERROR: invalid tag value type\n"); // shouldn't get here
- free(tagData);
- exit(1);
+ break;
}
+
+ default :
// unknown tag type: release the scratch copy before throwing
+ delete[] tagData;
+ const string message = string("invalid tag type: ") + type;
+ throw BamException("BamWriter::SaveAlignment", message);
}
- m_stream.Write(tagData, tagDataLength);
- free(tagData);
}
- else
- m_stream.Write(al.TagData.data(), tagDataLength);
+
+ m_stream.Write(tagData, tagDataLength);
+ delete[] tagData; // TODO: cleanup on Write exception thrown?
}
+ else
+ m_stream.Write(al.TagData.data(), tagDataLength);
}
-void BamWriterPrivate::SetWriteCompressed(bool ok) {
-
- // warn if BAM file is already open
- // modifying compression is not allowed in this case
- if ( IsOpen() ) {
- cerr << "BamWriter WARNING: attempting to change compression mode on an open BAM file is not allowed. "
- << "Ignoring request." << endl;
- return;
// Writes an alignment that carries only core data plus a raw character buffer.
// The block size is taken from al.SupportData.BlockLength, the bin is recomputed
// from the current position and end position, the eight 32-bit core words are
// rebuilt from the alignment fields and its SupportData counters, and the bytes of
// SupportData.AllCharData are written unchanged after the core.
// When m_isBigEndian is set, the block size and the core words are byte-swapped first.
+void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) {
+
+ // write the block size
+ unsigned int blockSize = al.SupportData.BlockLength;
+ if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
+ m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
+
+ // re-calculate bin (in case BamAlignment's position has been previously modified)
+ const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
+
+ // assign the BAM core data
+ uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
+ buffer[0] = al.RefID;
+ buffer[1] = al.Position;
+ buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
+ buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
+ buffer[4] = al.SupportData.QuerySequenceLength;
+ buffer[5] = al.MateRefID;
+ buffer[6] = al.MatePosition;
+ buffer[7] = al.InsertSize;
+
+ // swap BAM core endian-ness, if necessary
+ if ( m_isBigEndian ) {
+ for ( int i = 0; i < 8; ++i )
+ BamTools::SwapEndian_32(buffer[i]);
}
- // set BgzfStream compression mode
- m_stream.SetWriteCompressed(ok);
+ // write the BAM core
+ m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
+
+ // write the raw char data
// the byte count is the stored block length minus the core size
+ m_stream.Write((char*)al.SupportData.AllCharData.data(),
+ al.SupportData.BlockLength-Constants::BAM_CORE_SIZE);
}
void BamWriterPrivate::WriteMagicNumber(void) {
// BgzfStream_p.cpp (c) 2011 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
- // Last modified: 6 October 2011(DB)
-// Last modified: 9 September 2011(DB)
++// Last modified: 7 October 2011(DB)
// ---------------------------------------------------------------------------
// Based on BGZF routines developed at the Broad Institute.
// Provides the basic functionality for reading & writing BGZF files
// Replaces the old BGZF.* files to avoid clashing with other toolkits
// ***************************************************************************
+ #include <api/internal/BamDeviceFactory_p.h>
+#include <api/internal/BamException_p.h>
#include <api/internal/BgzfStream_p.h>
using namespace BamTools;
using namespace BamTools::Internal;
#include <cstring>
#include <algorithm>
+ #include <iostream>
+#include <sstream>
using namespace std;
-// constructor
-BgzfStream::BgzfStream(void)
- : m_uncompressedBlockSize(Constants::BGZF_DEFAULT_BLOCK_SIZE)
- , m_compressedBlockSize(Constants::BGZF_MAX_BLOCK_SIZE)
- , m_blockLength(0)
- , m_blockOffset(0)
- , m_blockAddress(0)
- , m_uncompressedBlock(NULL)
- , m_compressedBlock(NULL)
- , m_isOpen(false)
- , m_isWriteOnly(false)
- , m_isWriteCompressed(true)
- , m_device(0)
- , m_stream(NULL)
+// ----------------------------
+// RaiiWrapper implementation
+// ----------------------------
+
// Starts with a null Stream and allocates the two block buffers:
// CompressedBlock with BGZF_MAX_BLOCK_SIZE bytes and UncompressedBlock with
// BGZF_DEFAULT_BLOCK_SIZE bytes. The removed lines are the old BgzfStream
// constructor body, which caught std::bad_alloc and exited.
// NOTE(review): if the second new[] throws, the first buffer is not released,
// because the destructor does not run for an object whose constructor did not finish.
+BgzfStream::RaiiWrapper::RaiiWrapper(void)
+ : Stream(0)
{
- try {
- m_compressedBlock = new char[m_compressedBlockSize];
- m_uncompressedBlock = new char[m_uncompressedBlockSize];
- } catch( std::bad_alloc& ba ) {
- fprintf(stderr, "BgzfStream ERROR: unable to allocate memory\n");
- exit(1);
+ CompressedBlock = new char[Constants::BGZF_MAX_BLOCK_SIZE];
+ UncompressedBlock = new char[Constants::BGZF_DEFAULT_BLOCK_SIZE];
+}
+
// Releases both block buffers and nulls the pointers, then flushes and closes
// Stream when it is non-null and resets it to 0.
+BgzfStream::RaiiWrapper::~RaiiWrapper(void) {
+
+ // clean up buffers
+ delete[] CompressedBlock;
+ delete[] UncompressedBlock;
+ CompressedBlock = 0;
+ UncompressedBlock = 0;
+
// the stream is only touched when one was actually opened
+ if ( Stream ) {
+ fflush(Stream);
+ fclose(Stream);
+ Stream = 0;
}
}
- : BlockLength(0)
- , BlockOffset(0)
- , BlockAddress(0)
- , IsOpen(false)
- , IsWriteOnly(false)
- , IsWriteCompressed(true)
+// ---------------------------
+// BgzfStream implementation
+// ---------------------------
+
+// constructor
// Initial state: block length, offset and address are zero, the stream is not
// open and not write-only, write compression is enabled and no IO device is attached.
+BgzfStream::BgzfStream(void)
++ : m_blockLength(0)
++ , m_blockOffset(0)
++ , m_blockAddress(0)
++ , m_isOpen(false)
++ , m_isWriteOnly(false)
++ , m_isWriteCompressed(true)
++ , m_device(0)
+{ }
+
// destructor
// The added side only calls Close(); the removed lines deleted the two block
// buffers directly.
BgzfStream::~BgzfStream(void) {
- if( m_compressedBlock ) delete[] m_compressedBlock;
- if( m_uncompressedBlock ) delete[] m_uncompressedBlock;
+ Close();
+}
+
+// checks BGZF block header
// Returns true only when every field tested below matches its expected constant:
// bytes 0 and 1 against the gzip ID constants, byte 2 against Z_DEFLATED, the
// FLG_FEXTRA bit in byte 3, the unsigned short unpacked at offset 10 against
// BGZF_XLEN, bytes 12 and 13 against the BGZF ID constants, and the unsigned
// short unpacked at offset 14 against BGZF_LEN.
// The highest index passed in is 14, so header must hold at least 15 bytes;
// presumably 16, if UnpackUnsignedShort reads two bytes - confirm.
+bool BgzfStream::CheckBlockHeader(char* header) {
+ return (header[0] == Constants::GZIP_ID1 &&
+ header[1] == Constants::GZIP_ID2 &&
+ header[2] == Z_DEFLATED &&
+ (header[3] & Constants::FLG_FEXTRA) != 0 &&
+ BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN &&
+ header[12] == Constants::BGZF_ID1 &&
+ header[13] == Constants::BGZF_ID2 &&
+ BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN );
}
// closes BGZF file
// if writing to file, flush the current BGZF block,
// then write an empty block (as EOF marker)
- if ( IsWriteOnly ) {
+ if ( m_device->IsOpen() && (m_device->Mode() == IBamIODevice::WriteOnly) ) {
FlushBlock();
- int blockLength = DeflateBlock();
- m_device->Write(m_compressedBlock, blockLength);
+ const size_t blockLength = DeflateBlock();
- fwrite(Resources.CompressedBlock, 1, blockLength, Resources.Stream);
++ m_device->Write(Resources.CompressedBlock, blockLength);
}
- // flush and close stream
+ // close device
+ m_device->Close();
-
- // clean up & reset flags
+ delete m_device;
+ m_device = 0;
- m_isWriteCompressed = true;
++
++ // ??
+ fflush(Resources.Stream);
+ fclose(Resources.Stream);
+ Resources.Stream = 0;
+
- // reset initial state
- BlockLength = 0;
- BlockOffset = 0;
- BlockAddress = 0;
- IsOpen = false;
- IsWriteOnly = false;
- IsWriteCompressed = true;
++ // reset state
++ m_blockLength = 0;
++ m_blockOffset = 0;
++ m_blockAddress = 0;
+ m_isOpen = false;
++ m_isWriteOnly = false;
++ m_isWriteCompressed = true;
++
}
// compresses the current block
buffer[14] = Constants::BGZF_LEN;
// set compression level
- const int compressionLevel = ( IsWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
+ const int compressionLevel = ( m_isWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
// loop to retry for blocks that do not compress enough
- int inputLength = m_blockOffset;
- unsigned int compressedLength = 0;
- unsigned int bufferSize = m_compressedBlockSize;
+ int inputLength = BlockOffset;
+ size_t compressedLength = 0;
+ const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE;
while ( true ) {
BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
// ensure that we have less than a block of data left
- int remaining = BlockOffset - inputLength;
+ int remaining = m_blockOffset - inputLength;
if ( remaining > 0 ) {
- if ( remaining > inputLength ) {
- fprintf(stderr, "BgzfStream ERROR: after deflate, remainder too large\n");
- exit(1);
- }
- memcpy(m_uncompressedBlock, m_uncompressedBlock + inputLength, remaining);
+ if ( remaining > inputLength )
+ throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large");
+ memcpy(Resources.UncompressedBlock, Resources.UncompressedBlock + inputLength, remaining);
}
- // update block data & return compressedlength
- BlockOffset = remaining;
+ // update block data
+ m_blockOffset = remaining;
+
+ // return result
return compressedLength;
}
// flushes the data in the BGZF block
void BgzfStream::FlushBlock(void) {
+ BT_ASSERT_X( m_device, "BgzfStream::FlushBlock() - attempting to flush to null device" );
+
// flush all of the remaining blocks
- while ( BlockOffset > 0 ) {
+ while ( m_blockOffset > 0 ) {
// compress the data block
- unsigned int blockLength = DeflateBlock();
+ const size_t blockLength = DeflateBlock();
-- // flush the data to our output stream
- const size_t numBytesWritten = fwrite(Resources.CompressedBlock, 1, blockLength, Resources.Stream);
- unsigned int numBytesWritten = m_device->Write(m_compressedBlock, blockLength);
++ // flush the data to our output device
++ const size_t numBytesWritten = m_device->Write(Resources.CompressedBlock, blockLength);
if ( numBytesWritten != blockLength ) {
- fprintf(stderr, "BgzfStream ERROR: expected to write %u bytes during flushing, but wrote %u bytes\n",
- blockLength, numBytesWritten);
- exit(1);
+ stringstream s("");
+ s << "expected to write " << blockLength
+ << " bytes during flushing, but wrote " << numBytesWritten;
+ throw BamException("BgzfStream::FlushBlock", s.str());
}
// update block data
return zs.total_out;
}
// Reports whether the stream is usable: false when no IO device is attached,
// otherwise whatever the device itself reports through IsOpen().
+ bool BgzfStream::IsOpen(void) const {
+ if ( m_device == 0 )
+ return false;
+ return m_device->IsOpen();
+ }
+
// Device-based open: closes whatever is currently open, asks BamDeviceFactory
// for a device matching filename and opens it in the requested mode.
// Returns true and records m_isOpen and m_isWriteOnly on success. On failure the
// device error string is printed to cerr and false is returned.
// NOTE(review): on the failure path m_device is left allocated and non-null in
// this function; presumably a later Close() releases it - confirm.
+ bool BgzfStream::Open(const string& filename, const IBamIODevice::OpenMode mode) {
+
+ // close current device if necessary
+ Close();
+
+ // sanity check
+ BT_ASSERT_X( (m_device == 0), "BgzfStream::Open() - unable to properly close previous IO device" );
+
+ // retrieve new IO device depending on filename
+ m_device = BamDeviceFactory::CreateDevice(filename);
+
+ // sanity check
+ BT_ASSERT_X( m_device, "BgzfStream::Open() - unable to create IO device from filename" );
+
+ // if device fails to open
+ if ( !m_device->Open(mode) ) {
+ cerr << "BgzfStream::Open() - unable to open IO device:" << endl;
+ cerr << m_device->ErrorString();
+ return false;
+ }
+
+ // otherwise, set flag & return true
+ m_isOpen = true;
+ m_isWriteOnly = ( mode == IBamIODevice::WriteOnly );
+ return true;
+
+ }
+
// opens the BGZF file (mode is either "rb" for reading, or "wb" for writing)
// Closes the stream first when IsOpen() reports it open, then records the
// direction in m_isWriteOnly. Failures are reported by throwing BamException:
// for a mode other than rb or wb, and when no valid Resources.Stream was obtained.
// On success m_isOpen is set. The added signature returns void, so nothing is
// returned to the caller; the value-returning statement that the merge carried
// over from the bool version has been dropped.
-bool BgzfStream::Open(const string& filename, const char* mode) {
+void BgzfStream::Open(const string& filename, const char* mode) {
- // close current stream, if necessary, before opening next
- if ( m_isOpen ) Close();
+ // make sure we're starting with fresh state
- if ( IsOpen )
++ if ( IsOpen() )
+ Close();
// determine open mode
if ( strcmp(mode, "rb") == 0 )
- IsWriteOnly = false;
+ m_isWriteOnly = false;
else if ( strcmp(mode, "wb") == 0)
- IsWriteOnly = true;
+ m_isWriteOnly = true;
else {
- fprintf(stderr, "BgzfStream ERROR: unknown file mode: %s\n", mode);
- return false;
+ const string message = string("unknown file mode: ") + mode;
+ throw BamException("BgzfStream::Open", message);
}
// open BGZF stream on a file
// open BGZF stream on stdout
else if ( (filename == "stdout" || filename == "-") && (strcmp(mode, "wb") == 0) )
- m_stream = freopen(NULL, mode, stdout);
+ Resources.Stream = freopen(NULL, mode, stdout);
- if ( !m_stream ) {
- fprintf(stderr, "BgzfStream ERROR: unable to open file %s\n", filename.c_str() );
- return false;
+ // ensure valid Stream
+ if ( !Resources.Stream ) {
+ const string message = string("unable to open file: ") + filename;
+ throw BamException("BgzfStream::Open", message);
}
- // set flag
- IsOpen = true;
+ // set flag (success is signalled by returning without throwing)
+ m_isOpen = true;
}
// reads BGZF data into a byte buffer
// Copies up to dataLength uncompressed bytes into data and returns the number of
// bytes actually copied. Returns 0 for an empty request, and also when the device
// is not open or its mode is not ReadOnly. A further block is loaded through
// ReadBlock() whenever the current one has no bytes left, and copying stops early
// when a freshly loaded block is still empty.
// Once the current block is fully consumed, m_blockAddress is refreshed from the
// device position and the block offset and length are reset to zero. Those two
// resets now name m_blockOffset and m_blockLength, the members used everywhere
// else in this function; the merged lines spelled them with a capital B.
-unsigned int BgzfStream::Read(char* data, const unsigned int dataLength) {
+size_t BgzfStream::Read(char* data, const size_t dataLength) {
- // if stream not open for reading (or empty request)
- if ( !IsOpen || IsWriteOnly || dataLength == 0 )
+ if ( dataLength == 0 )
+ return 0;
+
+ // if stream not open for reading
+ BT_ASSERT_X( m_device, "BgzfStream::Read() - trying to read from null device");
+ if ( !m_device->IsOpen() || (m_device->Mode() != IBamIODevice::ReadOnly) )
return 0;
// read blocks as needed until desired data length is retrieved
// read (and decompress) next block if needed
if ( bytesAvailable <= 0 ) {
- if ( !ReadBlock() ) return -1;
+ ReadBlock();
- bytesAvailable = BlockLength - BlockOffset;
+ bytesAvailable = m_blockLength - m_blockOffset;
- if ( bytesAvailable <= 0 ) break;
+ if ( bytesAvailable <= 0 )
+ break;
}
// copy data from uncompressed source buffer into data destination buffer
- char* buffer = m_uncompressedBlock;
- int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
- memcpy(output, buffer + m_blockOffset, copyLength);
+ const size_t copyLength = min( (dataLength-numBytesRead), (size_t)bytesAvailable );
- memcpy(data, Resources.UncompressedBlock + BlockOffset, copyLength);
++ memcpy(data, Resources.UncompressedBlock + m_blockOffset, copyLength);
// update counters
- BlockOffset += copyLength;
- m_blockOffset += copyLength;
- output += copyLength;
- numBytesRead += copyLength;
++ m_blockOffset += copyLength;
+ data += copyLength;
+ numBytesRead += copyLength;
}
// update block data
- if ( BlockOffset == BlockLength ) {
- BlockAddress = ftell64(Resources.Stream);
- BlockOffset = 0;
- BlockLength = 0;
+ if ( m_blockOffset == m_blockLength ) {
+ m_blockAddress = m_device->Tell();
- m_blockOffset = 0;
- m_blockLength = 0;
++ m_blockOffset = 0;
++ m_blockLength = 0;
++
}
+ // return actual number of bytes read
return numBytesRead;
}
// reads a BGZF block
//
// Reads one block from m_device (fixed-size header first, then the remainder
// whose length is taken from the header), inflates it via InflateBlock(), and
// updates m_blockAddress / m_blockLength (and m_blockOffset, see below).
// If no header bytes can be read, m_blockLength is set to 0 and the method
// returns without error. Throws BamException when the header is short or
// fails CheckBlockHeader(), or when the rest of the block cannot be read.
-bool BgzfStream::ReadBlock(void) {
+void BgzfStream::ReadBlock(void) {
- // store block start
- int64_t blockAddress = ftell64(Resources.Stream);
+ BT_ASSERT_X( m_device, "BgzfStream::ReadBlock() - trying to read from null IO device");
+
+ // store block's starting address
+ int64_t blockAddress = m_device->Tell();
// read block header from file
char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
- size_t count = fread(header, 1, Constants::BGZF_BLOCK_HEADER_LENGTH, Resources.Stream);
- int numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH);
++ size_t numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH);
- // if block header empty, set marker & skip rest of method
- if ( count == 0 ) {
- BlockLength = 0;
- return;
+ // if block header empty
// (method returns void, so this is a plain return; callers detect the
//  empty block through m_blockLength == 0)
+ if ( numBytesRead == 0 ) {
+ m_blockLength = 0;
+ return;
}
// if block header invalid size
- if ( count != sizeof(header) )
- if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH ) {
- fprintf(stderr, "BgzfStream ERROR: read block failed - could not read block header\n");
- return false;
- }
++ if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH )
+ throw BamException("BgzfStream::ReadBlock", "invalid block header size");
// validate block header contents
- if ( !BgzfStream::CheckBlockHeader(header) ) {
- fprintf(stderr, "BgzfStream ERROR: read block failed - invalid block header\n");
- return false;
- }
+ if ( !BgzfStream::CheckBlockHeader(header) )
+ throw BamException("BgzfStream::ReadBlock", "invalid block header contents");
// copy header contents to compressed buffer
// (the 16-bit value at header offset 16 holds the total block length minus one)
- int blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
- char* compressedBlock = m_compressedBlock;
- memcpy(compressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
- int remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
+ const size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
+ memcpy(Resources.CompressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
// read remainder of block
- numBytesRead = m_device->Read(&compressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining);
- if ( numBytesRead != remaining ) {
- fprintf(stderr, "BgzfStream ERROR: read block failed - could not read data from block\n");
- return false;
- }
+ const size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
- count = fread(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], 1, remaining, Resources.Stream);
- if ( count != remaining )
++ numBytesRead = m_device->Read(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining);
++ if ( numBytesRead != remaining )
+ throw BamException("BgzfStream::ReadBlock", "could not read data from block");
// decompress block data
- count = InflateBlock(blockLength);
+ numBytesRead = InflateBlock(blockLength);
- if ( numBytesRead < 0 ) {
- fprintf(stderr, "BgzfStream ERROR: read block failed - could not decompress block data\n");
- return false;
- }
- // update block metadata
- if ( BlockLength != 0 )
- BlockOffset = 0;
- BlockAddress = blockAddress;
- BlockLength = count;
+ // update block data
// (offset is only reset when a block was already loaded; Seek() sets
//  m_blockLength to 0 and stores the in-block offset that must be kept)
+ if ( m_blockLength != 0 )
+ m_blockOffset = 0;
+ m_blockAddress = blockAddress;
+ m_blockLength = numBytesRead;
-
- // return success
- return true;
}
// seek to position in BGZF file
//
// 'position' packs two values: the low 16 bits are the offset inside the
// uncompressed block, the next 48 bits are the address of the block on the
// device. The device is moved to the block address and the block members are
// updated; m_blockLength is set to 0 so that no block counts as loaded.
// If the stream is not open, a message is written to cerr and the method
// returns without seeking. Throws BamException if the device seek fails.
-bool BgzfStream::Seek(const int64_t& position) {
+void BgzfStream::Seek(const int64_t& position) {
+ BT_ASSERT_X( m_device, "BgzfStream::Seek() - trying to seek on null IO device");
+
+ // skip if not open or not seek-able
// (method returns void, so this is a plain return)
+ if ( !IsOpen() /*|| !m_device->IsRandomAccess()*/ ) {
+ cerr << "BgzfStream::Seek() - device not open" << endl;
+ return;
+ }
+
// determine adjusted offset & address
int blockOffset = (position & 0xFFFF);
int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
// attempt seek in file
- if ( fseek64(Resources.Stream, blockAddress, SEEK_SET) != 0 ) {
+ if ( !m_device->Seek(blockAddress) ) {
- cerr << "BgzfStream ERROR: unable to seek in file" << endl;
- return false;
+ stringstream s("");
+ s << "unable to seek to position: " << position;
+ throw BamException("BgzfStream::Seek", s.str());
}
- // if successful, update block metadata
- BlockLength = 0;
- BlockAddress = blockAddress;
- BlockOffset = blockOffset;
+ // update block data
+ m_blockLength = 0;
+ m_blockAddress = blockAddress;
+ m_blockOffset = blockOffset;
- return true;
}
// enables/disables compressed output
void BgzfStream::SetWriteCompressed(bool ok) {
}
// writes the supplied data into the BGZF buffer
-unsigned int BgzfStream::Write(const char* data, const unsigned int dataLength) {
+size_t BgzfStream::Write(const char* data, const size_t dataLength) {
+ BT_ASSERT_X( m_device, "BgzfStream::Write() - trying to write to null IO device");
+ BT_ASSERT_X( (m_device->Mode() == IBamIODevice::WriteOnly),
+ "BgzfStream::Write() - trying to write to non-writable IO device");
+
+ // skip if file not open for writing
+ if ( !IsOpen || !IsWriteOnly )
+ return false;
+
// write blocks as needed til all data is written
- unsigned int numBytesWritten = 0;
+ size_t numBytesWritten = 0;
const char* input = data;
- unsigned int blockLength = m_uncompressedBlockSize;
+ const size_t blockLength = Constants::BGZF_DEFAULT_BLOCK_SIZE;
while ( numBytesWritten < dataLength ) {
// copy data contents to uncompressed output buffer
- const size_t copyLength = min(blockLength - BlockOffset, dataLength - numBytesWritten);
+ unsigned int copyLength = min(blockLength - m_blockOffset, dataLength - numBytesWritten);
- char* buffer = m_uncompressedBlock;
+ char* buffer = Resources.UncompressedBlock;
- memcpy(buffer + BlockOffset, input, copyLength);
+ memcpy(buffer + m_blockOffset, input, copyLength);
- // update counters
- BlockOffset += copyLength;
+ // update counter
+ m_blockOffset += copyLength;
input += copyLength;
numBytesWritten += copyLength;
// BgzfStream_p.h (c) 2011 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
--// Last modified: 5 April 2011(DB)
++// Last modified: 7 October 2011(DB)
// ---------------------------------------------------------------------------
// Based on BGZF routines developed at the Broad Institute.
// Provides the basic functionality for reading & writing BGZF files
#include <api/BamAux.h>
#include <api/BamConstants.h>
+ #include <api/IBamIODevice.h>
#include "zlib.h"
#include <cstdio>
+#include <memory>
#include <string>
namespace BamTools {
public:
// closes BGZF file
void Close(void);
- // opens the BGZF stream in requested mode
- bool Open(const std::string& filename, const char* mode);
- bool Open(const std::string& filename, const IBamIODevice::OpenMode mode);
++ // returns true if BgzfStream open for IO
+ bool IsOpen(void) const;
+ // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing)
+ void Open(const std::string& filename, const char* mode);
++ void Open(const std::string& filename, const IBamIODevice::OpenMode mode);
// reads BGZF data into a byte buffer
- unsigned int Read(char* data, const unsigned int dataLength);
+ size_t Read(char* data, const size_t dataLength);
// seek to position in BGZF file
- bool Seek(const int64_t& position);
+ void Seek(const int64_t& position);
+ // sets IO device (closes previous, if any, but does not attempt to open)
+ void SetIODevice(IBamIODevice* device);
// enable/disable compressed output
void SetWriteCompressed(bool ok);
// get file position in BGZF file
// data members
public:
- unsigned int BlockLength;
- unsigned int BlockOffset;
- int64_t BlockAddress;
- bool IsOpen;
- bool IsWriteOnly;
- bool IsWriteCompressed;
- unsigned int m_uncompressedBlockSize;
- unsigned int m_compressedBlockSize;
+ unsigned int m_blockLength;
+ unsigned int m_blockOffset;
+ uint64_t m_blockAddress;
+
- char* m_uncompressedBlock;
- char* m_compressedBlock;
-
+ bool m_isOpen;
+ bool m_isWriteOnly;
+ bool m_isWriteCompressed;
+
+ IBamIODevice* m_device;
- FILE* m_stream;
-};
-// -------------------------------------------------------------
-// static 'utility' method implementations
-
-// checks BGZF block header
-inline
-bool BgzfStream::CheckBlockHeader(char* header) {
- return (header[0] == Constants::GZIP_ID1 &&
- header[1] == (char)Constants::GZIP_ID2 &&
- header[2] == Z_DEFLATED &&
- (header[3] & Constants::FLG_FEXTRA) != 0 &&
- BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN &&
- header[12] == Constants::BGZF_ID1 &&
- header[13] == Constants::BGZF_ID2 &&
- BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN );
-}
+ struct RaiiWrapper { // groups the stream's raw block buffers and C stream handle
+ RaiiWrapper(void); // definition not visible here - presumably sets up the members below (TODO confirm)
+ ~RaiiWrapper(void); // definition not visible here - presumably releases the members below (TODO confirm)
+ char* UncompressedBlock; // decompressed block data: Read() copies out of it, Write() copies into it
+ char* CompressedBlock; // raw block bytes: ReadBlock() stores the header followed by the rest of the block
+ FILE* Stream; // C stdio stream handle (assigned from freopen() in Open())
+ };
+ RaiiWrapper Resources;
-
+};
} // namespace Internal
} // namespace BamTools