From: derek Date: Fri, 7 Oct 2011 20:11:43 +0000 (-0400) Subject: Merge with earlier IODevice work X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=8a90b7aefffaf186053ef4da96c8663bf528274a;p=bamtools.git Merge with earlier IODevice work * This commit still has some console pollution. I need to work in the recent Exception/ErrorString approach, but wanted to go ahead and do the merge-conflict resolution now before diving into remote file support. --- 8a90b7aefffaf186053ef4da96c8663bf528274a diff --cc src/api/BamConstants.h index 4a35d8f,ac672a6..f1af90d --- a/src/api/BamConstants.h +++ b/src/api/BamConstants.h @@@ -2,7 -2,7 +2,7 @@@ // BamConstants.h (c) 2011 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- - // Last modified: 5 October 2011 (DB) -// Last modified: 9 September 2011 (DB) ++// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Provides basic constants for handling BAM files. // *************************************************************************** diff --cc src/api/CMakeLists.txt index 62d2e7f,5a03712..2ec006c --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@@ -25,10 -25,14 +25,15 @@@ set( BamToolsAPISource SamReadGroupDictionary.cpp SamSequence.cpp SamSequenceDictionary.cpp + internal/BamDeviceFactory_p.cpp + internal/BamException_p.cpp + internal/BamFile_p.cpp + internal/BamFtp_p.cpp internal/BamHeader_p.cpp + internal/BamHttp_p.cpp internal/BamIndexFactory_p.cpp internal/BamMultiReader_p.cpp + internal/BamPipe_p.cpp internal/BamRandomAccessController_p.cpp internal/BamReader_p.cpp internal/BamStandardIndex_p.cpp diff --cc src/api/internal/BamHeader_p.cpp index 6ff2f4b,45dc379..7ec1181 --- a/src/api/internal/BamHeader_p.cpp +++ b/src/api/internal/BamHeader_p.cpp @@@ -2,7 -2,7 +2,7 @@@ // BamHeader_p.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- - // Last modified: 6 October 2011 (DB) -// Last modified: 21 March 2011 (DB) ++// Last modified: 7 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for handling BAM headers. // *************************************************************************** diff --cc src/api/internal/BamRandomAccessController_p.cpp index 7f5c350,a44563f..437b609 --- a/src/api/internal/BamRandomAccessController_p.cpp +++ b/src/api/internal/BamRandomAccessController_p.cpp @@@ -2,7 -2,7 +2,7 @@@ // BamRandomAccessController_p.cpp (c) 2011 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- - // Last modified: 6 October 2011(DB) -// Last modified: 5 April 2011(DB) ++// Last modified: 7 October 2011(DB) // --------------------------------------------------------------------------- // Manages random access operations in a BAM file // ************************************************************************** diff --cc src/api/internal/BamReader_p.cpp index 7707017,384e2fe..5d154a1 --- a/src/api/internal/BamReader_p.cpp +++ b/src/api/internal/BamReader_p.cpp @@@ -9,7 -9,8 +9,9 @@@ #include #include + #include + #include +#include #include #include #include @@@ -137,14 -98,29 +139,18 @@@ bool BamReaderPrivate::GetNextAlignment // useful for operations requiring ONLY positional or other alignment-related information bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) { ++ // skip if stream not opened + if ( !m_stream.IsOpen() ) + return false; + - // skip if region is set but has no alignments - if ( m_randomAccessController.HasRegion() && - !m_randomAccessController.RegionHasAlignments() ) - { - return false; - } - - // if can't read next alignment - if ( !LoadNextAlignment(alignment) ) - return false; - - // check alignment's region-overlap state - BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment); - - // if alignment starts after region, no need to keep reading - if ( state == BamRandomAccessController::AfterRegion ) - return false; + try { - // read until overlap is found - while ( state != BamRandomAccessController::OverlapsRegion ) { + // skip if region is set but has no alignments + if ( m_randomAccessController.HasRegion() && + !m_randomAccessController.RegionHasAlignments() ) + { + return false; + } // if can't read next alignment if ( !LoadNextAlignment(alignment) ) @@@ -355,38 -305,35 +361,33 @@@ bool BamReaderPrivate::LocateIndex(cons // opens BAM file (and index) bool BamReaderPrivate::Open(const string& filename) { - bool result; - // make sure we're starting with a fresh slate - Close(); -- - // attempt to open BgzfStream for reading - if ( !m_stream.Open(filename, IBamIODevice::ReadOnly) ) { - cerr << "BamReader ERROR: Could not open BGZF stream for " << filename << endl; - return false; - } + try { - // attempt to load header data - if ( !LoadHeaderData() ) { - cerr << "BamReader ERROR: Could not load header data for " << filename << endl; + // make sure we're starting with fresh state Close(); - return false; - } - // attempt to load reference data - if ( !LoadReferenceData() ) { - cerr << "BamReader ERROR: Could not load reference data for " << filename << endl; - Close(); - return false; - } + // open BgzfStream + m_stream.Open(filename, "rb"); + assert(m_stream); - // if all OK, store filename & offset of first alignment - m_filename = filename; - m_alignmentsBeginOffset = m_stream.Tell(); + // load BAM metadata + LoadHeaderData(); + LoadReferenceData(); - // return success - return true; + // store filename & offset of first alignment + m_filename = filename; + m_alignmentsBeginOffset = m_stream.Tell(); + - // set flag - result = true; ++ // return success ++ return true; + + } catch ( BamException& e ) { + const string error = e.what(); + const string message = string("could not open file: ") + filename + + "\n\t" + error; + SetErrorString("BamReader::Open", message); + return false; + } - - // return success/failure - return result; } bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) { diff --cc src/api/internal/BamWriter_p.cpp index a8fe370,c9199b9..ce5cfa9 --- a/src/api/internal/BamWriter_p.cpp +++ b/src/api/internal/BamWriter_p.cpp @@@ -9,7 -9,7 +9,11 @@@ #include #include ++<<<<<<< HEAD +#include ++======= + #include ++>>>>>>> iodevice #include using namespace BamTools; using namespace BamTools::Internal; @@@ -141,14 -123,9 +145,14 @@@ void BamWriterPrivate::EncodeQuerySeque } } +// returns a description of the last error that occurred +std::string BamWriterPrivate::GetErrorString(void) const { + return m_errorString; +} + // returns whether BAM file is open for writing or not bool BamWriterPrivate::IsOpen(void) const { - return m_stream.IsOpen; + return m_stream.IsOpen(); } // opens the alignment archive @@@ -156,265 -133,251 +160,271 @@@ bool BamWriterPrivate::Open(const strin const string& samHeaderText, const RefVector& referenceSequences) { ++<<<<<<< HEAD + try { ++======= + // open the BGZF file for writing, return failure if error + if ( !m_stream.Open(filename, IBamIODevice::WriteOnly) ) + return false; ++>>>>>>> iodevice - // write BAM file 'metadata' components - WriteMagicNumber(); - WriteSamHeaderText(samHeaderText); - WriteReferences(referenceSequences); - return true; -} + // open the BGZF file for writing, return failure if error + m_stream.Open(filename, "wb"); -// saves the alignment to the alignment archive -void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { - - // if BamAlignment contains only the core data and a raw char data buffer - // (as a result of BamReader::GetNextAlignmentCore()) - if ( al.SupportData.HasCoreOnly ) { - - // write the block size - unsigned int blockSize = al.SupportData.BlockLength; - if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); - m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); - - // re-calculate bin (in case BamAlignment's position has been previously modified) - const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition()); - - // assign the BAM core data - uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; - buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; - buffer[4] = al.SupportData.QuerySequenceLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( m_isBigEndian ) { - for ( int i = 0; i < 8; ++i ) - BamTools::SwapEndian_32(buffer[i]); - } + // write BAM file 'metadata' components + WriteMagicNumber(); + WriteSamHeaderText(samHeaderText); + WriteReferences(referenceSequences); - // write the BAM core - m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); + // return success + return true; - // write the raw char data - m_stream.Write((char*)al.SupportData.AllCharData.data(), - al.SupportData.BlockLength-Constants::BAM_CORE_SIZE); + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; } +} - // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc - // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) - else { - - // calculate char lengths - const unsigned int nameLength = al.Name.size() + 1; - const unsigned int numCigarOperations = al.CigarData.size(); - const unsigned int queryLength = al.QueryBases.size(); - const unsigned int tagDataLength = al.TagData.size(); - - // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) - // force calculation of Bin before storing - const int endPosition = al.GetEndPosition(); - const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); - - // create our packed cigar string - string packedCigar; - CreatePackedCigar(al.CigarData, packedCigar); - const unsigned int packedCigarLength = packedCigar.size(); - - // encode the query - string encodedQuery; - EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLength = encodedQuery.size(); - - // write the block size - const unsigned int dataBlockSize = nameLength + - packedCigarLength + - encodedQueryLength + - queryLength + - tagDataLength; - unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize; - if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); - m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; - buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; - buffer[4] = queryLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( m_isBigEndian ) { - for ( int i = 0; i < 8; ++i ) - BamTools::SwapEndian_32(buffer[i]); - } - - // write the BAM core - m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); - - // write the query name - m_stream.Write(al.Name.c_str(), nameLength); - - // write the packed cigar - if ( m_isBigEndian ) { - char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); - memcpy(cigarData, packedCigar.data(), packedCigarLength); - if ( m_isBigEndian ) { - for ( unsigned int i = 0; i < packedCigarLength; ++i ) - BamTools::SwapEndian_32p(&cigarData[i]); - } - m_stream.Write(cigarData, packedCigarLength); - free(cigarData); - } - else - m_stream.Write(packedCigar.data(), packedCigarLength); +// saves the alignment to the alignment archive +bool BamWriterPrivate::SaveAlignment(const BamAlignment& al) { - // write the encoded query sequence - m_stream.Write(encodedQuery.data(), encodedQueryLength); + try { - // write the base qualities - char* pBaseQualities = (char*)al.Qualities.data(); - for ( unsigned int i = 0; i < queryLength; ++i ) - pBaseQualities[i] -= 33; // FASTQ conversion - m_stream.Write(pBaseQualities, queryLength); + // if BamAlignment contains only the core data and a raw char data buffer + // (as a result of BamReader::GetNextAlignmentCore()) + if ( al.SupportData.HasCoreOnly ) + WriteCoreAlignment(al); - // write the read group tag - if ( m_isBigEndian ) { + // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc + // (resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code) + else WriteAlignment(al); - char* tagData = (char*)calloc(sizeof(char), tagDataLength); - memcpy(tagData, al.TagData.data(), tagDataLength); + // if we get here, everything OK + return true; - int i = 0; - while ( (unsigned int)i < tagDataLength ) { + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; + } +} - i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) - const char type = tagData[i]; // get tag type at position i - ++i; +void BamWriterPrivate::SetWriteCompressed(bool ok) { + // modifying compression is not allowed if BAM file is open + if ( !IsOpen() ) + m_stream.SetWriteCompressed(ok); +} - switch ( type ) { +void BamWriterPrivate::WriteAlignment(const BamAlignment& al) { + + // calculate char lengths + const unsigned int nameLength = al.Name.size() + 1; + const unsigned int numCigarOperations = al.CigarData.size(); + const unsigned int queryLength = al.QueryBases.size(); + const unsigned int tagDataLength = al.TagData.size(); + + // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) + // force calculation of Bin before storing + const int endPosition = al.GetEndPosition(); + const uint32_t alignmentBin = CalculateMinimumBin(al.Position, endPosition); + + // create our packed cigar string + string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLength = packedCigar.size(); + + // encode the query + string encodedQuery; + EncodeQuerySequence(al.QueryBases, encodedQuery); + const unsigned int encodedQueryLength = encodedQuery.size(); + + // write the block size + const unsigned int dataBlockSize = nameLength + + packedCigarLength + + encodedQueryLength + + queryLength + + tagDataLength; + unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize; + if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( m_isBigEndian ) { + for ( int i = 0; i < 8; ++i ) + BamTools::SwapEndian_32(buffer[i]); + } - case(Constants::BAM_TAG_TYPE_ASCII) : - case(Constants::BAM_TAG_TYPE_INT8) : - case(Constants::BAM_TAG_TYPE_UINT8) : - ++i; - break; - - case(Constants::BAM_TAG_TYPE_INT16) : - case(Constants::BAM_TAG_TYPE_UINT16) : - BamTools::SwapEndian_16p(&tagData[i]); - i += sizeof(uint16_t); - break; - - case(Constants::BAM_TAG_TYPE_FLOAT) : - case(Constants::BAM_TAG_TYPE_INT32) : - case(Constants::BAM_TAG_TYPE_UINT32) : - BamTools::SwapEndian_32p(&tagData[i]); - i += sizeof(uint32_t); - break; - - case(Constants::BAM_TAG_TYPE_HEX) : - case(Constants::BAM_TAG_TYPE_STRING) : - // no endian swapping necessary for hex-string/string data - while ( tagData[i] ) - ++i; - // increment one more for null terminator - ++i; - break; + // write the BAM core + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); - case(Constants::BAM_TAG_TYPE_ARRAY) : + // write the query name + m_stream.Write(al.Name.c_str(), nameLength); - { - // read array type - const char arrayType = tagData[i]; + // write the packed cigar + if ( m_isBigEndian ) { + char* cigarData = new char[packedCigarLength](); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + if ( m_isBigEndian ) { + for ( size_t i = 0; i < packedCigarLength; ++i ) + BamTools::SwapEndian_32p(&cigarData[i]); + } + m_stream.Write(cigarData, packedCigarLength); + delete[] cigarData; // TODO: cleanup on Write exception thrown? + } + else + m_stream.Write(packedCigar.data(), packedCigarLength); + + // write the encoded query sequence + m_stream.Write(encodedQuery.data(), encodedQueryLength); + + // write the base qualities + char* pBaseQualities = (char*)al.Qualities.data(); + for ( size_t i = 0; i < queryLength; ++i ) + pBaseQualities[i] -= 33; // FASTQ conversion + m_stream.Write(pBaseQualities, queryLength); + + // write the read group tag + if ( m_isBigEndian ) { + + char* tagData = new char[tagDataLength](); + memcpy(tagData, al.TagData.data(), tagDataLength); + + size_t i = 0; + while ( i < tagDataLength ) { + + i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) + const char type = tagData[i]; // get tag type at position i + ++i; + + switch ( type ) { + + case(Constants::BAM_TAG_TYPE_ASCII) : + case(Constants::BAM_TAG_TYPE_INT8) : + case(Constants::BAM_TAG_TYPE_UINT8) : + ++i; + break; + + case(Constants::BAM_TAG_TYPE_INT16) : + case(Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + + case(Constants::BAM_TAG_TYPE_FLOAT) : + case(Constants::BAM_TAG_TYPE_INT32) : + case(Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + + case(Constants::BAM_TAG_TYPE_HEX) : + case(Constants::BAM_TAG_TYPE_STRING) : + // no endian swapping necessary for hex-string/string data + while ( tagData[i] ) ++i; - - // swap endian-ness of number of elements in place, then retrieve for loop - BamTools::SwapEndian_32p(&tagData[i]); - int32_t numElements; - memcpy(&numElements, &tagData[i], sizeof(uint32_t)); - i += sizeof(uint32_t); - - // swap endian-ness of array elements - for ( int j = 0; j < numElements; ++j ) { - switch (arrayType) { - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - // no endian-swapping necessary - ++i; - break; - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - BamTools::SwapEndian_16p(&tagData[i]); - i += sizeof(uint16_t); - break; - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - BamTools::SwapEndian_32p(&tagData[i]); - i += sizeof(uint32_t); - break; - default: - // error case - fprintf(stderr, - "BamWriter ERROR: unknown binary array type encountered: [%c]\n", - arrayType); - exit(1); - } + // increment one more for null terminator + ++i; + break; + + case(Constants::BAM_TAG_TYPE_ARRAY) : + + { + // read array type + const char arrayType = tagData[i]; + ++i; + + // swap endian-ness of number of elements in place, then retrieve for loop + BamTools::SwapEndian_32p(&tagData[i]); + int32_t numElements; + memcpy(&numElements, &tagData[i], sizeof(uint32_t)); + i += sizeof(uint32_t); + + // swap endian-ness of array elements + for ( int j = 0; j < numElements; ++j ) { + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + // no endian-swapping necessary + ++i; + break; + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + default: + delete[] tagData; + const string message = string("invalid binary array type: ") + arrayType; + throw BamException("BamWriter::SaveAlignment", message); } - - break; } - default : - fprintf(stderr, "BamWriter ERROR: invalid tag value type\n"); // shouldn't get here - free(tagData); - exit(1); + break; } + + default : + delete[] tagData; + const string message = string("invalid tag type: ") + type; + throw BamException("BamWriter::SaveAlignment", message); } - m_stream.Write(tagData, tagDataLength); - free(tagData); } - else - m_stream.Write(al.TagData.data(), tagDataLength); + + m_stream.Write(tagData, tagDataLength); + delete[] tagData; // TODO: cleanup on Write exception thrown? } + else + m_stream.Write(al.TagData.data(), tagDataLength); } -void BamWriterPrivate::SetWriteCompressed(bool ok) { - - // warn if BAM file is already open - // modifying compression is not allowed in this case - if ( IsOpen() ) { - cerr << "BamWriter WARNING: attempting to change compression mode on an open BAM file is not allowed. " - << "Ignoring request." << endl; - return; +void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) { + + // write the block size + unsigned int blockSize = al.SupportData.BlockLength; + if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); + + // re-calculate bin (in case BamAlignment's position has been previously modified) + const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition()); + + // assign the BAM core data + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; + buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; + buffer[4] = al.SupportData.QuerySequenceLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( m_isBigEndian ) { + for ( int i = 0; i < 8; ++i ) + BamTools::SwapEndian_32(buffer[i]); } - // set BgzfStream compression mode - m_stream.SetWriteCompressed(ok); + // write the BAM core + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); + + // write the raw char data + m_stream.Write((char*)al.SupportData.AllCharData.data(), + al.SupportData.BlockLength-Constants::BAM_CORE_SIZE); } void BamWriterPrivate::WriteMagicNumber(void) { diff --cc src/api/internal/BgzfStream_p.cpp index 69592d6,3b70749..2053220 --- a/src/api/internal/BgzfStream_p.cpp +++ b/src/api/internal/BgzfStream_p.cpp @@@ -2,78 -2,51 +2,81 @@@ // BgzfStream_p.cpp (c) 2011 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- - // Last modified: 6 October 2011(DB) -// Last modified: 9 September 2011(DB) ++// Last modified: 7 October 2011(DB) // --------------------------------------------------------------------------- // Based on BGZF routines developed at the Broad Institute. // Provides the basic functionality for reading & writing BGZF files // Replaces the old BGZF.* files to avoid clashing with other toolkits // *************************************************************************** + #include +#include #include using namespace BamTools; using namespace BamTools::Internal; #include #include + #include +#include using namespace std; -// constructor -BgzfStream::BgzfStream(void) - : m_uncompressedBlockSize(Constants::BGZF_DEFAULT_BLOCK_SIZE) - , m_compressedBlockSize(Constants::BGZF_MAX_BLOCK_SIZE) - , m_blockLength(0) - , m_blockOffset(0) - , m_blockAddress(0) - , m_uncompressedBlock(NULL) - , m_compressedBlock(NULL) - , m_isOpen(false) - , m_isWriteOnly(false) - , m_isWriteCompressed(true) - , m_device(0) - , m_stream(NULL) +// ---------------------------- +// RaiiWrapper implementation +// ---------------------------- + +BgzfStream::RaiiWrapper::RaiiWrapper(void) + : Stream(0) { - try { - m_compressedBlock = new char[m_compressedBlockSize]; - m_uncompressedBlock = new char[m_uncompressedBlockSize]; - } catch( std::bad_alloc& ba ) { - fprintf(stderr, "BgzfStream ERROR: unable to allocate memory\n"); - exit(1); + CompressedBlock = new char[Constants::BGZF_MAX_BLOCK_SIZE]; + UncompressedBlock = new char[Constants::BGZF_DEFAULT_BLOCK_SIZE]; +} + +BgzfStream::RaiiWrapper::~RaiiWrapper(void) { + + // clean up buffers + delete[] CompressedBlock; + delete[] UncompressedBlock; + CompressedBlock = 0; + UncompressedBlock = 0; + + if ( Stream ) { + fflush(Stream); + fclose(Stream); + Stream = 0; } } +// --------------------------- +// BgzfStream implementation +// --------------------------- + +// constructor +BgzfStream::BgzfStream(void) - : BlockLength(0) - , BlockOffset(0) - , BlockAddress(0) - , IsOpen(false) - , IsWriteOnly(false) - , IsWriteCompressed(true) ++ : m_blockLength(0) ++ , m_blockOffset(0) ++ , m_blockAddress(0) ++ , m_isOpen(false) ++ , m_isWriteOnly(false) ++ , m_isWriteCompressed(true) ++ , m_device(0) +{ } + // destructor BgzfStream::~BgzfStream(void) { - if( m_compressedBlock ) delete[] m_compressedBlock; - if( m_uncompressedBlock ) delete[] m_uncompressedBlock; + Close(); +} + +// checks BGZF block header +bool BgzfStream::CheckBlockHeader(char* header) { + return (header[0] == Constants::GZIP_ID1 && + header[1] == Constants::GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & Constants::FLG_FEXTRA) != 0 && + BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN && + header[12] == Constants::BGZF_ID1 && + header[13] == Constants::BGZF_ID2 && + BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN ); } // closes BGZF file @@@ -85,24 -58,20 +88,30 @@@ void BgzfStream::Close(void) // if writing to file, flush the current BGZF block, // then write an empty block (as EOF marker) - if ( IsWriteOnly ) { + if ( m_device->IsOpen() && (m_device->Mode() == IBamIODevice::WriteOnly) ) { FlushBlock(); - int blockLength = DeflateBlock(); - m_device->Write(m_compressedBlock, blockLength); + const size_t blockLength = DeflateBlock(); - fwrite(Resources.CompressedBlock, 1, blockLength, Resources.Stream); ++ m_device->Write(Resources.CompressedBlock, blockLength); } - // flush and close stream + // close device + m_device->Close(); - - // clean up & reset flags + delete m_device; + m_device = 0; - m_isWriteCompressed = true; ++ ++ // ?? + fflush(Resources.Stream); + fclose(Resources.Stream); + Resources.Stream = 0; + - // reset initial state - BlockLength = 0; - BlockOffset = 0; - BlockAddress = 0; - IsOpen = false; - IsWriteOnly = false; - IsWriteCompressed = true; ++ // reset state ++ m_blockLength = 0; ++ m_blockOffset = 0; ++ m_blockAddress = 0; + m_isOpen = false; ++ m_isWriteOnly = false; ++ m_isWriteCompressed = true; ++ } // compresses the current block @@@ -122,12 -91,12 +131,12 @@@ size_t BgzfStream::DeflateBlock(void) buffer[14] = Constants::BGZF_LEN; // set compression level - const int compressionLevel = ( IsWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 ); + const int compressionLevel = ( m_isWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 ); // loop to retry for blocks that do not compress enough - int inputLength = m_blockOffset; - unsigned int compressedLength = 0; - unsigned int bufferSize = m_compressedBlockSize; + int inputLength = BlockOffset; + size_t compressedLength = 0; + const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE; while ( true ) { @@@ -198,34 -167,39 +207,38 @@@ BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); // ensure that we have less than a block of data left - int remaining = BlockOffset - inputLength; + int remaining = m_blockOffset - inputLength; if ( remaining > 0 ) { - if ( remaining > inputLength ) { - fprintf(stderr, "BgzfStream ERROR: after deflate, remainder too large\n"); - exit(1); - } - memcpy(m_uncompressedBlock, m_uncompressedBlock + inputLength, remaining); + if ( remaining > inputLength ) + throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large"); + memcpy(Resources.UncompressedBlock, Resources.UncompressedBlock + inputLength, remaining); } - // update block data & return compressedlength - BlockOffset = remaining; + // update block data + m_blockOffset = remaining; + + // return result return compressedLength; } // flushes the data in the BGZF block void BgzfStream::FlushBlock(void) { + BT_ASSERT_X( m_device, "BgzfStream::FlushBlock() - attempting to flush to null device" ); + // flush all of the remaining blocks - while ( BlockOffset > 0 ) { + while ( m_blockOffset > 0 ) { // compress the data block - unsigned int blockLength = DeflateBlock(); + const size_t blockLength = DeflateBlock(); -- // flush the data to our output stream - const size_t numBytesWritten = fwrite(Resources.CompressedBlock, 1, blockLength, Resources.Stream); - unsigned int numBytesWritten = m_device->Write(m_compressedBlock, blockLength); ++ // flush the data to our output device ++ const size_t numBytesWritten = m_device->Write(Resources.CompressedBlock, blockLength); if ( numBytesWritten != blockLength ) { - fprintf(stderr, "BgzfStream ERROR: expected to write %u bytes during flushing, but wrote %u bytes\n", - blockLength, numBytesWritten); - exit(1); + stringstream s(""); + s << "expected to write " << blockLength + << " bytes during flushing, but wrote " << numBytesWritten; + throw BamException("BgzfStream::FlushBlock", s.str()); } // update block data @@@ -268,21 -242,54 +281,55 @@@ size_t BgzfStream::InflateBlock(const s return zs.total_out; } + bool BgzfStream::IsOpen(void) const { + if ( m_device == 0 ) + return false; + return m_device->IsOpen(); + } + + bool BgzfStream::Open(const string& filename, const IBamIODevice::OpenMode mode) { + + // close current device if necessary + Close(); + + // sanity check + BT_ASSERT_X( (m_device == 0), "BgzfStream::Open() - unable to properly close previous IO device" ); + + // retrieve new IO device depending on filename + m_device = BamDeviceFactory::CreateDevice(filename); + + // sanity check + BT_ASSERT_X( m_device, "BgzfStream::Open() - unable to create IO device from filename" ); + + // if device fails to open + if ( !m_device->Open(mode) ) { + cerr << "BgzfStream::Open() - unable to open IO device:" << endl; + cerr << m_device->ErrorString(); + return false; + } + + // otherwise, set flag & return true + m_isOpen = true; + m_isWriteOnly = ( mode == IBamIODevice::WriteOnly ); + return true; + + } + // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing) -bool BgzfStream::Open(const string& filename, const char* mode) { +void BgzfStream::Open(const string& filename, const char* mode) { - // close current stream, if necessary, before opening next - if ( m_isOpen ) Close(); + // make sure we're starting with fresh state - if ( IsOpen ) ++ if ( IsOpen() ) + Close(); // determine open mode if ( strcmp(mode, "rb") == 0 ) - IsWriteOnly = false; + m_isWriteOnly = false; else if ( strcmp(mode, "wb") == 0) - IsWriteOnly = true; + m_isWriteOnly = true; else { - fprintf(stderr, "BgzfStream ERROR: unknown file mode: %s\n", mode); - return false; + const string message = string("unknown file mode: ") + mode; + throw BamException("BgzfStream::Open", message); } // open BGZF stream on a file @@@ -295,23 -302,27 +342,28 @@@ // open BGZF stream on stdout else if ( (filename == "stdout" || filename == "-") && (strcmp(mode, "wb") == 0) ) - m_stream = freopen(NULL, mode, stdout); + Resources.Stream = freopen(NULL, mode, stdout); - if ( !m_stream ) { - fprintf(stderr, "BgzfStream ERROR: unable to open file %s\n", filename.c_str() ); - return false; + // ensure valid Stream + if ( !Resources.Stream ) { + const string message = string("unable to open file: ") + filename; + throw BamException("BgzfStream::Open", message); } - // set flag - IsOpen = true; + // set flag & return success + m_isOpen = true; + return true; } // reads BGZF data into a byte buffer -unsigned int BgzfStream::Read(char* data, const unsigned int dataLength) { +size_t BgzfStream::Read(char* data, const size_t dataLength) { - // if stream not open for reading (or empty request) - if ( !IsOpen || IsWriteOnly || dataLength == 0 ) + if ( dataLength == 0 ) + return 0; + + // if stream not open for reading + BT_ASSERT_X( m_device, "BgzfStream::Read() - trying to read from null device"); + if ( !m_device->IsOpen() || (m_device->Mode() != IBamIODevice::ReadOnly) ) return 0; // read blocks as needed until desired data length is retrieved @@@ -323,95 -335,118 +375,106 @@@ // read (and decompress) next block if needed if ( bytesAvailable <= 0 ) { - if ( !ReadBlock() ) return -1; + ReadBlock(); - bytesAvailable = BlockLength - BlockOffset; + bytesAvailable = m_blockLength - m_blockOffset; - if ( bytesAvailable <= 0 ) break; + if ( bytesAvailable <= 0 ) + break; } // copy data from uncompressed source buffer into data destination buffer - char* buffer = m_uncompressedBlock; - int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); - memcpy(output, buffer + m_blockOffset, copyLength); + const size_t copyLength = min( (dataLength-numBytesRead), (size_t)bytesAvailable ); - memcpy(data, Resources.UncompressedBlock + BlockOffset, copyLength); ++ memcpy(data, Resources.UncompressedBlock + m_blockOffset, copyLength); // update counters - BlockOffset += copyLength; - m_blockOffset += copyLength; - output += copyLength; - numBytesRead += copyLength; ++ m_blockOffset += copyLength; + data += copyLength; + numBytesRead += copyLength; } // update block data - if ( BlockOffset == BlockLength ) { - BlockAddress = ftell64(Resources.Stream); - BlockOffset = 0; - BlockLength = 0; + if ( m_blockOffset == m_blockLength ) { + m_blockAddress = m_device->Tell(); - m_blockOffset = 0; - m_blockLength = 0; ++ m_BlockOffset = 0; ++ m_BlockLength = 0; ++ } + // return actual number of bytes read return numBytesRead; } // reads a BGZF block -bool BgzfStream::ReadBlock(void) { +void BgzfStream::ReadBlock(void) { - // store block start - int64_t blockAddress = ftell64(Resources.Stream); + BT_ASSERT_X( m_device, "BgzfStream::ReadBlock() - trying to read from null IO device"); + + // store block's starting address + int64_t blockAddress = m_device->Tell(); // read block header from file char header[Constants::BGZF_BLOCK_HEADER_LENGTH]; - size_t count = fread(header, 1, Constants::BGZF_BLOCK_HEADER_LENGTH, Resources.Stream); - int numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH); ++ size_t numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH); - // if block header empty, set marker & skip rest of method - if ( count == 0 ) { - BlockLength = 0; - return; + // if block header empty + if ( numBytesRead == 0 ) { + m_blockLength = 0; + return true; } // if block header invalid size - if ( count != sizeof(header) ) - if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH ) { - fprintf(stderr, "BgzfStream ERROR: read block failed - could not read block header\n"); - return false; - } ++ if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH ) + throw BamException("BgzfStream::ReadBlock", "invalid block header size"); // validate block header contents - if ( !BgzfStream::CheckBlockHeader(header) ) { - fprintf(stderr, "BgzfStream ERROR: read block failed - invalid block header\n"); - return false; - } + if ( !BgzfStream::CheckBlockHeader(header) ) + throw BamException("BgzfStream::ReadBlock", "invalid block header contents"); // copy header contents to compressed buffer - int blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1; - char* compressedBlock = m_compressedBlock; - memcpy(compressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH); - int remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH; + const size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1; + memcpy(Resources.CompressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH); // read remainder of block - numBytesRead = m_device->Read(&compressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining); - if ( numBytesRead != remaining ) { - fprintf(stderr, "BgzfStream ERROR: read block failed - could not read data from block\n"); - return false; - } + const size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH; - count = fread(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], 1, remaining, Resources.Stream); - if ( count != remaining ) ++ numBytesRead = m_device->Read(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining); ++ if ( numBytesRead != remaining ) + throw BamException("BgzfStream::ReadBlock", "could not read data from block"); // decompress block data - count = InflateBlock(blockLength); + numBytesRead = InflateBlock(blockLength); - if ( numBytesRead < 0 ) { - fprintf(stderr, "BgzfStream ERROR: read block failed - could not decompress block data\n"); - return false; - } - // update block metadata - if ( BlockLength != 0 ) - BlockOffset = 0; - BlockAddress = blockAddress; - BlockLength = count; + // update block data + if ( m_blockLength != 0 ) + m_blockOffset = 0; + m_blockAddress = blockAddress; + m_blockLength = numBytesRead; - - // return success - return true; } // seek to position in BGZF file -bool BgzfStream::Seek(const int64_t& position) { +void BgzfStream::Seek(const int64_t& position) { + BT_ASSERT_X( m_device, "BgzfStream::Seek() - trying to seek on null IO device"); + + // skip if not open or not seek-able + if ( !IsOpen() /*|| !m_device->IsRandomAccess()*/ ) { + cerr << "BgzfStream::Seek() - device not open" << endl; + return false; + } + // determine adjusted offset & address int blockOffset = (position & 0xFFFF); int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; // attempt seek in file - if ( fseek64(Resources.Stream, blockAddress, SEEK_SET) != 0 ) { + if ( !m_device->Seek(blockAddress) ) { - cerr << "BgzfStream ERROR: unable to seek in file" << endl; - return false; + stringstream s(""); + s << "unable to seek to position: " << position; + throw BamException("BgzfStream::Seek", s.str()); } - // if successful, update block metadata - BlockLength = 0; - BlockAddress = blockAddress; - BlockOffset = blockOffset; + // update block data & return success + m_blockLength = 0; + m_blockAddress = blockAddress; + m_blockOffset = blockOffset; - return true; } void BgzfStream::SetWriteCompressed(bool ok) { @@@ -426,25 -460,25 +488,29 @@@ int64_t BgzfStream::Tell(void) const } // writes the supplied data into the BGZF buffer -unsigned int BgzfStream::Write(const char* data, const unsigned int dataLength) { +size_t BgzfStream::Write(const char* data, const size_t dataLength) { + BT_ASSERT_X( m_device, "BgzfStream::Write() - trying to write to null IO device"); + BT_ASSERT_X( (m_device->Mode() == IBamIODevice::WriteOnly), + "BgzfStream::Write() - trying to write to non-writable IO device"); + + // skip if file not open for writing + if ( !IsOpen || !IsWriteOnly ) + return false; + // write blocks as needed til all data is written - unsigned int numBytesWritten = 0; + size_t numBytesWritten = 0; const char* input = data; - unsigned int blockLength = m_uncompressedBlockSize; + const size_t blockLength = Constants::BGZF_DEFAULT_BLOCK_SIZE; while ( numBytesWritten < dataLength ) { // copy data contents to uncompressed output buffer - const size_t copyLength = min(blockLength - BlockOffset, dataLength - numBytesWritten); + unsigned int copyLength = min(blockLength - m_blockOffset, dataLength - numBytesWritten); - char* buffer = m_uncompressedBlock; + char* buffer = Resources.UncompressedBlock; - memcpy(buffer + BlockOffset, input, copyLength); + memcpy(buffer + m_blockOffset, input, copyLength); - // update counters - BlockOffset += copyLength; + // update counter + m_blockOffset += copyLength; input += copyLength; numBytesWritten += copyLength; diff --cc src/api/internal/BgzfStream_p.h index 7ebd647,f7ea268..07aae52 --- a/src/api/internal/BgzfStream_p.h +++ b/src/api/internal/BgzfStream_p.h @@@ -2,7 -2,7 +2,7 @@@ // BgzfStream_p.h (c) 2011 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- --// Last modified: 5 April 2011(DB) ++// Last modified: 7 October 2011(DB) // --------------------------------------------------------------------------- // Based on BGZF routines developed at the Broad Institute. // Provides the basic functionality for reading & writing BGZF files @@@ -24,9 -24,9 +24,10 @@@ #include #include + #include #include "zlib.h" #include +#include #include namespace BamTools { @@@ -43,12 -43,16 +44,17 @@@ class BgzfStream public: // closes BGZF file void Close(void); ++ // returns true if BgzfStream open for IO + bool IsOpen(void) const; - // opens the BGZF stream in requested mode - bool Open(const std::string& filename, const char* mode); - bool Open(const std::string& filename, const IBamIODevice::OpenMode mode); + // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing) + void Open(const std::string& filename, const char* mode); ++ void Open(const std::string& filename, const IBamIODevice::OpenMode mode); // reads BGZF data into a byte buffer - unsigned int Read(char* data, const unsigned int dataLength); + size_t Read(char* data, const size_t dataLength); // seek to position in BGZF file - bool Seek(const int64_t& position); + void Seek(const int64_t& position); + // sets IO device (closes previous, if any, but does not attempt to open) + void SetIODevice(IBamIODevice* device); // enable/disable compressed output void SetWriteCompressed(bool ok); // get file position in BGZF file @@@ -74,23 -78,38 +80,25 @@@ // data members public: - unsigned int BlockLength; - unsigned int BlockOffset; - int64_t BlockAddress; - bool IsOpen; - bool IsWriteOnly; - bool IsWriteCompressed; - unsigned int m_uncompressedBlockSize; - unsigned int m_compressedBlockSize; + unsigned int m_blockLength; + unsigned int m_blockOffset; + uint64_t m_blockAddress; + - char* m_uncompressedBlock; - char* m_compressedBlock; - + bool m_isOpen; + bool m_isWriteOnly; + bool m_isWriteCompressed; + + IBamIODevice* m_device; - FILE* m_stream; -}; -// ------------------------------------------------------------- -// static 'utility' method implementations - -// checks BGZF block header -inline -bool BgzfStream::CheckBlockHeader(char* header) { - return (header[0] == Constants::GZIP_ID1 && - header[1] == (char)Constants::GZIP_ID2 && - header[2] == Z_DEFLATED && - (header[3] & Constants::FLG_FEXTRA) != 0 && - BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN && - header[12] == Constants::BGZF_ID1 && - header[13] == Constants::BGZF_ID2 && - BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN ); -} + struct RaiiWrapper { + RaiiWrapper(void); + ~RaiiWrapper(void); + char* UncompressedBlock; + char* CompressedBlock; + FILE* Stream; + }; + RaiiWrapper Resources; - +}; } // namespace Internal } // namespace BamTools