--- /dev/null
+// ***************************************************************************
+// BamHeader_p.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 25 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for handling BAM headers.
+// ***************************************************************************
+
+#include <api/BamAux.h>
+#include <api/BamConstants.h>
+#include <api/BGZF.h>
+#include <api/internal/BamHeader_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+using namespace std;
+
+// ---------------------------------
+// BamHeaderPrivate implementation
+
+struct BamHeader::BamHeaderPrivate {
+
+ // data members
+ SamHeader* m_samHeader;
+
+ // ctor
+ BamHeaderPrivate(void)
+ : m_samHeader(0)
+ { }
+
+ // 'public' interface
+ bool Load(BgzfData* stream);
+
+ // internal methods
+ bool CheckMagicNumber(BgzfData* stream);
+ bool ReadHeaderLength(BgzfData* stream, uint32_t& length);
+ bool ReadHeaderText(BgzfData* stream, const uint32_t& length);
+};
+
+bool BamHeader::BamHeaderPrivate::Load(BgzfData* stream) {
+
+ // cannot load if invalid stream
+ if ( stream == 0 )
+ return false;
+
+ // cannot load if magic number is invalid
+ if ( !CheckMagicNumber(stream) )
+ return false;
+
+ // cannot load header if cannot read header length
+ uint32_t length(0);
+ if ( !ReadHeaderLength(stream, length) )
+ return false;
+
+ // cannot load header if cannot read header text
+ if ( !ReadHeaderText(stream, length) )
+ return false;
+
+ // otherwise, everything OK
+ return true;
+}
+
+bool BamHeader::BamHeaderPrivate::CheckMagicNumber(BgzfData* stream) {
+
+ // try to read magic number
+ char buffer[Constants::BAM_HEADER_MAGIC_SIZE];
+ if ( stream->Read(buffer, Constants::BAM_HEADER_MAGIC_SIZE) != (int)Constants::BAM_HEADER_MAGIC_SIZE ) {
+ fprintf(stderr, "BAM header error - could not read magic number\n");
+ return false;
+ }
+
+ // validate magic number
+ if ( strncmp(buffer, Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_SIZE) != 0 ) {
+ fprintf(stderr, "BAM header error - invalid magic number\n");
+ return false;
+ }
+
+ // all checks out
+ return true;
+}
+
+bool BamHeader::BamHeaderPrivate::ReadHeaderLength(BgzfData* stream, uint32_t& length) {
+
+ // attempt to read BAM header text length
+ char buffer[sizeof(uint32_t)];
+ if ( stream->Read(buffer, sizeof(uint32_t)) != sizeof(uint32_t) ) {
+ fprintf(stderr, "BAM header error - could not read header length\n");
+ return false;
+ }
+
+ // convert char buffer to length, return success
+ length = BgzfData::UnpackUnsignedInt(buffer);
+ if ( BamTools::SystemIsBigEndian() )
+ SwapEndian_32(length);
+ return true;
+}
+
+bool BamHeader::BamHeaderPrivate::ReadHeaderText(BgzfData* stream, const uint32_t& length) {
+
+ // set up destination buffer
+ char* headerText = (char*)calloc(length + 1, 1);
+
+ // attempt to read header text
+ const unsigned bytesRead = stream->Read(headerText, length);
+ const bool readOk = ( bytesRead == length );
+ if ( readOk )
+ m_samHeader = new SamHeader( (string)((const char*)headerText) );
+ else
+ fprintf(stderr, "BAM header error - could not read header text\n");
+
+ // clean up calloc-ed temp variable (on success or fail)
+ free(headerText);
+
+ // return read success
+ return readOk;
+}
+
+// --------------------------
+// BamHeader implementation
+
+BamHeader::BamHeader(void)
+ : d(new BamHeaderPrivate)
+{ }
+
+BamHeader::~BamHeader(void) {
+ delete d;
+ d = 0;
+}
+
+void BamHeader::Clear(void) {
+ delete d->m_samHeader;
+ d->m_samHeader = new SamHeader("");
+}
+
+bool BamHeader::IsValid(void) const {
+ return d->m_samHeader->IsValid();
+}
+
+bool BamHeader::Load(BgzfData* stream) {
+ return d->Load(stream);
+}
+
+SamHeader BamHeader::ToSamHeader(void) const {
+ return *(d->m_samHeader);
+}
+
+string BamHeader::ToString(void) const {
+ return d->m_samHeader->ToString();
+}
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 22 November 2010 (DB)
+// Last modified: 11 January 2011 (DB)
// ---------------------------------------------------------------------------
// Provides the basic functionality for producing BAM files
// ***************************************************************************
using namespace BamTools::Internal;
using namespace std;
+// ctor
BamWriterPrivate::BamWriterPrivate(void) {
IsBigEndian = SystemIsBigEndian();
}
+// dtor
BamWriterPrivate::~BamWriterPrivate(void) {
mBGZF.Close();
}
-// closes the alignment archive
-void BamWriterPrivate::Close(void) {
- mBGZF.Close();
-}
-
// calculates minimum bin for a BAM alignment interval
const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
--end;
return 0;
}
+// closes the alignment archive
+void BamWriterPrivate::Close(void) {
+ mBGZF.Close();
+}
+
// creates a cigar string from the supplied alignment
void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
vector<CigarOp>::const_iterator coIter;
for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {
- switch(coIter->Type) {
- case 'M':
- cigarOp = BAM_CMATCH;
- break;
- case 'I':
- cigarOp = BAM_CINS;
- break;
- case 'D':
- cigarOp = BAM_CDEL;
- break;
- case 'N':
- cigarOp = BAM_CREF_SKIP;
- break;
- case 'S':
- cigarOp = BAM_CSOFT_CLIP;
- break;
- case 'H':
- cigarOp = BAM_CHARD_CLIP;
- break;
- case 'P':
- cigarOp = BAM_CPAD;
- break;
- default:
- fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
- exit(1);
- }
-
- *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
- pPackedCigar++;
+ switch(coIter->Type) {
+ case 'M': cigarOp = BAM_CMATCH; break;
+ case 'I': cigarOp = BAM_CINS; break;
+ case 'D': cigarOp = BAM_CDEL; break;
+ case 'N': cigarOp = BAM_CREF_SKIP; break;
+ case 'S': cigarOp = BAM_CSOFT_CLIP; break;
+ case 'H': cigarOp = BAM_CHARD_CLIP; break;
+ case 'P': cigarOp = BAM_CPAD; break;
+ default:
+ fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
+ exit(1);
+ }
+
+ *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
+ pPackedCigar++;
}
}
while(*pQuery) {
- switch(*pQuery) {
-
- case '=':
- nucleotideCode = 0;
- break;
-
- case 'A':
- nucleotideCode = 1;
- break;
-
- case 'C':
- nucleotideCode = 2;
- break;
-
- case 'G':
- nucleotideCode = 4;
- break;
-
- case 'T':
- nucleotideCode = 8;
- break;
-
- case 'N':
- nucleotideCode = 15;
- break;
-
- default:
- fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
- exit(1);
- }
-
- // pack the nucleotide code
- if(useHighWord) {
- *pEncodedQuery = nucleotideCode << 4;
- useHighWord = false;
- } else {
- *pEncodedQuery |= nucleotideCode;
- pEncodedQuery++;
- useHighWord = true;
- }
-
- // increment the query position
- pQuery++;
+ switch(*pQuery) {
+ case '=': nucleotideCode = 0; break;
+ case 'A': nucleotideCode = 1; break;
+ case 'C': nucleotideCode = 2; break;
+ case 'G': nucleotideCode = 4; break;
+ case 'T': nucleotideCode = 8; break;
+ case 'N': nucleotideCode = 15; break;
+ default:
+ fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
+ exit(1);
+ }
+
+ // pack the nucleotide code
+ if(useHighWord) {
+ *pEncodedQuery = nucleotideCode << 4;
+ useHighWord = false;
+ } else {
+ *pEncodedQuery |= nucleotideCode;
+ pEncodedQuery++;
+ useHighWord = true;
+ }
+
+ // increment the query position
+ pQuery++;
}
}
{
// open the BGZF file for writing, return failure if error
if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
- return false;
+ return false;
// ================
// write the header
// write the sequence dictionary
// =============================
- RefVector::const_iterator rsIter;
- for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
+ RefVector::const_iterator rsIter = referenceSequences.begin();
+ RefVector::const_iterator rsEnd = referenceSequences.end();
+ for( ; rsIter != rsEnd; ++rsIter ) {
- // write the reference sequence name length
- uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
- if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
- mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
+ // write the reference sequence name length
+ uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
+ if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
+ mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
- // write the reference sequence name
- mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
+ // write the reference sequence name
+ mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
- // write the reference sequence length
- int32_t referenceLength = rsIter->RefLength;
- if (IsBigEndian) SwapEndian_32(referenceLength);
- mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
+ // write the reference sequence length
+ int32_t referenceLength = rsIter->RefLength;
+ if (IsBigEndian) SwapEndian_32(referenceLength);
+ mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
}
// return success
// (as a result of BamReader::GetNextAlignmentCore())
if ( al.SupportData.HasCoreOnly ) {
- // write the block size
- unsigned int blockSize = al.SupportData.BlockLength;
- if (IsBigEndian) SwapEndian_32(blockSize);
- mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
-
- // assign the BAM core data
- uint32_t buffer[8];
- buffer[0] = al.RefID;
- buffer[1] = al.Position;
- buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
- buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
- buffer[4] = al.SupportData.QuerySequenceLength;
- buffer[5] = al.MateRefID;
- buffer[6] = al.MatePosition;
- buffer[7] = al.InsertSize;
-
- // swap BAM core endian-ness, if necessary
- if ( IsBigEndian ) {
- for ( int i = 0; i < 8; ++i )
- SwapEndian_32(buffer[i]);
- }
-
- // write the BAM core
- mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
-
- // write the raw char data
- mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
+ // write the block size
+ unsigned int blockSize = al.SupportData.BlockLength;
+ if (IsBigEndian) SwapEndian_32(blockSize);
+ mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
+
+ // assign the BAM core data
+ uint32_t buffer[8];
+ buffer[0] = al.RefID;
+ buffer[1] = al.Position;
+ buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
+ buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
+ buffer[4] = al.SupportData.QuerySequenceLength;
+ buffer[5] = al.MateRefID;
+ buffer[6] = al.MatePosition;
+ buffer[7] = al.InsertSize;
+
+ // swap BAM core endian-ness, if necessary
+ if ( IsBigEndian ) {
+ for ( int i = 0; i < 8; ++i )
+ SwapEndian_32(buffer[i]);
+ }
+
+ // write the BAM core
+ mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
+
+ // write the raw char data
+ mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
}
// otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
// ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
else {
- // calculate char lengths
- const unsigned int nameLength = al.Name.size() + 1;
- const unsigned int numCigarOperations = al.CigarData.size();
- const unsigned int queryLength = al.QueryBases.size();
- const unsigned int tagDataLength = al.TagData.size();
-
- // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
- // force calculation of Bin before storing
- const int endPosition = al.GetEndPosition();
- const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
-
- // create our packed cigar string
- string packedCigar;
- CreatePackedCigar(al.CigarData, packedCigar);
- const unsigned int packedCigarLength = packedCigar.size();
-
- // encode the query
- string encodedQuery;
- EncodeQuerySequence(al.QueryBases, encodedQuery);
- const unsigned int encodedQueryLength = encodedQuery.size();
-
- // write the block size
- const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;
- unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
- if (IsBigEndian) SwapEndian_32(blockSize);
- mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
-
- // assign the BAM core data
- uint32_t buffer[8];
- buffer[0] = al.RefID;
- buffer[1] = al.Position;
- buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
- buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
- buffer[4] = queryLength;
- buffer[5] = al.MateRefID;
- buffer[6] = al.MatePosition;
- buffer[7] = al.InsertSize;
-
- // swap BAM core endian-ness, if necessary
- if ( IsBigEndian ) {
- for ( int i = 0; i < 8; ++i )
- SwapEndian_32(buffer[i]);
- }
-
- // write the BAM core
- mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
-
- // write the query name
- mBGZF.Write(al.Name.c_str(), nameLength);
-
- // write the packed cigar
- if ( IsBigEndian ) {
-
- char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
- memcpy(cigarData, packedCigar.data(), packedCigarLength);
-
- for (unsigned int i = 0; i < packedCigarLength; ++i) {
- if ( IsBigEndian )
- SwapEndian_32p(&cigarData[i]);
- }
-
- mBGZF.Write(cigarData, packedCigarLength);
- free(cigarData);
- }
- else
- mBGZF.Write(packedCigar.data(), packedCigarLength);
-
- // write the encoded query sequence
- mBGZF.Write(encodedQuery.data(), encodedQueryLength);
-
- // write the base qualities
- string baseQualities(al.Qualities);
- char* pBaseQualities = (char*)al.Qualities.data();
- for(unsigned int i = 0; i < queryLength; i++) {
- pBaseQualities[i] -= 33;
- }
- mBGZF.Write(pBaseQualities, queryLength);
-
- // write the read group tag
- if ( IsBigEndian ) {
-
- char* tagData = (char*)calloc(sizeof(char), tagDataLength);
- memcpy(tagData, al.TagData.data(), tagDataLength);
-
- int i = 0;
- while ( (unsigned int)i < tagDataLength ) {
-
- i += 2; // skip tag type (e.g. "RG", "NM", etc)
- uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
- ++i; // skip value type
-
- switch (type) {
-
- case('A') :
- case('C') :
- ++i;
- break;
-
- case('S') :
- SwapEndian_16p(&tagData[i]);
- i+=2; // sizeof(uint16_t)
- break;
-
- case('F') :
- case('I') :
- SwapEndian_32p(&tagData[i]);
- i+=4; // sizeof(uint32_t)
- break;
-
- case('D') :
- SwapEndian_64p(&tagData[i]);
- i+=8; // sizeof(uint64_t)
- break;
-
- case('H') :
- case('Z') :
- while (tagData[i]) { ++i; }
- ++i; // increment one more for null terminator
- break;
-
- default :
- fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
- free(tagData);
- exit(1);
- }
- }
-
- mBGZF.Write(tagData, tagDataLength);
- free(tagData);
- }
- else
- mBGZF.Write(al.TagData.data(), tagDataLength);
+ // calculate char lengths
+ const unsigned int nameLength = al.Name.size() + 1;
+ const unsigned int numCigarOperations = al.CigarData.size();
+ const unsigned int queryLength = al.QueryBases.size();
+ const unsigned int tagDataLength = al.TagData.size();
+
+ // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
+ // force calculation of Bin before storing
+ const int endPosition = al.GetEndPosition();
+ const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
+
+ // create our packed cigar string
+ string packedCigar;
+ CreatePackedCigar(al.CigarData, packedCigar);
+ const unsigned int packedCigarLength = packedCigar.size();
+
+ // encode the query
+ string encodedQuery;
+ EncodeQuerySequence(al.QueryBases, encodedQuery);
+ const unsigned int encodedQueryLength = encodedQuery.size();
+
+ // write the block size
+ const unsigned int dataBlockSize = nameLength +
+ packedCigarLength +
+ encodedQueryLength +
+ queryLength +
+ tagDataLength;
+ unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
+ if (IsBigEndian) SwapEndian_32(blockSize);
+ mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
+
+ // assign the BAM core data
+ uint32_t buffer[8];
+ buffer[0] = al.RefID;
+ buffer[1] = al.Position;
+ buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
+ buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
+ buffer[4] = queryLength;
+ buffer[5] = al.MateRefID;
+ buffer[6] = al.MatePosition;
+ buffer[7] = al.InsertSize;
+
+ // swap BAM core endian-ness, if necessary
+ if ( IsBigEndian ) {
+ for ( int i = 0; i < 8; ++i )
+ SwapEndian_32(buffer[i]);
+ }
+
+ // write the BAM core
+ mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
+
+ // write the query name
+ mBGZF.Write(al.Name.c_str(), nameLength);
+
+ // write the packed cigar
+ if ( IsBigEndian ) {
+
+ char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
+ memcpy(cigarData, packedCigar.data(), packedCigarLength);
+
+ for (unsigned int i = 0; i < packedCigarLength; ++i) {
+ if ( IsBigEndian )
+ SwapEndian_32p(&cigarData[i]);
+ }
+
+ mBGZF.Write(cigarData, packedCigarLength);
+ free(cigarData);
+ }
+ else
+ mBGZF.Write(packedCigar.data(), packedCigarLength);
+
+ // write the encoded query sequence
+ mBGZF.Write(encodedQuery.data(), encodedQueryLength);
+
+ // write the base qualities
+ char* pBaseQualities = (char*)al.Qualities.data();
+ for(unsigned int i = 0; i < queryLength; i++)
+ pBaseQualities[i] -= 33; // FASTQ conversion
+ mBGZF.Write(pBaseQualities, queryLength);
+
+ // write the read group tag
+ if ( IsBigEndian ) {
+
+ char* tagData = (char*)calloc(sizeof(char), tagDataLength);
+ memcpy(tagData, al.TagData.data(), tagDataLength);
+
+ int i = 0;
+ while ( (unsigned int)i < tagDataLength ) {
+
+ i += 2; // skip tag type (e.g. "RG", "NM", etc)
+ uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
+ ++i; // skip value type
+
+ switch (type) {
+
+ case('A') :
+ case('C') :
+ ++i;
+ break;
+
+ case('S') :
+ SwapEndian_16p(&tagData[i]);
+ i+=2; // sizeof(uint16_t)
+ break;
+
+ case('F') :
+ case('I') :
+ SwapEndian_32p(&tagData[i]);
+ i+=4; // sizeof(uint32_t)
+ break;
+
+ case('D') :
+ SwapEndian_64p(&tagData[i]);
+ i+=8; // sizeof(uint64_t)
+ break;
+
+ case('H') :
+ case('Z') :
+ while (tagData[i]) { ++i; }
+ ++i; // increment one more for null terminator
+ break;
+
+ default :
+ fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
+ free(tagData);
+ exit(1);
+ }
+ }
+
+ mBGZF.Write(tagData, tagDataLength);
+ free(tagData);
+ }
+ else
+ mBGZF.Write(al.TagData.data(), tagDataLength);
}
}