// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 30 March 2010 (DB)\r
+// Last modified: 15 July 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
// Provides the basic functionality for producing BAM files\r
// ***************************************************************************\r
\r
-// BGZF includes\r
+#include <iostream>\r
+\r
#include "BGZF.h"\r
#include "BamWriter.h"\r
using namespace BamTools;\r
BgzfData mBGZF;\r
bool IsBigEndian;\r
\r
- \r
// constructor / destructor\r
BamWriterPrivate(void) { \r
IsBigEndian = SystemIsBigEndian(); \r
void SaveAlignment(const BamAlignment& al);\r
\r
// internal methods\r
+ const unsigned int CalculateMinimumBin(const int begin, int end) const;\r
void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);\r
void EncodeQuerySequence(const string& query, string& encodedQuery);\r
};\r
mBGZF.Close();\r
}\r
\r
+// calculates minimum bin for a BAM alignment interval\r
+const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { \r
+ --end;\r
+ if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);\r
+ if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);\r
+ if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);\r
+ if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);\r
+ if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);\r
+ return 0;\r
+}\r
+\r
// creates a cigar string from the supplied alignment\r
void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
\r
\r
unsigned int cigarOp;\r
vector<CigarOp>::const_iterator coIter;\r
- for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {\r
+ for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {\r
\r
switch(coIter->Type) {\r
case 'M':\r
\r
// write the SAM header text length\r
uint32_t samHeaderLen = samHeader.size();\r
- if ( IsBigEndian ) { SwapEndian_32(samHeaderLen); }\r
+ if (IsBigEndian) SwapEndian_32(samHeaderLen);\r
mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
\r
// write the SAM header text\r
- if(samHeaderLen > 0) {\r
+ if(samHeaderLen > 0) \r
mBGZF.Write(samHeader.data(), samHeaderLen);\r
- }\r
\r
// write the number of reference sequences\r
uint32_t numReferenceSequences = referenceSequences.size();\r
- if ( IsBigEndian ) { SwapEndian_32(numReferenceSequences); }\r
+ if (IsBigEndian) SwapEndian_32(numReferenceSequences);\r
mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
\r
// =============================\r
\r
// write the reference sequence name length\r
uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
- if ( IsBigEndian ) { SwapEndian_32(referenceSequenceNameLen); }\r
+ if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);\r
mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
\r
// write the reference sequence name\r
\r
// write the reference sequence length\r
int32_t referenceLength = rsIter->RefLength;\r
- if ( IsBigEndian ) { SwapEndian_32(referenceLength); }\r
+ if (IsBigEndian) SwapEndian_32(referenceLength);\r
mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
}\r
}\r
// saves the alignment to the alignment archive\r
void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
\r
- // assign the BAM core data\r
- uint32_t buffer[8];\r
- buffer[0] = al.RefID;\r
- buffer[1] = al.Position;\r
- buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;\r
- buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;\r
- buffer[4] = al.SupportData.QuerySequenceLength;\r
- buffer[5] = al.MateRefID;\r
- buffer[6] = al.MatePosition;\r
- buffer[7] = al.InsertSize;\r
-\r
- // write the block size\r
- unsigned int blockSize = al.SupportData.BlockLength;\r
- if ( IsBigEndian ) { SwapEndian_32(blockSize); }\r
- mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
-\r
- // swap BAM core endian-ness, if necessary\r
- if ( IsBigEndian ) { \r
- for ( int i = 0; i < 8; ++i ) { \r
- SwapEndian_32(buffer[i]); \r
- } \r
+ // if BamAlignment contains only the core data and a raw char data buffer\r
+ // (as a result of BamReader::GetNextAlignmentCore())\r
+ if ( al.SupportData.HasCoreOnly ) {\r
+ \r
+ // write the block size\r
+ unsigned int blockSize = al.SupportData.BlockLength;\r
+ if (IsBigEndian) SwapEndian_32(blockSize);\r
+ mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
+\r
+ // assign the BAM core data\r
+ uint32_t buffer[8];\r
+ buffer[0] = al.RefID;\r
+ buffer[1] = al.Position;\r
+ buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;\r
+ buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;\r
+ buffer[4] = al.SupportData.QuerySequenceLength;\r
+ buffer[5] = al.MateRefID;\r
+ buffer[6] = al.MatePosition;\r
+ buffer[7] = al.InsertSize;\r
+ \r
+ // swap BAM core endian-ness, if necessary\r
+ if ( IsBigEndian ) { \r
+ for ( int i = 0; i < 8; ++i )\r
+ SwapEndian_32(buffer[i]); \r
+ }\r
+ \r
+ // write the BAM core\r
+ mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
+ \r
+ // write the raw char data\r
+ mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); \r
}\r
\r
- // write the BAM core\r
- mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
-\r
- // if support data, not parsed out (resulted from BamReader::GetNextAlignmentCore()\r
- // write the raw char data\r
- if ( !al.SupportData.IsParsed )\r
- mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); \r
- \r
- // re-pack (as needed) & write the parsed char data\r
+ // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc\r
+ // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )\r
else {\r
- \r
- // initialize\r
- const unsigned int nameLen = al.Name.size() + 1;\r
- const unsigned int queryLen = al.QueryBases.size();\r
+ \r
+ // calculate char lengths\r
+ const unsigned int nameLength = al.Name.size() + 1;\r
+ const unsigned int numCigarOperations = al.CigarData.size();\r
+ const unsigned int queryLength = al.QueryBases.size();\r
const unsigned int tagDataLength = al.TagData.size();\r
\r
+ // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)\r
+ // force calculation of Bin before storing\r
+ const int endPosition = al.GetEndPosition();\r
+ const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);\r
+ \r
// create our packed cigar string\r
string packedCigar;\r
CreatePackedCigar(al.CigarData, packedCigar);\r
- const unsigned int packedCigarLen = packedCigar.size();\r
+ const unsigned int packedCigarLength = packedCigar.size();\r
\r
// encode the query\r
string encodedQuery;\r
EncodeQuerySequence(al.QueryBases, encodedQuery);\r
- const unsigned int encodedQueryLen = encodedQuery.size(); \r
+ const unsigned int encodedQueryLength = encodedQuery.size(); \r
\r
+ // write the block size\r
+ const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;\r
+ unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
+ if (IsBigEndian) SwapEndian_32(blockSize);\r
+ mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
+\r
+ // assign the BAM core data\r
+ uint32_t buffer[8];\r
+ buffer[0] = al.RefID;\r
+ buffer[1] = al.Position;\r
+ buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;\r
+ buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
+ buffer[4] = queryLength;\r
+ buffer[5] = al.MateRefID;\r
+ buffer[6] = al.MatePosition;\r
+ buffer[7] = al.InsertSize;\r
+ \r
+ // swap BAM core endian-ness, if necessary\r
+ if ( IsBigEndian ) { \r
+ for ( int i = 0; i < 8; ++i )\r
+ SwapEndian_32(buffer[i]); \r
+ }\r
+ \r
+ // write the BAM core\r
+ mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
+ \r
// write the query name\r
- mBGZF.Write(al.Name.c_str(), nameLen);\r
+ mBGZF.Write(al.Name.c_str(), nameLength);\r
\r
// write the packed cigar\r
if ( IsBigEndian ) {\r
\r
- char* cigarData = (char*)calloc(sizeof(char), packedCigarLen);\r
- memcpy(cigarData, packedCigar.data(), packedCigarLen);\r
+ char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);\r
+ memcpy(cigarData, packedCigar.data(), packedCigarLength);\r
\r
- for (unsigned int i = 0; i < packedCigarLen; ++i) {\r
- if ( IsBigEndian ) { \r
+ for (unsigned int i = 0; i < packedCigarLength; ++i) {\r
+ if ( IsBigEndian )\r
SwapEndian_32p(&cigarData[i]); \r
- }\r
}\r
\r
- mBGZF.Write(cigarData, packedCigarLen);\r
- free(cigarData);\r
- \r
- } else { \r
- mBGZF.Write(packedCigar.data(), packedCigarLen);\r
- }\r
+ mBGZF.Write(cigarData, packedCigarLength);\r
+ free(cigarData); \r
+ } \r
+ else \r
+ mBGZF.Write(packedCigar.data(), packedCigarLength);\r
\r
// write the encoded query sequence\r
- mBGZF.Write(encodedQuery.data(), encodedQueryLen);\r
+ mBGZF.Write(encodedQuery.data(), encodedQueryLength);\r
\r
// write the base qualities\r
- string baseQualities = al.Qualities;\r
+ string baseQualities(al.Qualities);\r
char* pBaseQualities = (char*)al.Qualities.data();\r
- for(unsigned int i = 0; i < queryLen; i++) { pBaseQualities[i] -= 33; }\r
- mBGZF.Write(pBaseQualities, queryLen);\r
+ for(unsigned int i = 0; i < queryLength; i++) { \r
+ pBaseQualities[i] -= 33; \r
+ }\r
+ mBGZF.Write(pBaseQualities, queryLength);\r
\r
// write the read group tag\r
if ( IsBigEndian ) {\r
\r
mBGZF.Write(tagData, tagDataLength);\r
free(tagData);\r
- } else {\r
- mBGZF.Write(al.TagData.data(), tagDataLength);\r
- } \r
+ } \r
+ else \r
+ mBGZF.Write(al.TagData.data(), tagDataLength); \r
}\r
}\r