--- /dev/null
+// ***************************************************************************
+// BamAlignment.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 18 September 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the BamAlignment data structure
+// ***************************************************************************
+
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <map>
+#include <utility>
+#include "BamAlignment.h"
+using namespace BamTools;
+
+// default ctor
+// Gives every scalar member a defined starting value. AlignmentFlag in particular
+// must start at zero: the Is<something>() queries test its bits and the
+// SetIs<something>() methods read-modify-write it, so an uninitialized flag word
+// would make their results indeterminate.
+// Initializers are listed in member declaration order.
+BamAlignment::BamAlignment(void)
+    : Length(0)
+    , RefID(-1)
+    , Position(-1)
+    , Bin(0)
+    , MapQuality(0)
+    , AlignmentFlag(0)
+    , MateRefID(-1)
+    , MatePosition(-1)
+    , InsertSize(0)
+{ }
+
+// copy ctor
+// Copies each member of 'other' into the new object, one initializer per member.
+BamAlignment::BamAlignment(const BamAlignment& other) :
+    Name(other.Name),
+    Length(other.Length),
+    QueryBases(other.QueryBases),
+    AlignedBases(other.AlignedBases),
+    Qualities(other.Qualities),
+    TagData(other.TagData),
+    RefID(other.RefID),
+    Position(other.Position),
+    Bin(other.Bin),
+    MapQuality(other.MapQuality),
+    AlignmentFlag(other.AlignmentFlag),
+    CigarData(other.CigarData),
+    MateRefID(other.MateRefID),
+    MatePosition(other.MatePosition),
+    InsertSize(other.InsertSize),
+    SupportData(other.SupportData)
+{
+}
+
+// dtor
+// The body is intentionally empty; there is nothing to release by hand here.
+BamAlignment::~BamAlignment()
+{
+}
+
+// Queries against alignment flags
+// Each query tests one bit of AlignmentFlag. IsMapped(), IsMateMapped() and
+// IsPrimaryAlignment() are the inverted ones: they report true when their bit
+// (UNMAPPED, MATE_UNMAPPED, SECONDARY respectively) is CLEAR.
+bool BamAlignment::IsDuplicate(void) const {
+    return (AlignmentFlag & DUPLICATE) == DUPLICATE;
+}
+
+bool BamAlignment::IsFailedQC(void) const {
+    return (AlignmentFlag & QC_FAILED) == QC_FAILED;
+}
+
+bool BamAlignment::IsFirstMate(void) const {
+    return (AlignmentFlag & READ_1) == READ_1;
+}
+
+bool BamAlignment::IsMapped(void) const {
+    return !(AlignmentFlag & UNMAPPED);
+}
+
+bool BamAlignment::IsMateMapped(void) const {
+    return !(AlignmentFlag & MATE_UNMAPPED);
+}
+
+bool BamAlignment::IsMateReverseStrand(void) const {
+    return (AlignmentFlag & MATE_REVERSE) == MATE_REVERSE;
+}
+
+bool BamAlignment::IsPaired(void) const {
+    return (AlignmentFlag & PAIRED) == PAIRED;
+}
+
+bool BamAlignment::IsPrimaryAlignment(void) const {
+    return !(AlignmentFlag & SECONDARY);
+}
+
+bool BamAlignment::IsProperPair(void) const {
+    return (AlignmentFlag & PROPER_PAIR) == PROPER_PAIR;
+}
+
+bool BamAlignment::IsReverseStrand(void) const {
+    return (AlignmentFlag & REVERSE) == REVERSE;
+}
+
+bool BamAlignment::IsSecondMate(void) const {
+    return (AlignmentFlag & READ_2) == READ_2;
+}
+
+// Manipulate alignment flags
+// Each setter turns one bit of AlignmentFlag on (ok == true) or off (ok == false)
+// and leaves every other bit as it was.
+void BamAlignment::SetIsDuplicate(bool ok) {
+    AlignmentFlag = ok ? (AlignmentFlag | DUPLICATE) : (AlignmentFlag & ~DUPLICATE);
+}
+
+void BamAlignment::SetIsFailedQC(bool ok) {
+    AlignmentFlag = ok ? (AlignmentFlag | QC_FAILED) : (AlignmentFlag & ~QC_FAILED);
+}
+
+void BamAlignment::SetIsFirstMate(bool ok) {
+    AlignmentFlag = ok ? (AlignmentFlag | READ_1) : (AlignmentFlag & ~READ_1);
+}
+
+void BamAlignment::SetIsMateUnmapped(bool ok) {
+    AlignmentFlag = ok ? (AlignmentFlag | MATE_UNMAPPED) : (AlignmentFlag & ~MATE_UNMAPPED);
+}
+
+void BamAlignment::SetIsMateReverseStrand(bool ok) {
+    AlignmentFlag = ok ? (AlignmentFlag | MATE_REVERSE) : (AlignmentFlag & ~MATE_REVERSE);
+}
+
+void BamAlignment::SetIsPaired(bool ok) {
+    AlignmentFlag = ok ? (AlignmentFlag | PAIRED) : (AlignmentFlag & ~PAIRED);
+}
+
+void BamAlignment::SetIsProperPair(bool ok) {
+    AlignmentFlag = ok ? (AlignmentFlag | PROPER_PAIR) : (AlignmentFlag & ~PROPER_PAIR);
+}
+
+void BamAlignment::SetIsReverseStrand(bool ok) {
+    AlignmentFlag = ok ? (AlignmentFlag | REVERSE) : (AlignmentFlag & ~REVERSE);
+}
+
+void BamAlignment::SetIsSecondaryAlignment(bool ok) {
+    AlignmentFlag = ok ? (AlignmentFlag | SECONDARY) : (AlignmentFlag & ~SECONDARY);
+}
+
+void BamAlignment::SetIsSecondMate(bool ok) {
+    AlignmentFlag = ok ? (AlignmentFlag | READ_2) : (AlignmentFlag & ~READ_2);
+}
+
+void BamAlignment::SetIsUnmapped(bool ok) {
+    AlignmentFlag = ok ? (AlignmentFlag | UNMAPPED) : (AlignmentFlag & ~UNMAPPED);
+}
+
+// Derives the alignment end coordinate from the start Position plus the lengths
+// of the CIGAR operations.
+// @usePadded - when true, the lengths of 'I' operations are added as well
+// @zeroBased - when true, the accumulated sum minus one is returned; otherwise
+//              the sum itself is returned
+// 'M', 'D' and 'N' operations are always added; other operation types are ignored.
+int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const {
+
+    // running end coordinate, seeded with the alignment start
+    int endPosition = Position;
+
+    // accumulate the length of every operation that counts
+    typedef std::vector<CigarOp>::const_iterator CigarIterator;
+    for ( CigarIterator op = CigarData.begin(); op != CigarData.end(); ++op ) {
+        const char opType = op->Type;
+        switch ( opType ) {
+            case 'M':
+            case 'D':
+            case 'N':
+                endPosition += op->Length;
+                break;
+            case 'I':
+                if ( usePadded ) endPosition += op->Length;
+                break;
+            default:
+                break;
+        }
+    }
+
+    // shift back by one when the zero-based form is requested
+    return ( zeroBased ? endPosition - 1 : endPosition );
+}
+
+// Appends a new string-valued tag to TagData as TAG + TYPE + VALUE + NUL.
+// @tag   - two character tag name
+// @type  - single character storage type; must be "Z" or "H"
+// @value - text to store. Only the characters before the first NUL byte (if any)
+//          are stored, because the record itself is NUL-terminated.
+// Returns true if the tag was added; false if this alignment holds core data
+// only, the arguments are malformed, or TAG is already present (use EditTag()).
+bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type != "Z" && type != "H" ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
+
+    // grow TagData in place rather than staging through a stack array whose size
+    // depends on the input; reserving first means the appends cannot reallocate midway
+    TagData.reserve(tagDataLength + tag.size() + type.size() + value.size() + 1);
+    TagData.append(tag);
+    TagData.append(type);
+    TagData.append(value.c_str());  // stops at the first NUL, so no stray bytes are stored
+    TagData.push_back('\0');        // record terminator
+
+    // return success
+    return true;
+}
+
+// Appends a new integer-coded tag to TagData as TAG + TYPE + VALUE, where VALUE
+// takes exactly as many bytes as TYPE calls for so that later tags stay parseable.
+// @tag   - two character tag name
+// @type  - single character storage type: A, c, C (1 byte), s, S (2 bytes)
+//          or i, I (4 bytes)
+// @value - value to store; for the 1- and 2-byte types only the low-order bits
+//          that fit are kept
+// Returns true if the tag was added; false if this alignment holds core data
+// only, the arguments are malformed, TYPE is not one of the types listed above,
+// or TAG is already present (use EditTag()).
+bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+
+    // number of value bytes the storage type occupies
+    unsigned int numValueBytes = 0;
+    switch ( type[0] ) {
+        case 'A':
+        case 'c':
+        case 'C':
+            numValueBytes = 1;
+            break;
+        case 's':
+        case 'S':
+            numValueBytes = 2;
+            break;
+        case 'i':
+        case 'I':
+            numValueBytes = 4;
+            break;
+        default:
+            // 'f', 'Z', 'H' and anything unrecognized cannot hold an integer
+            return false;
+    }
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
+
+    // narrow the value to the target width, then take its in-memory bytes
+    char valueBuffer[sizeof(uint32_t)];
+    if ( numValueBytes == 1 ) {
+        const unsigned char narrowValue = static_cast<unsigned char>(value);
+        memcpy(valueBuffer, &narrowValue, sizeof(narrowValue));
+    } else if ( numValueBytes == 2 ) {
+        const uint16_t narrowValue = static_cast<uint16_t>(value);
+        memcpy(valueBuffer, &narrowValue, sizeof(narrowValue));
+    } else {
+        memcpy(valueBuffer, &value, sizeof(uint32_t));
+    }
+
+    // append the new record directly to TagData
+    TagData.reserve(tagDataLength + tag.size() + type.size() + numValueBytes);
+    TagData.append(tag);
+    TagData.append(type);
+    TagData.append(valueBuffer, numValueBytes);
+
+    // return success
+    return true;
+}
+
+// Signed convenience overload: converts VALUE to its unsigned 32-bit equivalent
+// and forwards to the uint32_t overload.
+bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) {
+    const uint32_t unsignedValue = static_cast<uint32_t>(value);
+    return AddTag(tag, type, unsignedValue);
+}
+
+// Appends a new tag holding the 4 raw bytes of a float as TAG + TYPE + VALUE.
+// @tag   - two character tag name
+// @type  - single character storage type; must be one that occupies 4 bytes
+//          (f, i or I), otherwise the 4 value bytes would desynchronize every
+//          tag that follows
+// @value - value to store
+// Returns true if the tag was added; false if this alignment holds core data
+// only, the arguments are malformed, TYPE is not a 4-byte type, or TAG is
+// already present (use EditTag()).
+// NOTE(review): with type i or I the float's bit pattern is stored as-is (no
+// numeric conversion), exactly as before - confirm that is what callers expect.
+bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+
+    // only 4-byte storage types can take the 4 value bytes written below
+    switch ( type[0] ) {
+        case 'f':
+        case 'i':
+        case 'I':
+            break;
+        default:
+            return false;
+    }
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
+
+    // take the value's in-memory bytes
+    char valueBuffer[sizeof(float)];
+    memcpy(valueBuffer, &value, sizeof(float));
+
+    // append the new record directly to TagData
+    TagData.reserve(tagDataLength + tag.size() + type.size() + sizeof(float));
+    TagData.append(tag);
+    TagData.append(type);
+    TagData.append(valueBuffer, sizeof(float));
+
+    // return success
+    return true;
+}
+
+// Replaces the value of an existing tag with a string value, or adds the tag
+// when it is not present. The rewritten record is TAG + TYPE + VALUE + NUL.
+// @tag   - two character tag name
+// @type  - single character storage type; must be "Z" or "H". The stored type
+//          code is rewritten to this value.
+// @value - text to store; only the characters before the first NUL byte (if
+//          any) are stored
+// Returns true on success; false if this alignment holds core data only, the
+// arguments are malformed, or the existing tag data cannot be parsed.
+bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type != "Z" && type != "H" ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int originalTagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, attempt AddTag
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+        return AddTag(tag, type, value);
+
+    // FindTag leaves the cursor on the first value byte, 3 bytes past the tag start;
+    // a position beyond the buffer means the tag header itself was truncated
+    if ( numBytesParsed > originalTagDataLength ) return false;
+    const unsigned int valueBegin = numBytesParsed;
+
+    // step over the OLD value, which is laid out according to the OLD storage type
+    const char oldType = *(pTagData - 1);
+    if ( !SkipToNextTag(oldType, pTagData, numBytesParsed) ) return false;
+    const unsigned int valueEnd = ( numBytesParsed < originalTagDataLength ? numBytesParsed
+                                                                           : originalTagDataLength );
+
+    // rebuild: [leading tags + tag name] [new type] [new value + NUL] [trailing tags]
+    std::string newTagData;
+    newTagData.reserve(originalTagDataLength + value.size() + 1);
+    newTagData.append(TagData, 0, valueBegin - 1);
+    newTagData.append(type);
+    newTagData.append(value.c_str());
+    newTagData.push_back('\0');
+    newTagData.append(TagData, valueEnd, std::string::npos);
+
+    // save new tag data
+    TagData.swap(newTagData);
+    return true;
+}
+
+// Replaces the value of an existing tag with an integer value, or adds the tag
+// when it is not present. The rewritten record is TAG + TYPE + VALUE, where VALUE
+// takes exactly as many bytes as TYPE calls for.
+// @tag   - two character tag name
+// @type  - single character storage type: A, c, C (1 byte), s, S (2 bytes)
+//          or i, I (4 bytes). The stored type code is rewritten to this value.
+// @value - value to store; when an existing tag is rewritten, only the
+//          low-order bits that fit the 1- and 2-byte types are kept
+// Returns true on success; false if this alignment holds core data only, the
+// arguments are malformed, TYPE is not one of the types listed above, or the
+// existing tag data cannot be parsed.
+bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+
+    // number of value bytes the requested storage type occupies
+    unsigned int numValueBytes = 0;
+    switch ( type[0] ) {
+        case 'A':
+        case 'c':
+        case 'C':
+            numValueBytes = 1;
+            break;
+        case 's':
+        case 'S':
+            numValueBytes = 2;
+            break;
+        case 'i':
+        case 'I':
+            numValueBytes = 4;
+            break;
+        default:
+            // 'f', 'Z', 'H' and anything unrecognized cannot hold an integer
+            return false;
+    }
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int originalTagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, attempt AddTag
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+        return AddTag(tag, type, value);
+
+    // FindTag leaves the cursor on the first value byte, 3 bytes past the tag start;
+    // a position beyond the buffer means the tag header itself was truncated
+    if ( numBytesParsed > originalTagDataLength ) return false;
+    const unsigned int valueBegin = numBytesParsed;
+
+    // step over the OLD value, which is laid out according to the OLD storage type
+    const char oldType = *(pTagData - 1);
+    if ( !SkipToNextTag(oldType, pTagData, numBytesParsed) ) return false;
+    const unsigned int valueEnd = ( numBytesParsed < originalTagDataLength ? numBytesParsed
+                                                                           : originalTagDataLength );
+
+    // narrow the value to the target width, then take its in-memory bytes
+    char valueBuffer[sizeof(uint32_t)];
+    if ( numValueBytes == 1 ) {
+        const unsigned char narrowValue = static_cast<unsigned char>(value);
+        memcpy(valueBuffer, &narrowValue, sizeof(narrowValue));
+    } else if ( numValueBytes == 2 ) {
+        const uint16_t narrowValue = static_cast<uint16_t>(value);
+        memcpy(valueBuffer, &narrowValue, sizeof(narrowValue));
+    } else {
+        memcpy(valueBuffer, &value, sizeof(uint32_t));
+    }
+
+    // rebuild: [leading tags + tag name] [new type] [new value] [trailing tags]
+    std::string newTagData;
+    newTagData.reserve(originalTagDataLength + sizeof(uint32_t));
+    newTagData.append(TagData, 0, valueBegin - 1);
+    newTagData.append(type);
+    newTagData.append(valueBuffer, numValueBytes);
+    newTagData.append(TagData, valueEnd, std::string::npos);
+
+    // save new tag data
+    TagData.swap(newTagData);
+    return true;
+}
+
+// Signed convenience overload: converts VALUE to its unsigned 32-bit equivalent
+// and forwards to the uint32_t overload.
+bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) {
+    const uint32_t unsignedValue = static_cast<uint32_t>(value);
+    return EditTag(tag, type, unsignedValue);
+}
+
+// Replaces the value of an existing tag with the 4 raw bytes of a float, or adds
+// the tag when it is not present. The rewritten record is TAG + TYPE + VALUE.
+// @tag   - two character tag name
+// @type  - single character storage type; must be one that occupies 4 bytes
+//          (f, i or I). The stored type code is rewritten to this value.
+// @value - value to store
+// Returns true on success; false if this alignment holds core data only, the
+// arguments are malformed, TYPE is not a 4-byte type, or the existing tag data
+// cannot be parsed.
+// NOTE(review): with type i or I the float's bit pattern is stored as-is (no
+// numeric conversion), exactly as before - confirm that is what callers expect.
+bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+
+    // only 4-byte storage types can take the 4 value bytes written below
+    switch ( type[0] ) {
+        case 'f':
+        case 'i':
+        case 'I':
+            break;
+        default:
+            return false;
+    }
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int originalTagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, attempt AddTag
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+        return AddTag(tag, type, value);
+
+    // FindTag leaves the cursor on the first value byte, 3 bytes past the tag start;
+    // a position beyond the buffer means the tag header itself was truncated
+    if ( numBytesParsed > originalTagDataLength ) return false;
+    const unsigned int valueBegin = numBytesParsed;
+
+    // step over the OLD value, which is laid out according to the OLD storage type
+    const char oldType = *(pTagData - 1);
+    if ( !SkipToNextTag(oldType, pTagData, numBytesParsed) ) return false;
+    const unsigned int valueEnd = ( numBytesParsed < originalTagDataLength ? numBytesParsed
+                                                                           : originalTagDataLength );
+
+    // take the value's in-memory bytes
+    char valueBuffer[sizeof(float)];
+    memcpy(valueBuffer, &value, sizeof(float));
+
+    // rebuild: [leading tags + tag name] [new type] [new value] [trailing tags]
+    std::string newTagData;
+    newTagData.reserve(originalTagDataLength + sizeof(float));
+    newTagData.append(TagData, 0, valueBegin - 1);
+    newTagData.append(type);
+    newTagData.append(valueBuffer, sizeof(float));
+    newTagData.append(TagData, valueEnd, std::string::npos);
+
+    // save new tag data
+    TagData.swap(newTagData);
+    return true;
+}
+
+// get "NM" tag data - originally contributed by Aaron Quinlan
+// Thin wrapper over GetTag(): looks up the "NM" tag and, if it can be read as an
+// unsigned integer, stores it in 'editDistance'. Returns success/fail.
+bool BamAlignment::GetEditDistance(uint32_t& editDistance) const {
+    const std::string editDistanceTag("NM");
+    return GetTag(editDistanceTag, editDistance);
+}
+
+// get "RG" tag data
+// Thin wrapper over GetTag(): looks up the "RG" tag and, if found, stores its
+// string value in 'readGroup'. Returns success/fail.
+bool BamAlignment::GetReadGroup(std::string& readGroup) const {
+    const std::string readGroupTag("RG");
+    return GetTag(readGroupTag, readGroup);
+}
+
+// Retrieves the value of a string-valued tag.
+// @tag         - two character tag name
+// @destination - receives the tag's text (without the terminating NUL) on success;
+//                left untouched on failure
+// Returns true if TAG was found and is stored as type Z or H; false if this
+// alignment holds core data only, there is no tag data, TAG is absent, or TAG is
+// stored with a non-string type (an error is printed to stderr in that case, in
+// line with the numeric GetTag() overloads).
+bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const {
+
+    // make sure tag data exists
+    if ( SupportData.HasCoreOnly || TagData.empty() )
+        return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, return failure
+    if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
+        return false;
+
+    // a cursor beyond the buffer means the tag header itself was truncated
+    if ( numBytesParsed > tagDataLength )
+        return false;
+
+    // only NUL-terminated text types can be read as a string; scanning a binary
+    // value for a terminator would run into the following tags
+    const char type = *(pTagData - 1);
+    if ( type != 'Z' && type != 'H' ) {
+        fprintf(stderr, "ERROR: Cannot store tag of type %c in string destination\n", type);
+        return false;
+    }
+
+    // measure the value, never looking past the end of the tag data
+    const unsigned int remainingBytes = tagDataLength - numBytesParsed;
+    unsigned int dataLength = 0;
+    while ( dataLength < remainingBytes && pTagData[dataLength] != '\0' )
+        ++dataLength;
+
+    // store in destination
+    destination.assign(pTagData, dataLength);
+    return true;
+}
+
+// Retrieves the value of an integer-coded tag.
+// @tag         - two character tag name
+// @destination - zeroed, then filled with the tag's 1, 2 or 4 value bytes on
+//                success; left untouched on failure
+// Returns true if TAG was found with storage type A, c, C, s, S, i or I; false
+// if this alignment holds core data only, there is no tag data, TAG is absent, or
+// its storage type is f, Z, H or unknown (an error is printed to stderr for the
+// unsupported/unknown type cases).
+bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const {
+
+    // nothing to search in
+    if ( SupportData.HasCoreOnly || TagData.empty() ) return false;
+
+    // set up a cursor over the raw tag bytes
+    char* cursor = (char*)TagData.data();
+    const unsigned int totalBytes = TagData.size();
+    unsigned int bytesConsumed = 0;
+
+    // tag not found, return failure
+    if ( !FindTag(tag, cursor, totalBytes, bytesConsumed) ) return false;
+
+    // the storage type code sits immediately before the value
+    const char storageType = cursor[-1];
+    int valueBytes = 0;
+    switch ( storageType ) {
+
+        case 'A': case 'c': case 'C':
+            valueBytes = 1;
+            break;
+
+        case 's': case 'S':
+            valueBytes = 2;
+            break;
+
+        case 'i': case 'I':
+            valueBytes = 4;
+            break;
+
+        // float or var-length strings do not fit an integer destination
+        case 'f': case 'Z': case 'H':
+            fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", storageType);
+            return false;
+
+        default:
+            fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType);
+            return false;
+    }
+
+    // clear first so that bytes not covered by a short value read as zero
+    destination = 0;
+    memcpy(&destination, cursor, valueBytes);
+    return true;
+}
+
+// Signed convenience overload: lets the uint32_t overload write straight into
+// the caller's int32_t by viewing it as an unsigned 32-bit integer.
+bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const {
+    uint32_t& unsignedView = reinterpret_cast<uint32_t&>(destination);
+    return GetTag(tag, unsignedView);
+}
+
+// Retrieves the value of a tag into a float destination.
+// @tag         - two character tag name
+// @destination - set to 0.0, then overwritten with the tag's 1, 2 or 4 value
+//                bytes on success; left untouched on failure
+// Returns true if TAG was found with storage type A, c, C, s, S, f, i or I;
+// false if this alignment holds core data only, there is no tag data, TAG is
+// absent, or its storage type is Z, H or unknown (an error is printed to stderr
+// for the unsupported/unknown type cases).
+// NOTE(review): for the integer-coded types the value bytes are copied into the
+// float verbatim rather than converted numerically - confirm this is intended.
+bool BamAlignment::GetTag(const std::string& tag, float& destination) const {
+
+    // make sure tag data exists
+    if ( SupportData.HasCoreOnly || TagData.empty() )
+        return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag found, determine data byte-length, store data in destination, return success
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
+
+        // determine data byte-length from the type code preceding the value
+        const char type = *(pTagData - 1);
+        int destinationLength = 0;
+        switch(type) {
+
+            // 1 byte data
+            case 'A':
+            case 'c':
+            case 'C':
+                destinationLength = 1;
+                break;
+
+            // 2 byte data
+            case 's':
+            case 'S':
+                destinationLength = 2;
+                break;
+
+            // 4 byte data
+            case 'f':
+            case 'i':
+            case 'I':
+                destinationLength = 4;
+                break;
+
+            // unsupported type (var-length strings)
+            case 'Z':
+            case 'H':
+                // the destination here is a float, so say so in the message
+                fprintf(stderr, "ERROR: Cannot store tag of type %c in float destination\n", type);
+                return false;
+
+            // unknown tag type
+            default:
+                fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
+                return false;
+        }
+
+        // store in destination
+        destination = 0.0;
+        memcpy(&destination, pTagData, destinationLength);
+        return true;
+    }
+
+    // tag not found, return failure
+    return false;
+}
+
+// Removes a tag (name, type code and value) from TagData.
+// @tag - two character tag name
+// Returns true if the tag was removed; false if this alignment holds core data
+// only, there is no tag data, TAG is absent, or the tag's storage type is
+// unknown so its extent cannot be determined (nothing is removed in that case).
+bool BamAlignment::RemoveTag(const std::string& tag) {
+
+    // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed
+    // also, return false if no data present to remove
+    if ( SupportData.HasCoreOnly || TagData.empty() ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int originalTagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, no removal - return failure
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) return false;
+
+    // FindTag leaves the cursor on the first value byte, 3 bytes past the tag start;
+    // a position beyond the buffer means the tag header itself was truncated
+    if ( numBytesParsed > originalTagDataLength ) return false;
+    const unsigned int tagBegin = numBytesParsed - 3;
+
+    // step over the value; without a known storage type the tag's end is unknown,
+    // so report failure instead of claiming a removal that did not happen
+    const char storageType = *(pTagData - 1);
+    if ( !SkipToNextTag(storageType, pTagData, numBytesParsed) ) return false;
+
+    // never reach past the end of the data, even if the last value is cut short
+    const unsigned int tagEnd = ( numBytesParsed < originalTagDataLength ? numBytesParsed
+                                                                         : originalTagDataLength );
+
+    // drop [tagBegin, tagEnd) in place; pTagData is stale after this and must not be used
+    TagData.erase(tagBegin, tagEnd - tagBegin);
+    return true;
+}
+
+// Scans raw tag data for a tag whose two-character name matches TAG.
+// @tag            - two character tag name to look for
+// @pTagData       - in: cursor at the start of a tag record; out (on success):
+//                   cursor at the first byte of the matching tag's value, so the
+//                   storage type code is at pTagData[-1]
+// @tagDataLength  - total number of bytes of tag data
+// @numBytesParsed - in: bytes already consumed; out: advanced in step with pTagData
+// Returns true on a match. Returns false when no tag matches, when a record's
+// type code is NUL or unknown, or when fewer than the 3 header bytes (2 name +
+// 1 type) remain. On false the out-parameters hold an unspecified scan position.
+bool BamAlignment::FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) {
+
+    while ( numBytesParsed < tagDataLength ) {
+
+        // a record header is 3 bytes; do not read it if it is not all there
+        if ( tagDataLength - numBytesParsed < 3 ) return false;
+
+        const char* pTagType = pTagData;
+        const char* pTagStorageType = pTagData + 2;
+        pTagData += 3;
+        numBytesParsed += 3;
+
+        // check the current tag, return true on match
+        if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 )
+            return true;
+
+        // get the storage class and find the next tag
+        if ( *pTagStorageType == '\0' ) return false;
+        if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
+
+        // nothing left after the skipped value: stop before dereferencing the cursor
+        if ( numBytesParsed >= tagDataLength ) return false;
+        if ( *pTagData == '\0' ) return false;
+    }
+
+    // checked all tags, none match
+    return false;
+}
+
+// Advances the cursor past one tag value.
+// @storageType    - type code of the value the cursor currently points at
+// @pTagData       - cursor at the first byte of the value; moved past the value
+// @numBytesParsed - byte counter, advanced by the same amount as the cursor
+// Fixed-width types advance by 1 (A, c, C), 2 (s, S) or 4 (f, i, I) bytes; Z and H
+// advance past the text and its NUL terminator. Returns false, printing an error
+// to stderr and leaving both out-parameters unchanged, for any other type code.
+bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
+
+    // work out how far to move, then move once at the end
+    unsigned int stride = 0;
+
+    switch ( storageType ) {
+
+        case 'A': case 'c': case 'C':
+            stride = 1;
+            break;
+
+        case 's': case 'S':
+            stride = 2;
+            break;
+
+        case 'f': case 'i': case 'I':
+            stride = 4;
+            break;
+
+        case 'Z': case 'H':
+            // text up to the terminator, plus the terminator itself
+            stride = static_cast<unsigned int>(std::strlen(pTagData)) + 1;
+            break;
+
+        default:
+            // error case
+            fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType);
+            return false;
+    }
+
+    numBytesParsed += stride;
+    pTagData += stride;
+
+    // return success
+    return true;
+}
\ No newline at end of file
--- /dev/null
+// ***************************************************************************
+// BamAlignment.h (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 18 September 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the BamAlignment data structure
+// ***************************************************************************
+
+#ifndef BAMALIGNMENT_H
+#define BAMALIGNMENT_H
+
+#include <string>
+#include <vector>
+#include "BamAux.h"
+
+namespace BamTools {
+
+// BamAlignment data structure
+// explicitly labeled as 'struct' to indicate that (most of) its fields are public
+// Holds one alignment record: its public data fields, bit-flag query/set helpers,
+// and accessors for the raw tag bytes kept in TagData.
+struct BamAlignment {
+
+ // constructors & destructor
+ public:
+ BamAlignment(void);
+ BamAlignment(const BamAlignment& other);
+ ~BamAlignment(void);
+
+ // Queries against alignment flags
+ public:
+ bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate
+ bool IsFailedQC(void) const; // Returns true if this read failed quality control
+ bool IsFirstMate(void) const; // Returns true if alignment is first mate on read
+ bool IsMapped(void) const; // Returns true if alignment is mapped
+ bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped
+ bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand
+ bool IsPaired(void) const; // Returns true if alignment part of paired-end read
+ bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment
+ bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution
+ bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand
+ bool IsSecondMate(void) const; // Returns true if alignment is second mate on read
+
+ // Manipulate alignment flags
+ // Each method sets its flag when passed true and clears it when passed false.
+ public:
+ void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag
+ void SetIsFailedQC(bool ok); // Sets "failed quality control" flag
+ void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag
+ void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is unmapped" flag (true means the mate is NOT mapped)
+ void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag
+ void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag
+ void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag
+ void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag
+ void SetIsSecondaryAlignment(bool ok); // Sets "position is a secondary (non-primary) alignment" flag
+ void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag
+ void SetIsUnmapped(bool ok); // Sets "alignment is unmapped" flag (true means the alignment is NOT mapped)
+
+ // Tag data access methods
+ public:
+ // -------------------------------------------------------------------------------------
+ // N.B. - The following tag access methods may not be used on BamAlignments fetched
+ // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in
+ // error message (to keep output clean) but will ALWAYS return false. Only user-created
+ // BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid here.
+
+ // add tag data (create new TAG entry with TYPE and VALUE)
+ // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
+ // returns true if new data added, false if error or TAG already exists
+ // N.B. - will NOT modify existing tag. Use EditTag() instead
+ // @tag - two character tag name
+ // @type - single character tag type (see SAM/BAM spec for details)
+ // @value - value to associate with tag
+ bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
+ bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i
+ bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i
+ bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f
+
+ // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present)
+ // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
+ // returns true if edit was successful, false if error
+ // @tag - two character tag name
+ // @type - single character tag type (see SAM/BAM spec for details)
+ // @value - new value for tag
+ bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
+ bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i
+ bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i
+ bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f
+
+ // specific tag data access methods - these only remain for legacy support
+ // returns whether specific tag could be retrieved
+ bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (equivalent to GetTag("NM", editDistance))
+ bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (equivalent to GetTag("RG", readGroup))
+
+ // generic tag data access methods
+ // returns whether tag is found & tag type is compatible with DESTINATION
+ // @tag - two character tag name
+ // @destination - if found, tag value is stored here
+ bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings
+ bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data
+ bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data
+ bool GetTag(const std::string& tag, float& destination) const; // access floating point data
+
+ // remove tag data
+ // returns true if removal was successful, false if error
+ // N.B. - returns false if TAG does not exist (no removal can occur)
+ // @tag - two character tag name
+ bool RemoveTag(const std::string& tag);
+
+ // Additional data access methods
+ public:
+ // calculates & returns alignment end position, based on starting position and CIGAR operations
+ // @usePadded - if true, counts inserted bases. Default is false, so that alignment end position matches the last base's position in reference
+ // @zeroBased - if true, returns 0-based coordinate; else returns 1-based. Setting this to false is useful when using BAM data along with other, half-open formats.
+ int GetEndPosition(bool usePadded = false, bool zeroBased = true) const;
+
+ // 'internal' utility methods
+ // Both walk raw tag bytes through a cursor (pTagData) and a running byte count
+ // (numBytesParsed) that are passed by reference and advanced together.
+ private:
+ static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed);
+ static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);
+
+ // Data members
+ public:
+ std::string Name; // Read name
+ int32_t Length; // Query length
+ std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)
+ std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)
+ std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
+ std::string TagData; // Tag data (accessor methods will pull the requested information out)
+ int32_t RefID; // ID number for reference sequence
+ int32_t Position; // Position (0-based) where alignment starts
+ uint16_t Bin; // Bin in BAM file where this alignment resides
+ uint16_t MapQuality; // Mapping quality score
+ uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate
+ std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
+ int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
+ int32_t MatePosition; // Position (0-based) where alignment's mate starts
+ int32_t InsertSize; // Mate-pair insert size
+
+ // internal data
+ private:
+ struct BamAlignmentSupportData {
+
+ // data members
+ std::string AllCharData;
+ uint32_t BlockLength;
+ uint32_t NumCigarOperations;
+ uint32_t QueryNameLength;
+ uint32_t QuerySequenceLength;
+ bool HasCoreOnly; // when true, every tag access method returns false
+
+ // constructor
+ // zeroes the length counters and clears HasCoreOnly
+ BamAlignmentSupportData(void)
+ : BlockLength(0)
+ , NumCigarOperations(0)
+ , QueryNameLength(0)
+ , QuerySequenceLength(0)
+ , HasCoreOnly(false)
+ { }
+ };
+
+ // contains raw character data & lengths
+ BamAlignmentSupportData SupportData;
+
+ // allow these classes access to BamAlignment private members (SupportData)
+ // but client code should not need to touch this data
+ friend class BamReader;
+ friend class BamWriter;
+
+ // Alignment flag query constants
+ // Use the get/set methods above instead
+ // Each constant is a single bit within AlignmentFlag.
+ private:
+ enum { PAIRED = 1
+ , PROPER_PAIR = 2
+ , UNMAPPED = 4
+ , MATE_UNMAPPED = 8
+ , REVERSE = 16
+ , MATE_REVERSE = 32
+ , READ_1 = 64
+ , READ_2 = 128
+ , SECONDARY = 256
+ , QC_FAILED = 512
+ , DUPLICATE = 1024
+ };
+};
+
+// convenience typedef(s)
+typedef std::vector<BamAlignment> BamAlignmentVector;
+
+} // namespace BamTools
+
+#endif // BAMALIGNMENT_H
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 16 September 2010 (DB)\r
+// Last modified: 18 September 2010 (DB)\r
// ---------------------------------------------------------------------------\r
-// Provides the basic constants, data structures, etc. for using BAM files\r
+// Provides the basic constants, data structures, utilities etc. \r
+// used throughout the API for handling BAM files\r
// ***************************************************************************\r
\r
#ifndef BAMAUX_H\r
#define BAMAUX_H\r
\r
-// C inclues\r
-#include <cctype>\r
-#include <cstdio>\r
-#include <cstdlib>\r
-#include <cstring>\r
-\r
-// C++ includes\r
-#include <exception>\r
-#include <fstream>\r
+#include <fstream> \r
#include <iostream>\r
-#include <map>\r
#include <string>\r
-#include <utility>\r
#include <vector>\r
\r
+// ----------------------------------------------------------------\r
+// ----------------------------------------------------------------\r
// Platform-specific type definitions\r
+\r
#ifndef BAMTOOLS_TYPES\r
#define BAMTOOLS_TYPES\r
#ifdef _MSC_VER\r
\r
namespace BamTools {\r
\r
+// ----------------------------------------------------------------\r
+// ----------------------------------------------------------------\r
// BAM constants\r
+\r
const int BAM_CMATCH = 0;\r
const int BAM_CINS = 1;\r
const int BAM_CDEL = 2;\r
const int BAM_CORE_SIZE = 32;\r
const int BT_SIZEOF_INT = 4;\r
\r
-struct CigarOp;\r
-\r
-struct BamAlignment {\r
-\r
- // constructors & destructor\r
- public:\r
- BamAlignment(void);\r
- BamAlignment(const BamAlignment& other);\r
- ~BamAlignment(void);\r
-\r
- // Queries against alignment flags\r
- public: \r
- bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate \r
- bool IsFailedQC(void) const; // Returns true if this read failed quality control \r
- bool IsFirstMate(void) const; // Returns true if alignment is first mate on read \r
- bool IsMapped(void) const; // Returns true if alignment is mapped \r
- bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped \r
- bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand \r
- bool IsPaired(void) const; // Returns true if alignment part of paired-end read \r
- bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment \r
- bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution \r
- bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand\r
- bool IsSecondMate(void) const; // Returns true if alignment is second mate on read\r
-\r
- // Manipulate alignment flags\r
- public: \r
- void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag \r
- void SetIsFailedQC(bool ok); // Sets "failed quality control" flag \r
- void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag \r
- void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is mapped" flag \r
- void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag \r
- void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag \r
- void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag \r
- void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag \r
- void SetIsSecondaryAlignment(bool ok); // Sets "position is primary alignment" flag \r
- void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag \r
- void SetIsUnmapped(bool ok); // Sets "alignment is mapped" flag\r
-\r
- // Tag data access methods\r
- public:\r
- // -------------------------------------------------------------------------------------\r
- // N.B. - The following tag-modifying methods may not be used on BamAlignments fetched\r
- // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in \r
- // error message (to keep output clean) but will ALWAYS return false. Only user-\r
- // generated BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid.\r
-\r
- // add tag data (create new TAG entry with TYPE and VALUE)\r
- // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details\r
- // returns true if new data added, false if error or TAG already exists\r
- // N.B. - will NOT modify existing tag. Use EditTag() instead\r
- bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H\r
- bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i\r
- bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i\r
- bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f\r
- \r
- // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present)\r
- // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details\r
- // returns true if edit was successfaul, false if error\r
- bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H\r
- bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i\r
- bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i\r
- bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f\r
-\r
- // specific tag data access methods - these only remain for legacy support\r
- bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (implemented as GetTag("NM", editDistance))\r
- bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (implemented as GetTag("RG", readGroup)) \r
- \r
- // generic tag data access methods \r
- bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings \r
- bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data\r
- bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data\r
- bool GetTag(const std::string& tag, float& destination) const; // access floating point data\r
- \r
- // remove tag data\r
- // returns true if removal was successful, false if error\r
- // N.B. - returns false if TAG does not exist (no removal can occur)\r
- bool RemoveTag(const std::string& tag);\r
-\r
- // Additional data access methods\r
- public:\r
- // calculates alignment end position, based on starting position and CIGAR operations\r
- // @zeroBased - if true, returns 0-based coordinate; else returns 1-based\r
- int GetEndPosition(bool usePadded = false, bool zeroBased = true) const; \r
-\r
- // 'internal' utility methods \r
- private:\r
- static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed);\r
- static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);\r
-\r
- // Data members\r
- public:\r
- std::string Name; // Read name\r
- int32_t Length; // Query length\r
- std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)\r
- std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)\r
- std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)\r
- std::string TagData; // Tag data (accessor methods will pull the requested information out)\r
- int32_t RefID; // ID number for reference sequence\r
- int32_t Position; // Position (0-based) where alignment starts\r
- uint16_t Bin; // Bin in BAM file where this alignment resides\r
- uint16_t MapQuality; // Mapping quality score\r
- uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate \r
- std::vector<CigarOp> CigarData; // CIGAR operations for this alignment\r
- int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned\r
- int32_t MatePosition; // Position (0-based) where alignment's mate starts\r
- int32_t InsertSize; // Mate-pair insert size\r
- \r
- // internal data\r
- private:\r
- struct BamAlignmentSupportData {\r
- \r
- // data members\r
- std::string AllCharData;\r
- uint32_t BlockLength;\r
- uint32_t NumCigarOperations;\r
- uint32_t QueryNameLength;\r
- uint32_t QuerySequenceLength;\r
- bool HasCoreOnly;\r
- \r
- // constructor\r
- BamAlignmentSupportData(void)\r
- : BlockLength(0)\r
- , NumCigarOperations(0)\r
- , QueryNameLength(0)\r
- , QuerySequenceLength(0)\r
- , HasCoreOnly(false)\r
- { }\r
- };\r
- \r
- // contains raw character data & lengths\r
- BamAlignmentSupportData SupportData; \r
- \r
- // allow these classes access to BamAlignment private members (SupportData)\r
- // but client code should not need to touch this data\r
- friend class BamReader;\r
- friend class BamWriter;\r
-\r
- // Alignment flag query constants\r
- // Use the get/set methods above instead\r
- private:\r
- enum { PAIRED = 1\r
- , PROPER_PAIR = 2\r
- , UNMAPPED = 4\r
- , MATE_UNMAPPED = 8\r
- , REVERSE = 16\r
- , MATE_REVERSE = 32\r
- , READ_1 = 64\r
- , READ_2 = 128\r
- , SECONDARY = 256\r
- , QC_FAILED = 512\r
- , DUPLICATE = 1024 \r
- };\r
-};\r
-\r
// ----------------------------------------------------------------\r
-// Auxiliary data structs & typedefs\r
+// ----------------------------------------------------------------\r
+// Data structs & typedefs\r
\r
+// CIGAR operation data structure\r
struct CigarOp {\r
\r
// data members\r
{ }\r
};\r
\r
+// Reference data entry\r
struct RefData {\r
\r
// data members\r
, RefHasAlignments(ok)\r
{ }\r
};\r
+typedef std::vector<RefData> RefVector;\r
\r
-typedef std::vector<RefData> RefVector;\r
-typedef std::vector<BamAlignment> BamAlignmentVector;\r
-\r
+// General (sequential) genome region\r
struct BamRegion {\r
\r
// data members\r
};\r
\r
// ----------------------------------------------------------------\r
-// Added: 3-35-2010 DWB\r
-// Fixed: Routines to provide endian-correctness\r
// ----------------------------------------------------------------\r
+// General utilities \r
\r
// returns true if system is big endian\r
inline bool SystemIsBigEndian(void) {\r
SwapEndian_64(value);\r
}\r
\r
+// returns whether file exists (can be opened OK)\r
inline bool FileExists(const std::string& filename) {\r
std::ifstream f(filename.c_str(), std::ifstream::in);\r
return !f.fail();\r
}\r
\r
-// ----------------------------------------------------------------\r
-// BamAlignment member methods\r
-\r
-// constructors & destructor\r
-inline BamAlignment::BamAlignment(void) { }\r
-\r
-inline BamAlignment::BamAlignment(const BamAlignment& other)\r
- : Name(other.Name)\r
- , Length(other.Length)\r
- , QueryBases(other.QueryBases)\r
- , AlignedBases(other.AlignedBases)\r
- , Qualities(other.Qualities)\r
- , TagData(other.TagData)\r
- , RefID(other.RefID)\r
- , Position(other.Position)\r
- , Bin(other.Bin)\r
- , MapQuality(other.MapQuality)\r
- , AlignmentFlag(other.AlignmentFlag)\r
- , CigarData(other.CigarData)\r
- , MateRefID(other.MateRefID)\r
- , MatePosition(other.MatePosition)\r
- , InsertSize(other.InsertSize)\r
- , SupportData(other.SupportData)\r
-{ }\r
-\r
-inline BamAlignment::~BamAlignment(void) { }\r
-\r
-// Queries against alignment flags\r
-inline bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }\r
-inline bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }\r
-inline bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }\r
-inline bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }\r
-inline bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }\r
-inline bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }\r
-inline bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }\r
-inline bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }\r
-inline bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }\r
-inline bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }\r
-inline bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }\r
-\r
-// Manipulate alignment flags \r
-inline void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; }\r
-inline void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; }\r
-inline void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; }\r
-inline void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; }\r
-inline void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; }\r
-inline void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; }\r
-inline void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; }\r
-inline void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; }\r
-inline void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; }\r
-inline void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; }\r
-inline void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; }\r
-\r
-// calculates alignment end position, based on starting position and CIGAR operations\r
-inline \r
-int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const {\r
-\r
- // initialize alignment end to starting position\r
- int alignEnd = Position;\r
-\r
- // iterate over cigar operations\r
- std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin();\r
- std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end();\r
- for ( ; cigarIter != cigarEnd; ++cigarIter) {\r
- const char cigarType = (*cigarIter).Type;\r
- if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {\r
- alignEnd += (*cigarIter).Length;\r
- } \r
- else if ( usePadded && cigarType == 'I' ) {\r
- alignEnd += (*cigarIter).Length;\r
- }\r
- }\r
- \r
- // adjust for zeroBased, if necessary\r
- if (zeroBased) \r
- return alignEnd - 1;\r
- else \r
- return alignEnd;\r
-}\r
-\r
-inline\r
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type != "Z" && type != "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag already exists, return false\r
- // use EditTag explicitly instead\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;\r
- \r
- // otherwise, copy tag data to temp buffer\r
- std::string newTag = tag + type + value;\r
- const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term\r
- char originalTagData[newTagDataLength];\r
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term\r
- \r
- // append newTag\r
- strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term\r
- \r
- // store temp buffer back in TagData\r
- const char* newTagData = (const char*)originalTagData;\r
- TagData.assign(newTagData, newTagDataLength);\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-inline\r
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type == "f" || type == "Z" || type == "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag already exists, return false\r
- // use EditTag explicitly instead\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;\r
- \r
- // otherwise, convert value to string\r
- union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;\r
- un.value = value;\r
-\r
- // copy original tag data to temp buffer\r
- std::string newTag = tag + type;\r
- const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer\r
- char originalTagData[newTagDataLength];\r
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term\r
- \r
- // append newTag\r
- strcat(originalTagData + tagDataLength, newTag.data());\r
- memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int));\r
- \r
- // store temp buffer back in TagData\r
- const char* newTagData = (const char*)originalTagData;\r
- TagData.assign(newTagData, newTagDataLength);\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-inline\r
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) {\r
- return AddTag(tag, type, (const uint32_t&)value);\r
-}\r
-\r
-inline\r
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type == "Z" || type == "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag already exists, return false\r
- // use EditTag explicitly instead\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;\r
- \r
- // otherwise, convert value to string\r
- union { float value; char valueBuffer[sizeof(float)]; } un;\r
- un.value = value;\r
-\r
- // copy original tag data to temp buffer\r
- std::string newTag = tag + type;\r
- const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float\r
- char originalTagData[newTagDataLength];\r
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term\r
- \r
- // append newTag\r
- strcat(originalTagData + tagDataLength, newTag.data());\r
- memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float));\r
- \r
- // store temp buffer back in TagData\r
- const char* newTagData = (const char*)originalTagData;\r
- TagData.assign(newTagData, newTagDataLength);\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-inline\r
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type != "Z" && type != "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pOriginalTagData = (char*)TagData.data();\r
- char* pTagData = pOriginalTagData;\r
- const unsigned int originalTagDataLength = TagData.size();\r
- \r
- unsigned int newTagDataLength = 0;\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {\r
- \r
- // make sure array is more than big enough\r
- char newTagData[originalTagDataLength + value.size()]; \r
-\r
- // copy original tag data up til desired tag\r
- const unsigned int beginningTagDataLength = numBytesParsed;\r
- newTagDataLength += beginningTagDataLength;\r
- memcpy(newTagData, pOriginalTagData, numBytesParsed);\r
- \r
- // copy new VALUE in place of current tag data\r
- const unsigned int dataLength = strlen(value.c_str());\r
- memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 );\r
- \r
- // skip to next tag (if tag for removal is last, return true) \r
- const char* pTagStorageType = pTagData - 1;\r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;\r
- \r
- // copy everything from current tag (the next one after tag for removal) to end\r
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);\r
- const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1;\r
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;\r
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);\r
- \r
- // ensure null-terminator\r
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;\r
- \r
- // save new tag data\r
- TagData.assign(newTagData, endTagOffset + endTagDataLength);\r
- return true;\r
- }\r
- \r
- // tag not found, attempt AddTag\r
- else return AddTag(tag, type, value);\r
-}\r
-\r
-inline\r
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type == "f" || type == "Z" || type == "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pOriginalTagData = (char*)TagData.data();\r
- char* pTagData = pOriginalTagData;\r
- const unsigned int originalTagDataLength = TagData.size();\r
- \r
- unsigned int newTagDataLength = 0;\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {\r
- \r
- // make sure array is more than big enough\r
- char newTagData[originalTagDataLength + sizeof(value)]; \r
-\r
- // copy original tag data up til desired tag\r
- const unsigned int beginningTagDataLength = numBytesParsed;\r
- newTagDataLength += beginningTagDataLength;\r
- memcpy(newTagData, pOriginalTagData, numBytesParsed);\r
- \r
- // copy new VALUE in place of current tag data\r
- union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;\r
- un.value = value;\r
- memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int));\r
- \r
- // skip to next tag (if tag for removal is last, return true) \r
- const char* pTagStorageType = pTagData - 1;\r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;\r
- \r
- // copy everything from current tag (the next one after tag for removal) to end\r
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);\r
- const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int);\r
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;\r
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);\r
- \r
- // ensure null-terminator\r
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;\r
- \r
- // save new tag data\r
- TagData.assign(newTagData, endTagOffset + endTagDataLength);\r
- return true;\r
- }\r
- \r
- // tag not found, attempt AddTag\r
- else return AddTag(tag, type, value);\r
-}\r
-\r
-inline\r
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) {\r
- return EditTag(tag, type, (const uint32_t&)value);\r
-}\r
-\r
-inline\r
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type == "Z" || type == "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pOriginalTagData = (char*)TagData.data();\r
- char* pTagData = pOriginalTagData;\r
- const unsigned int originalTagDataLength = TagData.size();\r
- \r
- unsigned int newTagDataLength = 0;\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {\r
- \r
- // make sure array is more than big enough\r
- char newTagData[originalTagDataLength + sizeof(value)]; \r
-\r
- // copy original tag data up til desired tag\r
- const unsigned int beginningTagDataLength = numBytesParsed;\r
- newTagDataLength += beginningTagDataLength;\r
- memcpy(newTagData, pOriginalTagData, numBytesParsed);\r
- \r
- // copy new VALUE in place of current tag data\r
- union { float value; char valueBuffer[sizeof(float)]; } un;\r
- un.value = value;\r
- memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float));\r
- \r
- // skip to next tag (if tag for removal is last, return true) \r
- const char* pTagStorageType = pTagData - 1;\r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;\r
- \r
- // copy everything from current tag (the next one after tag for removal) to end\r
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);\r
- const unsigned int endTagOffset = beginningTagDataLength + sizeof(float);\r
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;\r
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);\r
- \r
- // ensure null-terminator\r
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;\r
- \r
- // save new tag data\r
- TagData.assign(newTagData, endTagOffset + endTagDataLength);\r
- return true;\r
- }\r
- \r
- // tag not found, attempt AddTag\r
- else return AddTag(tag, type, value);\r
-}\r
-\r
-// get "NM" tag data - originally contributed by Aaron Quinlan\r
-// stores data in 'editDistance', returns success/fail\r
-inline \r
-bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { \r
- return GetTag("NM", (uint32_t&)editDistance);\r
-}\r
-\r
-// get "RG" tag data\r
-// stores data in 'readGroup', returns success/fail\r
-inline \r
-bool BamAlignment::GetReadGroup(std::string& readGroup) const {\r
- return GetTag("RG", readGroup);\r
-}\r
-\r
-inline\r
-bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const {\r
-\r
- // make sure tag data exists\r
- if ( SupportData.HasCoreOnly || TagData.empty() ) \r
- return false;\r
-\r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {\r
- const unsigned int dataLength = strlen(pTagData);\r
- destination.clear();\r
- destination.resize(dataLength);\r
- memcpy( (char*)destination.data(), pTagData, dataLength );\r
- return true;\r
- }\r
- \r
- // tag not found, return failure\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const {\r
- \r
- // make sure tag data exists\r
- if ( SupportData.HasCoreOnly || TagData.empty() ) \r
- return false;\r
-\r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, determine data byte-length, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {\r
- \r
- // determine data byte-length\r
- const char type = *(pTagData - 1);\r
- int destinationLength = 0;\r
- switch (type) {\r
- // 1 byte data\r
- case 'A':\r
- case 'c':\r
- case 'C':\r
- destinationLength = 1;\r
- break;\r
-\r
- // 2 byte data\r
- case 's':\r
- case 'S':\r
- destinationLength = 2;\r
- break;\r
-\r
- // 4 byte data\r
- case 'i':\r
- case 'I':\r
- destinationLength = 4;\r
- break;\r
-\r
- // unsupported type for integer destination (float or var-length strings)\r
- case 'f':\r
- case 'Z':\r
- case 'H':\r
- fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type);\r
- return false;\r
-\r
- // unknown tag type\r
- default:\r
- fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);\r
- return false;\r
- }\r
- \r
- // store in destination\r
- destination = 0;\r
- memcpy(&destination, pTagData, destinationLength);\r
- return true;\r
- }\r
- \r
- // tag not found, return failure\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const {\r
- return GetTag(tag, (uint32_t&)destination);\r
-}\r
-\r
-inline\r
-bool BamAlignment::GetTag(const std::string& tag, float& destination) const {\r
- \r
- // make sure tag data exists\r
- if ( SupportData.HasCoreOnly || TagData.empty() ) \r
- return false;\r
-\r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, determine data byte-length, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {\r
- //pTagData += numBytesParsed;\r
- \r
- // determine data byte-length\r
- const char type = *(pTagData - 1);\r
- int destinationLength = 0;\r
- switch(type) {\r
-\r
- // 1 byte data\r
- case 'A':\r
- case 'c':\r
- case 'C':\r
- destinationLength = 1;\r
- break;\r
-\r
- // 2 byte data\r
- case 's':\r
- case 'S':\r
- destinationLength = 2;\r
- break;\r
-\r
- // 4 byte data\r
- case 'f':\r
- case 'i':\r
- case 'I':\r
- destinationLength = 4;\r
- break;\r
- \r
- // unsupported type (var-length strings)\r
- case 'Z':\r
- case 'H':\r
- fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type);\r
- return false;\r
-\r
- // unknown tag type\r
- default:\r
- fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);\r
- return false;\r
- }\r
- \r
- // store in destination\r
- destination = 0.0;\r
- memcpy(&destination, pTagData, destinationLength);\r
- return true;\r
- }\r
- \r
- // tag not found, return failure\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::RemoveTag(const std::string& tag) {\r
- \r
- // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed\r
- // also, return false if no data present to remove\r
- if ( SupportData.HasCoreOnly || TagData.empty() ) return false;\r
- \r
- // localize the tag data\r
- char* pOriginalTagData = (char*)TagData.data();\r
- char* pTagData = pOriginalTagData;\r
- const unsigned int originalTagDataLength = TagData.size();\r
- unsigned int newTagDataLength = 0;\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {\r
- \r
- char newTagData[originalTagDataLength];\r
-\r
- // copy original tag data up til desired tag\r
- pTagData -= 3;\r
- numBytesParsed -= 3;\r
- const unsigned int beginningTagDataLength = numBytesParsed;\r
- newTagDataLength += beginningTagDataLength;\r
- memcpy(newTagData, pOriginalTagData, numBytesParsed);\r
- \r
- // skip to next tag (if tag for removal is last, return true) \r
- const char* pTagStorageType = pTagData + 2;\r
- pTagData += 3;\r
- numBytesParsed += 3;\r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;\r
- \r
- // copy everything from current tag (the next one after tag for removal) to end\r
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);\r
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;\r
- memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength );\r
- \r
- // save new tag data\r
- TagData.assign(newTagData, beginningTagDataLength + endTagDataLength);\r
- return true;\r
- }\r
- \r
- // tag not found, no removal - return failure\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) {\r
-\r
- while ( numBytesParsed < tagDataLength ) {\r
-\r
- const char* pTagType = pTagData;\r
- const char* pTagStorageType = pTagData + 2;\r
- pTagData += 3;\r
- numBytesParsed += 3;\r
-\r
- // check the current tag, return true on match\r
- if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 ) \r
- return true;\r
-\r
- // get the storage class and find the next tag\r
- if ( *pTagStorageType == '\0' ) return false; \r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;\r
- if ( *pTagData == '\0' ) return false;\r
- }\r
- \r
- // checked all tags, none match\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {\r
- \r
- switch(storageType) {\r
-\r
- case 'A':\r
- case 'c':\r
- case 'C':\r
- ++numBytesParsed;\r
- ++pTagData;\r
- break;\r
-\r
- case 's':\r
- case 'S':\r
- numBytesParsed += 2;\r
- pTagData += 2;\r
- break;\r
-\r
- case 'f':\r
- case 'i':\r
- case 'I':\r
- numBytesParsed += 4;\r
- pTagData += 4;\r
- break;\r
-\r
- case 'Z':\r
- case 'H':\r
- while(*pTagData) {\r
- ++numBytesParsed;\r
- ++pTagData;\r
- }\r
- // increment for null-terminator\r
- ++numBytesParsed;\r
- ++pTagData;\r
- break;\r
-\r
- default: \r
- // error case\r
- fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType);\r
- return false;\r
- }\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
} // namespace BamTools\r
\r
#endif // BAMAUX_H\r
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 17 September 2010 (DB)
+// Last modified: 18 September 2010 (DB)
// ---------------------------------------------------------------------------
// Provides index functionality - both for the standardized BAM index format
// (".bai") as well as a BamTools-specific (nonstandard) index format (".bti").
#include <iostream>
#include <string>
#include <vector>
-#include "BamAux.h"
+#include "BamAlignment.h"
namespace BamTools {
// ***************************************************************************
-// BamMultiReader.cpp (c) 2010 Erik Garrison
+// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 3 September 2010 (DB)
+// Last modified: 18 September 2010 (DB)
// ---------------------------------------------------------------------------
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
// Institute.
// precludes the need to sort merged files.
// ***************************************************************************
-// C++ includes
#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
-
-// BamTools includes
#include "BGZF.h"
#include "BamMultiReader.h"
using namespace BamTools;
}
// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail
-bool BamMultiReader::CreateIndexes(bool useDefaultIndex) {
+bool BamMultiReader::CreateIndexes(bool useStandardIndex) {
bool result = true;
for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
BamReader* reader = it->first;
- result &= reader->CreateIndex(useDefaultIndex);
+ result &= reader->CreateIndex(useStandardIndex);
}
return result;
}
// foreach extraction entry (each BAM file)
for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) {
- map<string, bool> currentFileReadGroups;
-
BamReader* reader = rs->first;
-
- stringstream header(reader->GetHeaderText());
+ string headerText = reader->GetHeaderText();
+ if ( headerText.empty() ) continue;
+
+ map<string, bool> currentFileReadGroups;
+ stringstream header(headerText);
vector<string> lines;
string item;
while (getline(header, item))
}
// opens BAM files
-bool BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) {
+bool BamMultiReader::Open(const vector<string>& filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) {
// for filename in filenames
fileNames = filenames; // save filenames in our multireader
--- /dev/null
+// ***************************************************************************
+// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 18 September 2010 (DB)
+// ---------------------------------------------------------------------------
+// Uses BGZF routines that were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files.
+//
+// This functionality allows applications to work on very large sets of files
+// without requiring intermediate merge, sort, and index steps for each file
+// subset. It also improves the performance of our merge system as it
+// precludes the need to sort merged files.
+// ***************************************************************************
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "BGZF.h"
+#include "BamMultiReader.h"
+using namespace BamTools;
+using namespace std;
+
+// -----------------------------------------------------
+// BamMultiReader implementation
+// -----------------------------------------------------
+
+// Default constructor: starts with no readers attached and with the
+// reference-ID / left-position cursor both set to 0.
+BamMultiReader::BamMultiReader()
+    : CurrentRefID(0)
+    , CurrentLeft(0)
+{
+}
+
+// Destructor: delegates all cleanup to Close().
+BamMultiReader::~BamMultiReader()
+{
+    Close();
+}
+
+// Closes every managed BAM reader and frees the reader objects together with
+// their per-reader alignment buffers. Leaves the multireader empty, so it is
+// harmless to call this more than once (the destructor calls it as well).
+void BamMultiReader::Close(void) {
+
+    // The alignment index holds raw pointers to the very readers/alignments
+    // deleted below. Empty it first so no dangling entries survive; otherwise
+    // HasOpenReaders() would still report true after Close().
+    alignments.clear();
+
+    vector<pair<BamReader*, BamAlignment*> >::iterator readerIter = readers.begin();
+    vector<pair<BamReader*, BamAlignment*> >::iterator readerEnd  = readers.end();
+    for ( ; readerIter != readerEnd; ++readerIter ) {
+
+        // close the underlying file before destroying the reader
+        BamReader* reader = readerIter->first;
+        if ( reader ) reader->Close();
+        delete reader;
+
+        // release the alignment buffer paired with this reader
+        delete readerIter->second;
+    }
+
+    // drop the (now invalid) pointer pairs
+    readers.clear();
+}
+
+// Asks every managed reader to create its BAM index, forwarding
+// useStandardIndex unchanged to BamReader::CreateIndex().
+// Every reader is attempted even after a failure; returns true only if all
+// of them reported success.
+bool BamMultiReader::CreateIndexes(bool useStandardIndex) {
+
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIter;
+
+    bool allCreated = true;
+    for ( ReaderIter entry = readers.begin(); entry != readers.end(); ++entry ) {
+        const bool created = entry->first->CreateIndex(useStandardIndex);
+        if ( !created )
+            allCreated = false;
+    }
+    return allCreated;
+}
+
+// Debugging aid: writes one "refID:position filename" line to stderr for each
+// entry currently held in the alignment index, in index order.
+void BamMultiReader::DumpAlignmentIndex(void) {
+    AlignmentIndex::const_iterator entry = alignments.begin();
+    const AlignmentIndex::const_iterator last = alignments.end();
+    while ( entry != last ) {
+        const pair<int, int>& key = entry->first;
+        BamReader* owner = entry->second.first;
+        cerr << key.first << ":" << key.second << " " << owner->GetFilename() << endl;
+        ++entry;
+    }
+}
+
+// Builds one unified SAM header text covering all files in the multireader.
+//   - @HD and @SQ lines are copied only from the first reader in the list
+//   - @RG lines are gathered from every reader; a read group ID that has
+//     already been written is not written again
+// Readers that return an empty header text are skipped. All other header
+// line types are dropped from the merged result.
+const string BamMultiReader::GetHeaderText(void) const {
+
+    typedef vector<pair<BamReader*, BamAlignment*> >::const_iterator ReaderIter;
+
+    string unifiedHeader = "";
+    map<string, bool> knownReadGroups;          // IDs already written, across all files
+
+    for ( ReaderIter rs = readers.begin(); rs != readers.end(); ++rs ) {
+
+        BamReader* reader = rs->first;
+        const string headerText = reader->GetHeaderText();
+        if ( headerText.empty() ) continue;
+
+        const bool isFirstReader = ( rs == readers.begin() );
+        map<string, bool> readGroupsFromThisFile;   // IDs first written while reading this file
+
+        // walk the header one line at a time
+        stringstream headerStream(headerText);
+        string headerLine;
+        while ( getline(headerStream, headerLine) ) {
+
+            if ( headerLine.empty() ) continue;
+
+            // @HD / @SQ are taken from the first file only
+            if ( isFirstReader ) {
+                const bool isHD = ( headerLine.compare(0, 3, "@HD") == 0 );
+                const bool isSQ = ( headerLine.compare(0, 3, "@SQ") == 0 );
+                if ( isHD || isSQ ) {
+                    unifiedHeader.append(headerLine.c_str());
+                    unifiedHeader.append(1, '\n');
+                }
+            }
+
+            // everything below concerns @RG lines only
+            if ( headerLine.compare(0, 3, "@RG") != 0 ) continue;
+
+            // pull the value of the first tab-separated "ID:" field;
+            // stays empty if the line carries no ID field
+            string readGroup;
+            stringstream fieldStream(headerLine);
+            string field;
+            while ( std::getline(fieldStream, field, '\t') ) {
+                stringstream keyValue(field);
+                string key;
+                std::getline(keyValue, key, ':');
+                if ( key != "ID" ) continue;
+                std::getline(keyValue, readGroup, ':');
+                break;
+            }
+
+            if ( knownReadGroups.find(readGroup) == knownReadGroups.end() ) {
+                // first sighting of this read group: emit it and remember where it came from
+                unifiedHeader.append(headerLine.c_str());
+                unifiedHeader.append(1, '\n');
+                knownReadGroups[readGroup]        = true;
+                readGroupsFromThisFile[readGroup] = true;
+            }
+            else if ( readGroupsFromThisFile.find(readGroup) != readGroupsFromThisFile.end() ) {
+                // the duplicate comes from within the same file, so warn;
+                // identical @RG entries shared between different files are expected when merging
+                cerr << "WARNING: duplicate @RG tag " << readGroup
+                     << " entry in header of " << reader->GetFilename() << endl;
+            }
+        }
+    }
+
+    return unifiedHeader;
+}
+
+// Retrieves the next alignment, in (RefID, Position) order, across all files.
+// On success the alignment is copied into nextAlignment and the reader it
+// came from is advanced, so its following alignment (if any) re-enters the
+// alignment index. Returns false once no reader has alignments left.
+bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
+
+    // nothing left in any file
+    if ( !HasOpenReaders() )
+        return false;
+
+    // keep CurrentRefID in step with the lowest alignment still pending
+    UpdateReferenceID();
+
+    // the front of the index is the lowest (RefID, Position) entry
+    AlignmentIndex::iterator lowest = alignments.begin();
+    BamReader*    reader    = lowest->second.first;
+    BamAlignment* alignment = lowest->second.second;
+
+    // hand the caller a copy; assign directly rather than through a
+    // temporary BamAlignment, which would deep-copy every field twice
+    nextAlignment = *alignment;
+
+    // the entry must leave the index before its buffer is overwritten below,
+    // because its key was computed from the old contents
+    alignments.erase(lowest);
+
+    // refill the buffer from the same reader; at end of file/region the
+    // reader simply drops out of the index
+    if ( reader->GetNextAlignment(*alignment) ) {
+        alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+                                    make_pair(reader, alignment)));
+    }
+
+    return true;
+}
+
+// Same as GetNextAlignment(), except that the owning reader is advanced with
+// BamReader::GetNextAlignmentCore(), i.e. without parsing character data.
+// Returns false once no reader has alignments left.
+bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) {
+
+    // nothing left in any file
+    if ( !HasOpenReaders() )
+        return false;
+
+    // keep CurrentRefID in step with the lowest alignment still pending
+    UpdateReferenceID();
+
+    // the front of the index is the lowest (RefID, Position) entry
+    AlignmentIndex::iterator lowest = alignments.begin();
+    BamReader*    reader    = lowest->second.first;
+    BamAlignment* alignment = lowest->second.second;
+
+    // hand the caller a copy; assign directly rather than through a
+    // temporary BamAlignment, which would deep-copy every field twice
+    nextAlignment = *alignment;
+
+    // the entry must leave the index before its buffer is overwritten below,
+    // because its key was computed from the old contents
+    alignments.erase(lowest);
+
+    // refill the buffer from the same reader; at end of file/region the
+    // reader simply drops out of the index
+    if ( reader->GetNextAlignmentCore(*alignment) ) {
+        alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+                                    make_pair(reader, alignment)));
+    }
+
+    return true;
+}
+
+// ---------------------------------------------------------------------------------------
+//
+// NB: The following GetReferenceX() functions assume that we have identical
+//     references for all BAM files, so they simply query the first reader.
+//     Open() enforces this by invoking ValidateReaders(), which verifies that
+//     the reference data is the same across all files; we will therefore not
+//     encounter a situation in which there is a mismatch and we are still live.
+//
+// ---------------------------------------------------------------------------------------
+
+// Returns the number of reference sequences, as reported by the first reader.
+// Returns 0 when no readers are loaded (calling front() on an empty vector
+// would be undefined behaviour).
+const int BamMultiReader::GetReferenceCount(void) const {
+    if ( readers.empty() )
+        return 0;
+    return readers.front().first->GetReferenceCount();
+}
+
+// Returns the reference sequence entries, as reported by the first reader.
+// Returns an empty RefVector when no readers are loaded (calling front() on
+// an empty vector would be undefined behaviour).
+const BamTools::RefVector BamMultiReader::GetReferenceData(void) const {
+    if ( readers.empty() )
+        return BamTools::RefVector();
+    return readers.front().first->GetReferenceData();
+}
+
+// Looks up the reference ID for refName by delegating to the first reader.
+// NOTE(review): requires at least one loaded reader - front() on an empty
+// reader list is undefined; confirm callers only use this after Open().
+const int BamMultiReader::GetReferenceID(const string& refName) const {
+    const BamReader* firstReader = readers.front().first;
+    return firstReader->GetReferenceID(refName);
+}
+
+// ---------------------------------------------------------------------------------------
+
+// Reports whether at least one reader still has a pending alignment,
+// i.e. whether the alignment index is non-empty.
+bool BamMultiReader::HasOpenReaders() {
+    return !alignments.empty();
+}
+
+// Returns true only if every (non-null) underlying reader reports a loaded
+// index - useful for checking whether Jump() or SetRegion() are possible.
+// With no readers at all the result is true.
+bool BamMultiReader::IsIndexLoaded(void) const {
+
+    typedef vector<pair<BamReader*, BamAlignment*> >::const_iterator ReaderIter;
+
+    bool allLoaded = true;
+    for ( ReaderIter entry = readers.begin(); entry != readers.end(); ++entry ) {
+        const BamReader* reader = entry->first;
+        if ( reader == 0 ) continue;
+        // every reader is queried, even after one has reported no index
+        if ( !reader->IsIndexLoaded() )
+            allLoaded = false;
+    }
+    return allLoaded;
+}
+
+// Jumps every reader to (refID, position) and reloads the alignment index.
+// Although declared to return success/fail, a failed jump in any reader
+// prints an error and terminates the program via exit(1); a normal return
+// therefore always yields true.
+bool BamMultiReader::Jump(int refID, int position) {
+
+    // record the new cursor
+    CurrentRefID = refID;
+    CurrentLeft  = position;
+
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIter;
+    for ( ReaderIter entry = readers.begin(); entry != readers.end(); ++entry ) {
+        BamReader* reader = entry->first;
+        if ( reader->Jump(refID, position) ) continue;
+
+        // any failure is fatal
+        cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl;
+        exit(1);
+    }
+
+    // all readers repositioned: refill the index from the new positions
+    UpdateAlignments();
+    return true;
+}
+
+// Opens the given BAM files and primes the alignment index with the first
+// alignment of each file.
+//   filenames       - BAM files to open
+//   openIndexes     - if true, each reader is asked to look for an index file
+//   coreMode        - if true, first alignments are read with GetNextAlignmentCore()
+//   useDefaultIndex - hint to prefer BAI over BTI when looking for an index
+// Returns false if any file fails to open, or if no file at all yields a
+// first alignment. A file that opens but yields no alignment is skipped with
+// a warning. On success all readers have been checked by ValidateReaders()
+// (which exits the program on mismatched reference data).
+// Readers that were opened successfully before a failure stay registered and
+// are released by Close().
+bool BamMultiReader::Open(const vector<string>& filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) {
+
+    // save filenames in our multireader
+    fileNames = filenames;
+
+    for ( vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it ) {
+
+        const string& filename = *it;
+        BamReader* reader = new BamReader;
+
+        // With indexes requested, the index filename is left empty so that
+        // BamReader & BamIndex search for any available index file themselves.
+        const bool openedOK = ( openIndexes ? reader->Open(filename, "", true, useDefaultIndex)
+                                            : reader->Open(filename) );
+        if ( !openedOK ) {
+            // this reader was never registered, so release it here to avoid a leak
+            delete reader;
+            return false;
+        }
+
+        // check that the file can actually be read
+        BamAlignment* alignment = new BamAlignment;
+        const bool fileOK = ( coreMode ? reader->GetNextAlignmentCore(*alignment)
+                                       : reader->GetNextAlignment(*alignment) );
+        if ( fileOK ) {
+            // ownership of both pointers passes to 'readers' (released in Close())
+            readers.push_back(make_pair(reader, alignment));
+            alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+                                        make_pair(reader, alignment)));
+            continue;
+        }
+
+        cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl;
+
+        // the ignored file is not tracked anywhere, so clean it up now
+        reader->Close();
+        delete reader;
+        delete alignment;
+
+        // if only file available & could not be read, return failure
+        if ( filenames.size() == 1 ) return false;
+    }
+
+    // no usable file at all (empty list, or every file was ignored):
+    // there is nothing to validate or read from
+    if ( readers.empty() )
+        return false;
+
+    // at least one alignment could be read; now check that all files use the
+    // same reference data
+    ValidateReaders();
+    return true;
+}
+
+// Writes the filename of every managed reader to stdout, one per line.
+void BamMultiReader::PrintFilenames(void) {
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIter;
+    ReaderIter entry = readers.begin();
+    while ( entry != readers.end() ) {
+        cout << entry->first->GetFilename() << endl;
+        ++entry;
+    }
+}
+
+// Returns the file pointer of every reader to the beginning of its alignment
+// data and reloads the alignment index from there.
+// Every reader is rewound even after a failure; returns true only if all of
+// them reported success.
+bool BamMultiReader::Rewind(void) {
+
+    bool result = true;
+    for ( vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it ) {
+        BamReader* reader = it->first;
+        result &= reader->Rewind();
+    }
+
+    // The index still holds the alignments buffered before the rewind (or is
+    // empty if all files had hit EOF). Rebuild it so that the next
+    // GetNextAlignment() starts from the rewound positions rather than from
+    // stale data.
+    UpdateAlignments();
+
+    return result;
+}
+
+// Convenience overload: packs the four bounds into a BamRegion and forwards
+// to SetRegion(const BamRegion&).
+bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) {
+    return SetRegion( BamRegion(leftRefID, leftPosition, rightRefID, rightPosition) );
+}
+
+// Restricts every reader to the given region and reloads the alignment index.
+// A reader that fails SetRegion() is reported on stderr but otherwise
+// tolerated: in practice such a failure means "no alignments here", so we
+// accept it, refresh the index, and carry on. Always returns true.
+bool BamMultiReader::SetRegion(const BamRegion& region) {
+
+    // remember the active region
+    Region = region;
+
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIter;
+    for ( ReaderIter entry = readers.begin(); entry != readers.end(); ++entry ) {
+        BamReader* reader = entry->first;
+        if ( reader->SetRegion(region) ) continue;
+
+        cerr << "ERROR: could not jump " << reader->GetFilename() << " to "
+             << region.LeftRefID << ":" << region.LeftPosition
+             << ".." << region.RightRefID << ":" << region.RightPosition << endl;
+    }
+
+    UpdateAlignments();
+    return true;
+}
+
+// Rebuilds the alignment index from scratch: discards all entries, then reads
+// the next alignment from each reader (via GetNextAlignment) into that
+// reader's buffer and indexes it by (RefID, Position). A reader that yields
+// nothing is assumed to be at end of region / EOF and is left out.
+void BamMultiReader::UpdateAlignments(void) {
+
+    alignments.clear();
+
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIter;
+    for ( ReaderIter entry = readers.begin(); entry != readers.end(); ++entry ) {
+        BamReader*    reader = entry->first;
+        BamAlignment* buffer = entry->second;
+
+        if ( !reader->GetNextAlignment(*buffer) ) continue;
+
+        const pair<int, int> key = make_pair(buffer->RefID, buffer->Position);
+        alignments.insert(make_pair(key, make_pair(reader, buffer)));
+    }
+}
+
+// Updates the reference ID stored in the BamMultiReader to reflect the
+// current state of the readers.
+// The alignment index is ordered by (RefID, Position), so its first entry
+// always carries the lowest pending reference ID; CurrentRefID is set to it.
+// Does nothing when the index is empty.
+void BamMultiReader::UpdateReferenceID(void) {
+
+    // no pending alignments: begin() must not be dereferenced
+    if ( alignments.empty() )
+        return;
+
+    // Assign directly instead of counting upwards from CurrentRefID until the
+    // values match. Counting up never terminates normally (and overflows)
+    // whenever the lowest pending RefID is below CurrentRefID, e.g. RefID -1
+    // or a region set to an earlier reference than a previous Jump().
+    CurrentRefID = alignments.begin()->second.second->RefID;
+}
+
+// ValidateReaders checks that all the readers point to BAM files representing
+// alignments against the same set of reference sequences, and that the
+// sequences are identically ordered. The first reader serves as the baseline.
+// If these checks fail the operation of the multireader is undefined, so we
+// report the mismatch on stderr and force program exit.
+// With no readers loaded there is nothing to compare and the function returns.
+void BamMultiReader::ValidateReaders(void) const {
+
+    // front() on an empty vector is undefined behaviour
+    if ( readers.empty() )
+        return;
+
+    const int firstRefCount = readers.front().first->GetReferenceCount();
+    const BamTools::RefVector firstRefData = readers.front().first->GetReferenceData();
+
+    for ( vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it ) {
+
+        BamReader* reader = it->first;
+        const BamTools::RefVector currentRefData = reader->GetReferenceData();
+
+        // the reference counts must agree before contents can be compared pairwise
+        if ( reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size() ) {
+            cerr << "ERROR: mismatched number of references in " << reader->GetFilename()
+                 << " expected " << firstRefCount
+                 << " reference sequences but only found " << reader->GetReferenceCount() << endl;
+            exit(1);
+        }
+
+        // both vectors have the same size (checked above), so walking them in
+        // lockstep is safe; every entry must match in name and length
+        BamTools::RefVector::const_iterator f = firstRefData.begin();
+        BamTools::RefVector::const_iterator c = currentRefData.begin();
+        for ( ; f != firstRefData.end(); ++f, ++c ) {
+
+            if ( f->RefName == c->RefName && f->RefLength == c->RefLength )
+                continue;
+
+            // mismatch: print both reference lists in full, then abort
+            cerr << "ERROR: mismatched references found in " << reader->GetFilename()
+                 << " expected: " << endl;
+            for ( BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a )
+                cerr << a->RefName << " " << a->RefLength << endl;
+            cerr << "but found: " << endl;
+            for ( BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a )
+                cerr << a->RefName << " " << a->RefLength << endl;
+            exit(1);
+        }
+    }
+}
// ***************************************************************************\r
-// BamMultiReader.h (c) 2010 Erik Garrison\r
+// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 3 September 2010 (DB)\r
+// Last modified: 18 September 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Functionality for simultaneously reading multiple BAM files\r
// ***************************************************************************\r
#ifndef BAMMULTIREADER_H\r
#define BAMMULTIREADER_H\r
\r
-// C++ includes\r
#include <string>\r
#include <map>\r
-#include <utility> // for pair\r
+#include <utility>\r
#include <sstream>\r
-\r
-using namespace std;\r
-\r
-// BamTools includes\r
-#include "BamAux.h"\r
#include "BamReader.h"\r
\r
namespace BamTools {\r
\r
// index mapping reference/position pairings to bamreaders and their alignments\r
-typedef multimap<pair<int, int>, pair<BamReader*, BamAlignment*> > AlignmentIndex;\r
-\r
+typedef std::multimap<std::pair<int, int>, std::pair<BamReader*, BamAlignment*> > AlignmentIndex;\r
\r
class BamMultiReader {\r
\r
// also useful for merging\r
// @preferStandardIndex - look for standard BAM index ".bai" first. If false, \r
// will look for BamTools index ".bti". \r
- bool Open(const vector<string> filenames, bool openIndexes = true, bool coreMode = false, bool preferStandardIndex = false);\r
+ bool Open(const std::vector<std::string>& filenames, bool openIndexes = true, bool coreMode = false, bool preferStandardIndex = false);\r
\r
// returns whether underlying BAM readers ALL have an index loaded\r
// this is useful to indicate whether Jump() or SetRegion() are possible\r
// ----------------------\r
\r
// returns unified SAM header text for all files\r
- const string GetHeaderText(void) const;\r
+ const std::string GetHeaderText(void) const;\r
// returns number of reference sequences\r
const int GetReferenceCount(void) const;\r
// returns vector of reference objects\r
// ----------------------\r
\r
// creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai")\r
- bool CreateIndexes(bool useDefaultIndex = true);\r
+ bool CreateIndexes(bool useStandardIndex = true);\r
\r
//const int GetReferenceID(const string& refName) const;\r
\r
private:\r
\r
// the set of readers and alignments which we operate on, maintained throughout the life of this class\r
- vector<pair<BamReader*, BamAlignment*> > readers;\r
+ std::vector<std::pair<BamReader*, BamAlignment*> > readers;\r
\r
// readers and alignments sorted by reference id and position, to keep track of the lowest (next) alignment\r
// when a reader reaches EOF, its entry is removed from this index\r
AlignmentIndex alignments;\r
\r
- vector<string> fileNames;\r
+ std::vector<std::string> fileNames;\r
};\r
\r
} // namespace BamTools\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 3 September 2010 (DB)\r
+// Last modified: 18 September 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
#ifndef BAMREADER_H\r
#define BAMREADER_H\r
\r
-// C++ includes\r
#include <string>\r
-\r
-// BamTools includes\r
-#include "BamAux.h"\r
+#include "BamAlignment.h"\r
\r
namespace BamTools {\r
\r
bool IsIndexLoaded(void) const;\r
// returns whether reader is open for reading or not\r
bool IsOpen(void) const;\r
- // performs random-access jump to reference, position\r
+ // performs random-access jump using (reference, position) as a left-bound\r
bool Jump(int refID, int position = 0);\r
// opens BAM file (and optional BAM index file, if provided)\r
// @lookForIndex - if no indexFilename provided, look for an existing index file\r
bool GetNextAlignment(BamAlignment& bAlignment);\r
\r
// retrieves next available alignment core data (returns success/fail)\r
- // ** DOES NOT parse any character data (read name, bases, qualities, tag data)\r
- // these can be accessed, if necessary, from the supportData \r
- // useful for operations requiring ONLY positional or other alignment-related information\r
+ // ** DOES NOT parse any character data (read name, bases, qualities, tag data) **\r
+ // useful for operations requiring ONLY aligner-related information (refId/position, alignment flags, CIGAR, mapQuality, etc)\r
bool GetNextAlignmentCore(BamAlignment& bAlignment);\r
\r
// ----------------------\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 17 August 2010 (DB)\r
+// Last modified: 18 September 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
#ifndef BAMWRITER_H\r
#define BAMWRITER_H\r
\r
-// C++ includes\r
#include <string>\r
-\r
-// BamTools includes\r
-#include "BamAux.h"\r
+#include "BamAlignment.h"\r
\r
namespace BamTools {\r
\r