--- /dev/null
+// ***************************************************************************
+// BamAlignment.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 September 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the BamAlignment data structure
+// ***************************************************************************
+
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <map>
+#include <utility>
+#include "BamAlignment.h"
+using namespace BamTools;
+using namespace std;
+
+// default ctor
+BamAlignment::BamAlignment(void)
+ : RefID(-1)
+ , Position(-1)
+ , MateRefID(-1)
+ , MatePosition(-1)
+ , InsertSize(0)
+{ }
+
+// copy ctor
+BamAlignment::BamAlignment(const BamAlignment& other)
+ : Name(other.Name)
+ , Length(other.Length)
+ , QueryBases(other.QueryBases)
+ , AlignedBases(other.AlignedBases)
+ , Qualities(other.Qualities)
+ , TagData(other.TagData)
+ , RefID(other.RefID)
+ , Position(other.Position)
+ , Bin(other.Bin)
+ , MapQuality(other.MapQuality)
+ , AlignmentFlag(other.AlignmentFlag)
+ , CigarData(other.CigarData)
+ , MateRefID(other.MateRefID)
+ , MatePosition(other.MatePosition)
+ , InsertSize(other.InsertSize)
+ , SupportData(other.SupportData)
+{ }
+
+// dtor
+BamAlignment::~BamAlignment(void) { }
+
+// Queries against alignment flags
+bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }
+bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }
+bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }
+bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }
+bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
+bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }
+bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }
+bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }
+bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }
+bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }
+bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }
+
+// Manipulate alignment flags
+void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; }
+void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; }
+void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; }
+void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; }
+void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; }
+void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; }
+void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; }
+void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; }
+void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; }
+void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; }
+void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; }
+
+// calculates alignment end position, based on starting position and CIGAR operations
+int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const {
+
+ // initialize alignment end to starting position
+ int alignEnd = Position;
+
+ // iterate over cigar operations
+ vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
+ vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
+ for ( ; cigarIter != cigarEnd; ++cigarIter) {
+ const char cigarType = (*cigarIter).Type;
+ if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' )
+ alignEnd += (*cigarIter).Length;
+ else if ( usePadded && cigarType == 'I' )
+ alignEnd += (*cigarIter).Length;
+ }
+
+ // adjust for zeroBased, if necessary
+ if (zeroBased)
+ return alignEnd - 1;
+ else
+ return alignEnd;
+}
+
+bool BamAlignment::AddTag(const string& tag, const string& type, const string& value) {
+
+ if ( SupportData.HasCoreOnly ) return false;
+ if ( tag.size() != 2 || type.size() != 1 ) return false;
+ if ( type != "Z" && type != "H" ) return false;
+
+ // localize the tag data
+ char* pTagData = (char*)TagData.data();
+ const unsigned int tagDataLength = TagData.size();
+ unsigned int numBytesParsed = 0;
+
+ // if tag already exists, return false
+ // use EditTag explicitly instead
+ if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
+
+ // otherwise, copy tag data to temp buffer
+ string newTag = tag + type + value;
+ const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
+ char originalTagData[newTagDataLength];
+ memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
+
+ // append newTag
+ strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term
+
+ // store temp buffer back in TagData
+ const char* newTagData = (const char*)originalTagData;
+ TagData.assign(newTagData, newTagDataLength);
+
+ // return success
+ return true;
+}
+
+bool BamAlignment::AddTag(const string& tag, const string& type, const uint32_t& value) {
+
+ if ( SupportData.HasCoreOnly ) return false;
+ if ( tag.size() != 2 || type.size() != 1 ) return false;
+ if ( type == "f" || type == "Z" || type == "H" ) return false;
+
+ // localize the tag data
+ char* pTagData = (char*)TagData.data();
+ const unsigned int tagDataLength = TagData.size();
+ unsigned int numBytesParsed = 0;
+
+ // if tag already exists, return false
+ // use EditTag explicitly instead
+ if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
+
+ // otherwise, convert value to string
+ union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;
+ un.value = value;
+
+ // copy original tag data to temp buffer
+ string newTag = tag + type;
+ const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer
+ char originalTagData[newTagDataLength];
+ memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
+
+ // append newTag
+ strcat(originalTagData + tagDataLength, newTag.data());
+ memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int));
+
+ // store temp buffer back in TagData
+ const char* newTagData = (const char*)originalTagData;
+ TagData.assign(newTagData, newTagDataLength);
+
+ // return success
+ return true;
+}
+
+bool BamAlignment::AddTag(const string& tag, const string& type, const int32_t& value) {
+ return AddTag(tag, type, (const uint32_t&)value);
+}
+
+bool BamAlignment::AddTag(const string& tag, const string& type, const float& value) {
+
+ if ( SupportData.HasCoreOnly ) return false;
+ if ( tag.size() != 2 || type.size() != 1 ) return false;
+ if ( type == "Z" || type == "H" ) return false;
+
+ // localize the tag data
+ char* pTagData = (char*)TagData.data();
+ const unsigned int tagDataLength = TagData.size();
+ unsigned int numBytesParsed = 0;
+
+ // if tag already exists, return false
+ // use EditTag explicitly instead
+ if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
+
+ // otherwise, convert value to string
+ union { float value; char valueBuffer[sizeof(float)]; } un;
+ un.value = value;
+
+ // copy original tag data to temp buffer
+ string newTag = tag + type;
+ const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float
+ char originalTagData[newTagDataLength];
+ memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
+
+ // append newTag
+ strcat(originalTagData + tagDataLength, newTag.data());
+ memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float));
+
+ // store temp buffer back in TagData
+ const char* newTagData = (const char*)originalTagData;
+ TagData.assign(newTagData, newTagDataLength);
+
+ // return success
+ return true;
+}
+
+bool BamAlignment::EditTag(const string& tag, const string& type, const string& value) {
+
+ if ( SupportData.HasCoreOnly ) return false;
+ if ( tag.size() != 2 || type.size() != 1 ) return false;
+ if ( type != "Z" && type != "H" ) return false;
+
+ // localize the tag data
+ char* pOriginalTagData = (char*)TagData.data();
+ char* pTagData = pOriginalTagData;
+ const unsigned int originalTagDataLength = TagData.size();
+
+ unsigned int newTagDataLength = 0;
+ unsigned int numBytesParsed = 0;
+
+ // if tag found, store data in readGroup, return success
+ if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
+
+ // make sure array is more than big enough
+ char newTagData[originalTagDataLength + value.size()];
+
+ // copy original tag data up til desired tag
+ const unsigned int beginningTagDataLength = numBytesParsed;
+ newTagDataLength += beginningTagDataLength;
+ memcpy(newTagData, pOriginalTagData, numBytesParsed);
+
+ // copy new VALUE in place of current tag data
+ const unsigned int dataLength = strlen(value.c_str());
+ memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 );
+
+ // skip to next tag (if tag for removal is last, return true)
+ const char* pTagStorageType = pTagData - 1;
+ if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;
+
+ // copy everything from current tag (the next one after tag for removal) to end
+ const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
+ const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1;
+ const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
+ memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
+
+ // ensure null-terminator
+ newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
+
+ // save new tag data
+ TagData.assign(newTagData, endTagOffset + endTagDataLength);
+ return true;
+ }
+
+ // tag not found, attempt AddTag
+ else return AddTag(tag, type, value);
+}
+
+bool BamAlignment::EditTag(const string& tag, const string& type, const uint32_t& value) {
+
+ if ( SupportData.HasCoreOnly ) return false;
+ if ( tag.size() != 2 || type.size() != 1 ) return false;
+ if ( type == "f" || type == "Z" || type == "H" ) return false;
+
+ // localize the tag data
+ char* pOriginalTagData = (char*)TagData.data();
+ char* pTagData = pOriginalTagData;
+ const unsigned int originalTagDataLength = TagData.size();
+
+ unsigned int newTagDataLength = 0;
+ unsigned int numBytesParsed = 0;
+
+ // if tag found, store data in readGroup, return success
+ if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
+
+ // make sure array is more than big enough
+ char newTagData[originalTagDataLength + sizeof(value)];
+
+ // copy original tag data up til desired tag
+ const unsigned int beginningTagDataLength = numBytesParsed;
+ newTagDataLength += beginningTagDataLength;
+ memcpy(newTagData, pOriginalTagData, numBytesParsed);
+
+ // copy new VALUE in place of current tag data
+ union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;
+ un.value = value;
+ memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int));
+
+ // skip to next tag (if tag for removal is last, return true)
+ const char* pTagStorageType = pTagData - 1;
+ if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;
+
+ // copy everything from current tag (the next one after tag for removal) to end
+ const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
+ const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int);
+ const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
+ memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
+
+ // ensure null-terminator
+ newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
+
+ // save new tag data
+ TagData.assign(newTagData, endTagOffset + endTagDataLength);
+ return true;
+ }
+
+ // tag not found, attempt AddTag
+ else return AddTag(tag, type, value);
+}
+
+bool BamAlignment::EditTag(const string& tag, const string& type, const int32_t& value) {
+ return EditTag(tag, type, (const uint32_t&)value);
+}
+
+bool BamAlignment::EditTag(const string& tag, const string& type, const float& value) {
+
+ if ( SupportData.HasCoreOnly ) return false;
+ if ( tag.size() != 2 || type.size() != 1 ) return false;
+ if ( type == "Z" || type == "H" ) return false;
+
+ // localize the tag data
+ char* pOriginalTagData = (char*)TagData.data();
+ char* pTagData = pOriginalTagData;
+ const unsigned int originalTagDataLength = TagData.size();
+
+ unsigned int newTagDataLength = 0;
+ unsigned int numBytesParsed = 0;
+
+ // if tag found, store data in readGroup, return success
+ if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
+
+ // make sure array is more than big enough
+ char newTagData[originalTagDataLength + sizeof(value)];
+
+ // copy original tag data up til desired tag
+ const unsigned int beginningTagDataLength = numBytesParsed;
+ newTagDataLength += beginningTagDataLength;
+ memcpy(newTagData, pOriginalTagData, numBytesParsed);
+
+ // copy new VALUE in place of current tag data
+ union { float value; char valueBuffer[sizeof(float)]; } un;
+ un.value = value;
+ memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float));
+
+ // skip to next tag (if tag for removal is last, return true)
+ const char* pTagStorageType = pTagData - 1;
+ if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;
+
+ // copy everything from current tag (the next one after tag for removal) to end
+ const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
+ const unsigned int endTagOffset = beginningTagDataLength + sizeof(float);
+ const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
+ memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
+
+ // ensure null-terminator
+ newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
+
+ // save new tag data
+ TagData.assign(newTagData, endTagOffset + endTagDataLength);
+ return true;
+ }
+
+ // tag not found, attempt AddTag
+ else return AddTag(tag, type, value);
+}
+
+// get "NM" tag data - originally contributed by Aaron Quinlan
+// stores data in 'editDistance', returns success/fail
+bool BamAlignment::GetEditDistance(uint32_t& editDistance) const {
+ return GetTag("NM", (uint32_t&)editDistance);
+}
+
+// get "RG" tag data
+// stores data in 'readGroup', returns success/fail
+bool BamAlignment::GetReadGroup(string& readGroup) const {
+ return GetTag("RG", readGroup);
+}
+
+bool BamAlignment::GetTag(const string& tag, string& destination) const {
+
+ // make sure tag data exists
+ if ( SupportData.HasCoreOnly || TagData.empty() )
+ return false;
+
+ // localize the tag data
+ char* pTagData = (char*)TagData.data();
+ const unsigned int tagDataLength = TagData.size();
+ unsigned int numBytesParsed = 0;
+
+ // if tag found, store data in readGroup, return success
+ if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
+ const unsigned int dataLength = strlen(pTagData);
+ destination.clear();
+ destination.resize(dataLength);
+ memcpy( (char*)destination.data(), pTagData, dataLength );
+ return true;
+ }
+
+ // tag not found, return failure
+ return false;
+}
+
+bool BamAlignment::GetTag(const string& tag, uint32_t& destination) const {
+
+ // make sure tag data exists
+ if ( SupportData.HasCoreOnly || TagData.empty() )
+ return false;
+
+ // localize the tag data
+ char* pTagData = (char*)TagData.data();
+ const unsigned int tagDataLength = TagData.size();
+ unsigned int numBytesParsed = 0;
+
+ // if tag found, determine data byte-length, store data in readGroup, return success
+ if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
+
+ // determine data byte-length
+ const char type = *(pTagData - 1);
+ int destinationLength = 0;
+ switch (type) {
+ // 1 byte data
+ case 'A':
+ case 'c':
+ case 'C':
+ destinationLength = 1;
+ break;
+
+ // 2 byte data
+ case 's':
+ case 'S':
+ destinationLength = 2;
+ break;
+
+ // 4 byte data
+ case 'i':
+ case 'I':
+ destinationLength = 4;
+ break;
+
+ // unsupported type for integer destination (float or var-length strings)
+ case 'f':
+ case 'Z':
+ case 'H':
+ fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type);
+ return false;
+
+ // unknown tag type
+ default:
+ fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
+ return false;
+ }
+
+ // store in destination
+ destination = 0;
+ memcpy(&destination, pTagData, destinationLength);
+ return true;
+ }
+
+ // tag not found, return failure
+ return false;
+}
+
+bool BamAlignment::GetTag(const string& tag, int32_t& destination) const {
+ return GetTag(tag, (uint32_t&)destination);
+}
+
+bool BamAlignment::GetTag(const string& tag, float& destination) const {
+
+ // make sure tag data exists
+ if ( SupportData.HasCoreOnly || TagData.empty() )
+ return false;
+
+ // localize the tag data
+ char* pTagData = (char*)TagData.data();
+ const unsigned int tagDataLength = TagData.size();
+ unsigned int numBytesParsed = 0;
+
+ // if tag found, determine data byte-length, store data in readGroup, return success
+ if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
+
+ // determine data byte-length
+ const char type = *(pTagData - 1);
+ int destinationLength = 0;
+ switch(type) {
+
+ // 1 byte data
+ case 'A':
+ case 'c':
+ case 'C':
+ destinationLength = 1;
+ break;
+
+ // 2 byte data
+ case 's':
+ case 'S':
+ destinationLength = 2;
+ break;
+
+ // 4 byte data
+ case 'f':
+ case 'i':
+ case 'I':
+ destinationLength = 4;
+ break;
+
+ // unsupported type (var-length strings)
+ case 'Z':
+ case 'H':
+ fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type);
+ return false;
+
+ // unknown tag type
+ default:
+ fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
+ return false;
+ }
+
+ // store in destination
+ destination = 0.0;
+ memcpy(&destination, pTagData, destinationLength);
+ return true;
+ }
+
+ // tag not found, return failure
+ return false;
+}
+
+bool BamAlignment::GetTagType(const string& tag, char& type) const {
+
+ // make sure tag data exists
+ if ( SupportData.HasCoreOnly || TagData.empty() )
+ return false;
+
+ // localize the tag data
+ char* pTagData = (char*)TagData.data();
+ const unsigned int tagDataLength = TagData.size();
+ unsigned int numBytesParsed = 0;
+
+ // lookup tag
+ if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
+
+ // retrieve tag type code
+ type = *(pTagData - 1);
+
+ // validate that type is a proper BAM tag type
+ switch(type) {
+ case 'A':
+ case 'c':
+ case 'C':
+ case 's':
+ case 'S':
+ case 'f':
+ case 'i':
+ case 'I':
+ case 'Z':
+ case 'H':
+ return true;
+
+ // unknown tag type
+ default:
+ fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
+ return false;
+ }
+ }
+
+ // tag not found, return failure
+ return false;
+}
+
+bool BamAlignment::RemoveTag(const string& tag) {
+
+ // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed
+ // also, return false if no data present to remove
+ if ( SupportData.HasCoreOnly || TagData.empty() ) return false;
+
+ // localize the tag data
+ char* pOriginalTagData = (char*)TagData.data();
+ char* pTagData = pOriginalTagData;
+ const unsigned int originalTagDataLength = TagData.size();
+ unsigned int newTagDataLength = 0;
+ unsigned int numBytesParsed = 0;
+
+ // if tag found, store data in readGroup, return success
+ if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
+
+ char newTagData[originalTagDataLength];
+
+ // copy original tag data up til desired tag
+ pTagData -= 3;
+ numBytesParsed -= 3;
+ const unsigned int beginningTagDataLength = numBytesParsed;
+ newTagDataLength += beginningTagDataLength;
+ memcpy(newTagData, pOriginalTagData, numBytesParsed);
+
+ // skip to next tag (if tag for removal is last, return true)
+ const char* pTagStorageType = pTagData + 2;
+ pTagData += 3;
+ numBytesParsed += 3;
+ if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;
+
+ // copy everything from current tag (the next one after tag for removal) to end
+ const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
+ const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
+ memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength );
+
+ // save new tag data
+ TagData.assign(newTagData, beginningTagDataLength + endTagDataLength);
+ return true;
+ }
+
+ // tag not found, no removal - return failure
+ return false;
+}
+
+bool BamAlignment::FindTag(const string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) {
+
+ while ( numBytesParsed < tagDataLength ) {
+
+ const char* pTagType = pTagData;
+ const char* pTagStorageType = pTagData + 2;
+ pTagData += 3;
+ numBytesParsed += 3;
+
+ // check the current tag, return true on match
+ if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
+ return true;
+
+ // get the storage class and find the next tag
+ if ( *pTagStorageType == '\0' ) return false;
+ if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
+ if ( *pTagData == '\0' ) return false;
+ }
+
+ // checked all tags, none match
+ return false;
+}
+
+bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
+
+ switch(storageType) {
+
+ case 'A':
+ case 'c':
+ case 'C':
+ ++numBytesParsed;
+ ++pTagData;
+ break;
+
+ case 's':
+ case 'S':
+ numBytesParsed += 2;
+ pTagData += 2;
+ break;
+
+ case 'f':
+ case 'i':
+ case 'I':
+ numBytesParsed += 4;
+ pTagData += 4;
+ break;
+
+ case 'Z':
+ case 'H':
+ while(*pTagData) {
+ ++numBytesParsed;
+ ++pTagData;
+ }
+ // increment for null-terminator
+ ++numBytesParsed;
+ ++pTagData;
+ break;
+
+ default:
+ // error case
+ fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType);
+ return false;
+ }
+
+ // return success
+ return true;
+}
\ No newline at end of file
--- /dev/null
+// ***************************************************************************
+// BamAlignment.h (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 September 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the BamAlignment data structure
+// ***************************************************************************
+
+#ifndef BAMALIGNMENT_H
+#define BAMALIGNMENT_H
+
+#include <string>
+#include <vector>
+#include "BamAux.h"
+
+namespace BamTools {
+
+// BamAlignment data structure
+// explicitly labeled as 'struct' to indicate that (most of) its fields are public
+struct BamAlignment {
+
+ // constructors & destructor
+ public:
+ BamAlignment(void);
+ BamAlignment(const BamAlignment& other);
+ ~BamAlignment(void);
+
+ // Queries against alignment flags
+ public:
+ bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate
+ bool IsFailedQC(void) const; // Returns true if this read failed quality control
+ bool IsFirstMate(void) const; // Returns true if alignment is first mate on read
+ bool IsMapped(void) const; // Returns true if alignment is mapped
+ bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped
+ bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand
+ bool IsPaired(void) const; // Returns true if alignment part of paired-end read
+ bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment
+ bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution
+ bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand
+ bool IsSecondMate(void) const; // Returns true if alignment is second mate on read
+
+ // Manipulate alignment flags
+ public:
+ void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag
+ void SetIsFailedQC(bool ok); // Sets "failed quality control" flag
+ void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag
+ void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is mapped" flag
+ void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag
+ void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag
+ void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag
+ void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag
+ void SetIsSecondaryAlignment(bool ok); // Sets "position is primary alignment" flag
+ void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag
+ void SetIsUnmapped(bool ok); // Sets "alignment is mapped" flag
+
+ // Tag data access methods
+ public:
+ // -------------------------------------------------------------------------------------
+ // N.B. - The following tag access methods may not be used on BamAlignments fetched
+ // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in
+ // error message (to keep output clean) but will ALWAYS return false. Only user-created
+ // BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid here.
+
+ // add tag data (create new TAG entry with TYPE and VALUE)
+ // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
+ // returns true if new data added, false if error or TAG already exists
+ // N.B. - will NOT modify existing tag. Use EditTag() instead
+ // @tag - two character tag name
+ // @type - single character tag type (see SAM/BAM spec for details)
+ // @value - value to associate with tag
+ bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
+ bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i
+ bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i
+ bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f
+
+ // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present)
+ // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
+ // returns true if edit was successfaul, false if error
+ // @tag - two character tag name
+ // @type - single character tag type (see SAM/BAM spec for details)
+ // @value - new value for tag
+ bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
+ bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i
+ bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i
+ bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f
+
+ // specific tag data access methods - these only remain for legacy support
+ // returns whether specific tag could be retrieved
+ bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (equivalent to GetTag("NM", editDistance))
+ bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (equivalent to GetTag("RG", readGroup))
+
+ // generic tag data access methods
+ // returns whether tag is found & tag type is compatible with DESTINATION
+ // @tag - two character tag name
+ // @destination - if found, tag value is stored here
+ bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings
+ bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data
+ bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data
+ bool GetTag(const std::string& tag, float& destination) const; // access floating point data
+
+ // retrieve the tag type code for TAG
+ // returns true if tag could be found and type determined
+ bool GetTagType(const std::string& tag, char& type) const;
+
+ // remove tag data
+ // returns true if removal was successful, false if error
+ // N.B. - returns false if TAG does not exist (no removal can occur)
+ // @tag - two character tag name
+ bool RemoveTag(const std::string& tag);
+
+ // Additional data access methods
+ public:
+ // calculates & returns alignment end position, based on starting position and CIGAR operations
+ // @usePadded - if true, counts inserted bases. Default is false, so that alignment end position matches the last base's position in reference
+ // @zeroBased - if true, returns 0-based coordinate; else returns 1-based. Setting this to false is useful when using BAM data along with other, half-open formats.
+ int GetEndPosition(bool usePadded = false, bool zeroBased = true) const;
+
+ // 'internal' utility methods
+ private:
+ static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed);
+ static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);
+
+ // Data members
+ public:
+ std::string Name; // Read name
+ int32_t Length; // Query length
+ std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)
+ std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)
+ std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
+ std::string TagData; // Tag data (accessor methods will pull the requested information out)
+ int32_t RefID; // ID number for reference sequence
+ int32_t Position; // Position (0-based) where alignment starts
+ uint16_t Bin; // Bin in BAM file where this alignment resides
+ uint16_t MapQuality; // Mapping quality score
+ uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate
+ std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
+ int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
+ int32_t MatePosition; // Position (0-based) where alignment's mate starts
+ int32_t InsertSize; // Mate-pair insert size
+
+ // internal data
+ private:
+ struct BamAlignmentSupportData {
+
+ // data members
+ std::string AllCharData;
+ uint32_t BlockLength;
+ uint32_t NumCigarOperations;
+ uint32_t QueryNameLength;
+ uint32_t QuerySequenceLength;
+ bool HasCoreOnly;
+
+ // constructor
+ BamAlignmentSupportData(void)
+ : BlockLength(0)
+ , NumCigarOperations(0)
+ , QueryNameLength(0)
+ , QuerySequenceLength(0)
+ , HasCoreOnly(false)
+ { }
+ };
+
+ // contains raw character data & lengths
+ BamAlignmentSupportData SupportData;
+
+ // allow these classes access to BamAlignment private members (SupportData)
+ // but client code should not need to touch this data
+ friend class BamReader;
+ friend class BamWriter;
+
+ // Alignment flag query constants
+ // Use the get/set methods above instead
+ private:
+ enum { PAIRED = 1
+ , PROPER_PAIR = 2
+ , UNMAPPED = 4
+ , MATE_UNMAPPED = 8
+ , REVERSE = 16
+ , MATE_REVERSE = 32
+ , READ_1 = 64
+ , READ_2 = 128
+ , SECONDARY = 256
+ , QC_FAILED = 512
+ , DUPLICATE = 1024
+ };
+};
+
+// convenience typedef(s)
+typedef std::vector<BamAlignment> BamAlignmentVector;
+
+} // namespace BamTools
+
+#endif // BAMALIGNMENT_H
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 16 September 2010 (DB)\r
+// Last modified: 18 September 2010 (DB)\r
// ---------------------------------------------------------------------------\r
-// Provides the basic constants, data structures, etc. for using BAM files\r
+// Provides the basic constants, data structures, utilities etc. \r
+// used throughout the API for handling BAM files\r
// ***************************************************************************\r
\r
#ifndef BAMAUX_H\r
#define BAMAUX_H\r
\r
-// C inclues\r
-#include <cctype>\r
-#include <cstdio>\r
-#include <cstdlib>\r
-#include <cstring>\r
-\r
-// C++ includes\r
-#include <exception>\r
-#include <fstream>\r
+#include <fstream> \r
#include <iostream>\r
-#include <map>\r
#include <string>\r
-#include <utility>\r
#include <vector>\r
\r
+// ----------------------------------------------------------------\r
+// ----------------------------------------------------------------\r
// Platform-specific type definitions\r
+\r
#ifndef BAMTOOLS_TYPES\r
#define BAMTOOLS_TYPES\r
#ifdef _MSC_VER\r
\r
namespace BamTools {\r
\r
+// ----------------------------------------------------------------\r
+// ----------------------------------------------------------------\r
// BAM constants\r
+\r
const int BAM_CMATCH = 0;\r
const int BAM_CINS = 1;\r
const int BAM_CDEL = 2;\r
const int BAM_CORE_SIZE = 32;\r
const int BT_SIZEOF_INT = 4;\r
\r
-struct CigarOp;\r
-\r
-struct BamAlignment {\r
-\r
- // constructors & destructor\r
- public:\r
- BamAlignment(void);\r
- BamAlignment(const BamAlignment& other);\r
- ~BamAlignment(void);\r
-\r
- // Queries against alignment flags\r
- public: \r
- bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate \r
- bool IsFailedQC(void) const; // Returns true if this read failed quality control \r
- bool IsFirstMate(void) const; // Returns true if alignment is first mate on read \r
- bool IsMapped(void) const; // Returns true if alignment is mapped \r
- bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped \r
- bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand \r
- bool IsPaired(void) const; // Returns true if alignment part of paired-end read \r
- bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment \r
- bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution \r
- bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand\r
- bool IsSecondMate(void) const; // Returns true if alignment is second mate on read\r
-\r
- // Manipulate alignment flags\r
- public: \r
- void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag \r
- void SetIsFailedQC(bool ok); // Sets "failed quality control" flag \r
- void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag \r
- void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is mapped" flag \r
- void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag \r
- void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag \r
- void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag \r
- void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag \r
- void SetIsSecondaryAlignment(bool ok); // Sets "position is primary alignment" flag \r
- void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag \r
- void SetIsUnmapped(bool ok); // Sets "alignment is mapped" flag\r
-\r
- // Tag data access methods\r
- public:\r
- // -------------------------------------------------------------------------------------\r
- // N.B. - The following tag-modifying methods may not be used on BamAlignments fetched\r
- // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in \r
- // error message (to keep output clean) but will ALWAYS return false. Only user-\r
- // generated BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid.\r
-\r
- // add tag data (create new TAG entry with TYPE and VALUE)\r
- // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details\r
- // returns true if new data added, false if error or TAG already exists\r
- // N.B. - will NOT modify existing tag. Use EditTag() instead\r
- bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H\r
- bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i\r
- bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i\r
- bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f\r
- \r
- // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present)\r
- // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details\r
- // returns true if edit was successfaul, false if error\r
- bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H\r
- bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i\r
- bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i\r
- bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f\r
-\r
- // specific tag data access methods - these only remain for legacy support\r
- bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (implemented as GetTag("NM", editDistance))\r
- bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (implemented as GetTag("RG", readGroup)) \r
- \r
- // generic tag data access methods \r
- bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings \r
- bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data\r
- bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data\r
- bool GetTag(const std::string& tag, float& destination) const; // access floating point data\r
- \r
- // remove tag data\r
- // returns true if removal was successful, false if error\r
- // N.B. - returns false if TAG does not exist (no removal can occur)\r
- bool RemoveTag(const std::string& tag);\r
-\r
- // Additional data access methods\r
- public:\r
- // calculates alignment end position, based on starting position and CIGAR operations\r
- // @zeroBased - if true, returns 0-based coordinate; else returns 1-based\r
- int GetEndPosition(bool usePadded = false, bool zeroBased = true) const; \r
-\r
- // 'internal' utility methods \r
- private:\r
- static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed);\r
- static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);\r
-\r
- // Data members\r
- public:\r
- std::string Name; // Read name\r
- int32_t Length; // Query length\r
- std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)\r
- std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)\r
- std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)\r
- std::string TagData; // Tag data (accessor methods will pull the requested information out)\r
- int32_t RefID; // ID number for reference sequence\r
- int32_t Position; // Position (0-based) where alignment starts\r
- uint16_t Bin; // Bin in BAM file where this alignment resides\r
- uint16_t MapQuality; // Mapping quality score\r
- uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate \r
- std::vector<CigarOp> CigarData; // CIGAR operations for this alignment\r
- int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned\r
- int32_t MatePosition; // Position (0-based) where alignment's mate starts\r
- int32_t InsertSize; // Mate-pair insert size\r
- \r
- // internal data\r
- private:\r
- struct BamAlignmentSupportData {\r
- \r
- // data members\r
- std::string AllCharData;\r
- uint32_t BlockLength;\r
- uint32_t NumCigarOperations;\r
- uint32_t QueryNameLength;\r
- uint32_t QuerySequenceLength;\r
- bool HasCoreOnly;\r
- \r
- // constructor\r
- BamAlignmentSupportData(void)\r
- : BlockLength(0)\r
- , NumCigarOperations(0)\r
- , QueryNameLength(0)\r
- , QuerySequenceLength(0)\r
- , HasCoreOnly(false)\r
- { }\r
- };\r
- \r
- // contains raw character data & lengths\r
- BamAlignmentSupportData SupportData; \r
- \r
- // allow these classes access to BamAlignment private members (SupportData)\r
- // but client code should not need to touch this data\r
- friend class BamReader;\r
- friend class BamWriter;\r
-\r
- // Alignment flag query constants\r
- // Use the get/set methods above instead\r
- private:\r
- enum { PAIRED = 1\r
- , PROPER_PAIR = 2\r
- , UNMAPPED = 4\r
- , MATE_UNMAPPED = 8\r
- , REVERSE = 16\r
- , MATE_REVERSE = 32\r
- , READ_1 = 64\r
- , READ_2 = 128\r
- , SECONDARY = 256\r
- , QC_FAILED = 512\r
- , DUPLICATE = 1024 \r
- };\r
-};\r
-\r
// ----------------------------------------------------------------\r
-// Auxiliary data structs & typedefs\r
+// ----------------------------------------------------------------\r
+// Data structs & typedefs\r
\r
+// CIGAR operation data structure\r
struct CigarOp {\r
\r
// data members\r
{ }\r
};\r
\r
+// Reference data entry\r
struct RefData {\r
\r
// data members\r
, RefHasAlignments(ok)\r
{ }\r
};\r
+typedef std::vector<RefData> RefVector;\r
\r
-typedef std::vector<RefData> RefVector;\r
-typedef std::vector<BamAlignment> BamAlignmentVector;\r
-\r
+// General (sequential) genome region\r
struct BamRegion {\r
\r
// data members\r
};\r
\r
// ----------------------------------------------------------------\r
-// Added: 3-35-2010 DWB\r
-// Fixed: Routines to provide endian-correctness\r
// ----------------------------------------------------------------\r
+// General utilities \r
\r
// returns true if system is big endian\r
inline bool SystemIsBigEndian(void) {\r
SwapEndian_64(value);\r
}\r
\r
+// returns whether file exists (can be opened OK)\r
inline bool FileExists(const std::string& filename) {\r
std::ifstream f(filename.c_str(), std::ifstream::in);\r
return !f.fail();\r
}\r
\r
-// ----------------------------------------------------------------\r
-// BamAlignment member methods\r
-\r
-// constructors & destructor\r
-inline BamAlignment::BamAlignment(void) { }\r
-\r
-inline BamAlignment::BamAlignment(const BamAlignment& other)\r
- : Name(other.Name)\r
- , Length(other.Length)\r
- , QueryBases(other.QueryBases)\r
- , AlignedBases(other.AlignedBases)\r
- , Qualities(other.Qualities)\r
- , TagData(other.TagData)\r
- , RefID(other.RefID)\r
- , Position(other.Position)\r
- , Bin(other.Bin)\r
- , MapQuality(other.MapQuality)\r
- , AlignmentFlag(other.AlignmentFlag)\r
- , CigarData(other.CigarData)\r
- , MateRefID(other.MateRefID)\r
- , MatePosition(other.MatePosition)\r
- , InsertSize(other.InsertSize)\r
- , SupportData(other.SupportData)\r
-{ }\r
-\r
-inline BamAlignment::~BamAlignment(void) { }\r
-\r
-// Queries against alignment flags\r
-inline bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }\r
-inline bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }\r
-inline bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }\r
-inline bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }\r
-inline bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }\r
-inline bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }\r
-inline bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }\r
-inline bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }\r
-inline bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }\r
-inline bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }\r
-inline bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }\r
-\r
-// Manipulate alignment flags \r
-inline void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; }\r
-inline void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; }\r
-inline void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; }\r
-inline void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; }\r
-inline void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; }\r
-inline void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; }\r
-inline void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; }\r
-inline void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; }\r
-inline void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; }\r
-inline void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; }\r
-inline void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; }\r
-\r
-// calculates alignment end position, based on starting position and CIGAR operations\r
-inline \r
-int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const {\r
-\r
- // initialize alignment end to starting position\r
- int alignEnd = Position;\r
-\r
- // iterate over cigar operations\r
- std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin();\r
- std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end();\r
- for ( ; cigarIter != cigarEnd; ++cigarIter) {\r
- const char cigarType = (*cigarIter).Type;\r
- if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {\r
- alignEnd += (*cigarIter).Length;\r
- } \r
- else if ( usePadded && cigarType == 'I' ) {\r
- alignEnd += (*cigarIter).Length;\r
- }\r
- }\r
- \r
- // adjust for zeroBased, if necessary\r
- if (zeroBased) \r
- return alignEnd - 1;\r
- else \r
- return alignEnd;\r
-}\r
-\r
-inline\r
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type != "Z" && type != "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag already exists, return false\r
- // use EditTag explicitly instead\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;\r
- \r
- // otherwise, copy tag data to temp buffer\r
- std::string newTag = tag + type + value;\r
- const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term\r
- char originalTagData[newTagDataLength];\r
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term\r
- \r
- // append newTag\r
- strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term\r
- \r
- // store temp buffer back in TagData\r
- const char* newTagData = (const char*)originalTagData;\r
- TagData.assign(newTagData, newTagDataLength);\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-inline\r
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type == "f" || type == "Z" || type == "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag already exists, return false\r
- // use EditTag explicitly instead\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;\r
- \r
- // otherwise, convert value to string\r
- union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;\r
- un.value = value;\r
-\r
- // copy original tag data to temp buffer\r
- std::string newTag = tag + type;\r
- const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer\r
- char originalTagData[newTagDataLength];\r
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term\r
- \r
- // append newTag\r
- strcat(originalTagData + tagDataLength, newTag.data());\r
- memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int));\r
- \r
- // store temp buffer back in TagData\r
- const char* newTagData = (const char*)originalTagData;\r
- TagData.assign(newTagData, newTagDataLength);\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-inline\r
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) {\r
- return AddTag(tag, type, (const uint32_t&)value);\r
-}\r
-\r
-inline\r
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type == "Z" || type == "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag already exists, return false\r
- // use EditTag explicitly instead\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;\r
- \r
- // otherwise, convert value to string\r
- union { float value; char valueBuffer[sizeof(float)]; } un;\r
- un.value = value;\r
-\r
- // copy original tag data to temp buffer\r
- std::string newTag = tag + type;\r
- const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float\r
- char originalTagData[newTagDataLength];\r
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term\r
- \r
- // append newTag\r
- strcat(originalTagData + tagDataLength, newTag.data());\r
- memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float));\r
- \r
- // store temp buffer back in TagData\r
- const char* newTagData = (const char*)originalTagData;\r
- TagData.assign(newTagData, newTagDataLength);\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-inline\r
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type != "Z" && type != "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pOriginalTagData = (char*)TagData.data();\r
- char* pTagData = pOriginalTagData;\r
- const unsigned int originalTagDataLength = TagData.size();\r
- \r
- unsigned int newTagDataLength = 0;\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {\r
- \r
- // make sure array is more than big enough\r
- char newTagData[originalTagDataLength + value.size()]; \r
-\r
- // copy original tag data up til desired tag\r
- const unsigned int beginningTagDataLength = numBytesParsed;\r
- newTagDataLength += beginningTagDataLength;\r
- memcpy(newTagData, pOriginalTagData, numBytesParsed);\r
- \r
- // copy new VALUE in place of current tag data\r
- const unsigned int dataLength = strlen(value.c_str());\r
- memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 );\r
- \r
- // skip to next tag (if tag for removal is last, return true) \r
- const char* pTagStorageType = pTagData - 1;\r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;\r
- \r
- // copy everything from current tag (the next one after tag for removal) to end\r
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);\r
- const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1;\r
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;\r
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);\r
- \r
- // ensure null-terminator\r
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;\r
- \r
- // save new tag data\r
- TagData.assign(newTagData, endTagOffset + endTagDataLength);\r
- return true;\r
- }\r
- \r
- // tag not found, attempt AddTag\r
- else return AddTag(tag, type, value);\r
-}\r
-\r
-inline\r
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type == "f" || type == "Z" || type == "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pOriginalTagData = (char*)TagData.data();\r
- char* pTagData = pOriginalTagData;\r
- const unsigned int originalTagDataLength = TagData.size();\r
- \r
- unsigned int newTagDataLength = 0;\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {\r
- \r
- // make sure array is more than big enough\r
- char newTagData[originalTagDataLength + sizeof(value)]; \r
-\r
- // copy original tag data up til desired tag\r
- const unsigned int beginningTagDataLength = numBytesParsed;\r
- newTagDataLength += beginningTagDataLength;\r
- memcpy(newTagData, pOriginalTagData, numBytesParsed);\r
- \r
- // copy new VALUE in place of current tag data\r
- union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;\r
- un.value = value;\r
- memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int));\r
- \r
- // skip to next tag (if tag for removal is last, return true) \r
- const char* pTagStorageType = pTagData - 1;\r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;\r
- \r
- // copy everything from current tag (the next one after tag for removal) to end\r
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);\r
- const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int);\r
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;\r
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);\r
- \r
- // ensure null-terminator\r
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;\r
- \r
- // save new tag data\r
- TagData.assign(newTagData, endTagOffset + endTagDataLength);\r
- return true;\r
- }\r
- \r
- // tag not found, attempt AddTag\r
- else return AddTag(tag, type, value);\r
-}\r
-\r
-inline\r
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) {\r
- return EditTag(tag, type, (const uint32_t&)value);\r
-}\r
-\r
-inline\r
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type == "Z" || type == "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pOriginalTagData = (char*)TagData.data();\r
- char* pTagData = pOriginalTagData;\r
- const unsigned int originalTagDataLength = TagData.size();\r
- \r
- unsigned int newTagDataLength = 0;\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {\r
- \r
- // make sure array is more than big enough\r
- char newTagData[originalTagDataLength + sizeof(value)]; \r
-\r
- // copy original tag data up til desired tag\r
- const unsigned int beginningTagDataLength = numBytesParsed;\r
- newTagDataLength += beginningTagDataLength;\r
- memcpy(newTagData, pOriginalTagData, numBytesParsed);\r
- \r
- // copy new VALUE in place of current tag data\r
- union { float value; char valueBuffer[sizeof(float)]; } un;\r
- un.value = value;\r
- memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float));\r
- \r
- // skip to next tag (if tag for removal is last, return true) \r
- const char* pTagStorageType = pTagData - 1;\r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;\r
- \r
- // copy everything from current tag (the next one after tag for removal) to end\r
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);\r
- const unsigned int endTagOffset = beginningTagDataLength + sizeof(float);\r
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;\r
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);\r
- \r
- // ensure null-terminator\r
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;\r
- \r
- // save new tag data\r
- TagData.assign(newTagData, endTagOffset + endTagDataLength);\r
- return true;\r
- }\r
- \r
- // tag not found, attempt AddTag\r
- else return AddTag(tag, type, value);\r
-}\r
-\r
-// get "NM" tag data - originally contributed by Aaron Quinlan\r
-// stores data in 'editDistance', returns success/fail\r
-inline \r
-bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { \r
- return GetTag("NM", (uint32_t&)editDistance);\r
-}\r
-\r
-// get "RG" tag data\r
-// stores data in 'readGroup', returns success/fail\r
-inline \r
-bool BamAlignment::GetReadGroup(std::string& readGroup) const {\r
- return GetTag("RG", readGroup);\r
-}\r
-\r
-inline\r
-bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const {\r
-\r
- // make sure tag data exists\r
- if ( SupportData.HasCoreOnly || TagData.empty() ) \r
- return false;\r
-\r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {\r
- const unsigned int dataLength = strlen(pTagData);\r
- destination.clear();\r
- destination.resize(dataLength);\r
- memcpy( (char*)destination.data(), pTagData, dataLength );\r
- return true;\r
- }\r
- \r
- // tag not found, return failure\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const {\r
- \r
- // make sure tag data exists\r
- if ( SupportData.HasCoreOnly || TagData.empty() ) \r
- return false;\r
-\r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, determine data byte-length, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {\r
- \r
- // determine data byte-length\r
- const char type = *(pTagData - 1);\r
- int destinationLength = 0;\r
- switch (type) {\r
- // 1 byte data\r
- case 'A':\r
- case 'c':\r
- case 'C':\r
- destinationLength = 1;\r
- break;\r
-\r
- // 2 byte data\r
- case 's':\r
- case 'S':\r
- destinationLength = 2;\r
- break;\r
-\r
- // 4 byte data\r
- case 'i':\r
- case 'I':\r
- destinationLength = 4;\r
- break;\r
-\r
- // unsupported type for integer destination (float or var-length strings)\r
- case 'f':\r
- case 'Z':\r
- case 'H':\r
- fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type);\r
- return false;\r
-\r
- // unknown tag type\r
- default:\r
- fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);\r
- return false;\r
- }\r
- \r
- // store in destination\r
- destination = 0;\r
- memcpy(&destination, pTagData, destinationLength);\r
- return true;\r
- }\r
- \r
- // tag not found, return failure\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const {\r
- return GetTag(tag, (uint32_t&)destination);\r
-}\r
-\r
-inline\r
-bool BamAlignment::GetTag(const std::string& tag, float& destination) const {\r
- \r
- // make sure tag data exists\r
- if ( SupportData.HasCoreOnly || TagData.empty() ) \r
- return false;\r
-\r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, determine data byte-length, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {\r
- //pTagData += numBytesParsed;\r
- \r
- // determine data byte-length\r
- const char type = *(pTagData - 1);\r
- int destinationLength = 0;\r
- switch(type) {\r
-\r
- // 1 byte data\r
- case 'A':\r
- case 'c':\r
- case 'C':\r
- destinationLength = 1;\r
- break;\r
-\r
- // 2 byte data\r
- case 's':\r
- case 'S':\r
- destinationLength = 2;\r
- break;\r
-\r
- // 4 byte data\r
- case 'f':\r
- case 'i':\r
- case 'I':\r
- destinationLength = 4;\r
- break;\r
- \r
- // unsupported type (var-length strings)\r
- case 'Z':\r
- case 'H':\r
- fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type);\r
- return false;\r
-\r
- // unknown tag type\r
- default:\r
- fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);\r
- return false;\r
- }\r
- \r
- // store in destination\r
- destination = 0.0;\r
- memcpy(&destination, pTagData, destinationLength);\r
- return true;\r
- }\r
- \r
- // tag not found, return failure\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::RemoveTag(const std::string& tag) {\r
- \r
- // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed\r
- // also, return false if no data present to remove\r
- if ( SupportData.HasCoreOnly || TagData.empty() ) return false;\r
- \r
- // localize the tag data\r
- char* pOriginalTagData = (char*)TagData.data();\r
- char* pTagData = pOriginalTagData;\r
- const unsigned int originalTagDataLength = TagData.size();\r
- unsigned int newTagDataLength = 0;\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {\r
- \r
- char newTagData[originalTagDataLength];\r
-\r
- // copy original tag data up til desired tag\r
- pTagData -= 3;\r
- numBytesParsed -= 3;\r
- const unsigned int beginningTagDataLength = numBytesParsed;\r
- newTagDataLength += beginningTagDataLength;\r
- memcpy(newTagData, pOriginalTagData, numBytesParsed);\r
- \r
- // skip to next tag (if tag for removal is last, return true) \r
- const char* pTagStorageType = pTagData + 2;\r
- pTagData += 3;\r
- numBytesParsed += 3;\r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;\r
- \r
- // copy everything from current tag (the next one after tag for removal) to end\r
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);\r
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;\r
- memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength );\r
- \r
- // save new tag data\r
- TagData.assign(newTagData, beginningTagDataLength + endTagDataLength);\r
- return true;\r
- }\r
- \r
- // tag not found, no removal - return failure\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) {\r
-\r
- while ( numBytesParsed < tagDataLength ) {\r
-\r
- const char* pTagType = pTagData;\r
- const char* pTagStorageType = pTagData + 2;\r
- pTagData += 3;\r
- numBytesParsed += 3;\r
-\r
- // check the current tag, return true on match\r
- if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 ) \r
- return true;\r
-\r
- // get the storage class and find the next tag\r
- if ( *pTagStorageType == '\0' ) return false; \r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;\r
- if ( *pTagData == '\0' ) return false;\r
- }\r
- \r
- // checked all tags, none match\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {\r
- \r
- switch(storageType) {\r
-\r
- case 'A':\r
- case 'c':\r
- case 'C':\r
- ++numBytesParsed;\r
- ++pTagData;\r
- break;\r
-\r
- case 's':\r
- case 'S':\r
- numBytesParsed += 2;\r
- pTagData += 2;\r
- break;\r
-\r
- case 'f':\r
- case 'i':\r
- case 'I':\r
- numBytesParsed += 4;\r
- pTagData += 4;\r
- break;\r
-\r
- case 'Z':\r
- case 'H':\r
- while(*pTagData) {\r
- ++numBytesParsed;\r
- ++pTagData;\r
- }\r
- // increment for null-terminator\r
- ++numBytesParsed;\r
- ++pTagData;\r
- break;\r
-\r
- default: \r
- // error case\r
- fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType);\r
- return false;\r
- }\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
} // namespace BamTools\r
\r
#endif // BAMAUX_H\r
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 17 September 2010 (DB)
+// Last modified: 18 September 2010 (DB)
// ---------------------------------------------------------------------------
// Provides index functionality - both for the standardized BAM index format
// (".bai") as well as a BamTools-specific (nonstandard) index format (".bti").
#include <iostream>
#include <string>
#include <vector>
-#include "BamAux.h"
+#include "BamAlignment.h"
namespace BamTools {
// ***************************************************************************
-// BamMultiReader.cpp (c) 2010 Erik Garrison
+// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 3 September 2010 (DB)
+// Last modified: 18 September 2010 (DB)
// ---------------------------------------------------------------------------
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
// Institute.
// precludes the need to sort merged files.
// ***************************************************************************
-// C++ includes
#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
-
-// BamTools includes
#include "BGZF.h"
#include "BamMultiReader.h"
using namespace BamTools;
}
// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail
-bool BamMultiReader::CreateIndexes(bool useDefaultIndex) {
+bool BamMultiReader::CreateIndexes(bool useStandardIndex) {
bool result = true;
for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
BamReader* reader = it->first;
- result &= reader->CreateIndex(useDefaultIndex);
+ result &= reader->CreateIndex(useStandardIndex);
}
return result;
}
// foreach extraction entry (each BAM file)
for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) {
- map<string, bool> currentFileReadGroups;
-
BamReader* reader = rs->first;
-
- stringstream header(reader->GetHeaderText());
+ string headerText = reader->GetHeaderText();
+ if ( headerText.empty() ) continue;
+
+ map<string, bool> currentFileReadGroups;
+ stringstream header(headerText);
vector<string> lines;
string item;
while (getline(header, item))
}
// opens BAM files
-bool BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) {
+bool BamMultiReader::Open(const vector<string>& filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) {
// for filename in filenames
fileNames = filenames; // save filenames in our multireader
// ***************************************************************************\r
-// BamMultiReader.h (c) 2010 Erik Garrison\r
+// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 3 September 2010 (DB)\r
+// Last modified: 18 September 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Functionality for simultaneously reading multiple BAM files\r
// ***************************************************************************\r
#ifndef BAMMULTIREADER_H\r
#define BAMMULTIREADER_H\r
\r
-// C++ includes\r
#include <string>\r
#include <map>\r
-#include <utility> // for pair\r
+#include <utility>\r
#include <sstream>\r
-\r
-using namespace std;\r
-\r
-// BamTools includes\r
-#include "BamAux.h"\r
#include "BamReader.h"\r
\r
namespace BamTools {\r
\r
// index mapping reference/position pairings to bamreaders and their alignments\r
-typedef multimap<pair<int, int>, pair<BamReader*, BamAlignment*> > AlignmentIndex;\r
-\r
+typedef std::multimap<std::pair<int, int>, std::pair<BamReader*, BamAlignment*> > AlignmentIndex;\r
\r
class BamMultiReader {\r
\r
// also useful for merging\r
// @preferStandardIndex - look for standard BAM index ".bai" first. If false, \r
// will look for BamTools index ".bti". \r
- bool Open(const vector<string> filenames, bool openIndexes = true, bool coreMode = false, bool preferStandardIndex = false);\r
+ bool Open(const std::vector<std::string>& filenames, bool openIndexes = true, bool coreMode = false, bool preferStandardIndex = false);\r
\r
// returns whether underlying BAM readers ALL have an index loaded\r
// this is useful to indicate whether Jump() or SetRegion() are possible\r
// ----------------------\r
\r
// returns unified SAM header text for all files\r
- const string GetHeaderText(void) const;\r
+ const std::string GetHeaderText(void) const;\r
// returns number of reference sequences\r
const int GetReferenceCount(void) const;\r
// returns vector of reference objects\r
// ----------------------\r
\r
// creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai")\r
- bool CreateIndexes(bool useDefaultIndex = true);\r
+ bool CreateIndexes(bool useStandardIndex = true);\r
\r
//const int GetReferenceID(const string& refName) const;\r
\r
private:\r
\r
// the set of readers and alignments which we operate on, maintained throughout the life of this class\r
- vector<pair<BamReader*, BamAlignment*> > readers;\r
+ std::vector<std::pair<BamReader*, BamAlignment*> > readers;\r
\r
// readers and alignments sorted by reference id and position, to keep track of the lowest (next) alignment\r
// when a reader reaches EOF, its entry is removed from this index\r
AlignmentIndex alignments;\r
\r
- vector<string> fileNames;\r
+ std::vector<std::string> fileNames;\r
};\r
\r
} // namespace BamTools\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 3 September 2010 (DB)\r
+// Last modified: 18 September 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
#ifndef BAMREADER_H\r
#define BAMREADER_H\r
\r
-// C++ includes\r
#include <string>\r
-\r
-// BamTools includes\r
-#include "BamAux.h"\r
+#include "BamAlignment.h"\r
\r
namespace BamTools {\r
\r
bool IsIndexLoaded(void) const;\r
// returns whether reader is open for reading or not\r
bool IsOpen(void) const;\r
- // performs random-access jump to reference, position\r
+ // performs random-access jump using (reference, position) as a left-bound\r
bool Jump(int refID, int position = 0);\r
// opens BAM file (and optional BAM index file, if provided)\r
// @lookForIndex - if no indexFilename provided, look for an existing index file\r
bool GetNextAlignment(BamAlignment& bAlignment);\r
\r
// retrieves next available alignment core data (returns success/fail)\r
- // ** DOES NOT parse any character data (read name, bases, qualities, tag data)\r
- // these can be accessed, if necessary, from the supportData \r
- // useful for operations requiring ONLY positional or other alignment-related information\r
+ // ** DOES NOT parse any character data (read name, bases, qualities, tag data) **\r
+ // useful for operations requiring ONLY aligner-related information (refId/position, alignment flags, CIGAR, mapQuality, etc)\r
bool GetNextAlignmentCore(BamAlignment& bAlignment);\r
\r
// ----------------------\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 17 August 2010 (DB)\r
+// Last modified: 18 September 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
#ifndef BAMWRITER_H\r
#define BAMWRITER_H\r
\r
-// C++ includes\r
#include <string>\r
-\r
-// BamTools includes\r
-#include "BamAux.h"\r
+#include "BamAlignment.h"\r
\r
namespace BamTools {\r
\r
# define our source and object files
# ----------------------------------
SOURCES = BGZF.cpp \
+ BamAlignment.cpp \
BamIndex.cpp \
BamReader.cpp \
BamMultiReader.cpp \
bamtools_merge.cpp \
bamtools_random.cpp \
bamtools_sort.cpp \
+ bamtools_split.cpp \
bamtools_stats.cpp \
bamtools.cpp
OBJECTS= $(SOURCES:.cpp=.o)
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 22 July 2010
+// Last modified: 19 September 2010
// ---------------------------------------------------------------------------
// Integrates a number of BamTools functionalities into a single executable.
// ***************************************************************************
-// Std C/C++ includes
#include <iostream>
-
-// BamTools includes
#include "bamtools_convert.h"
#include "bamtools_count.h"
#include "bamtools_coverage.h"
#include "bamtools_merge.h"
#include "bamtools_random.h"
#include "bamtools_sort.h"
+#include "bamtools_split.h"
#include "bamtools_stats.h"
-
using namespace std;
using namespace BamTools;
static const string MERGE = "merge";
static const string RANDOM = "random";
static const string SORT = "sort";
+static const string SPLIT = "split";
static const string STATS = "stats";
// ------------------------------------------
static const string LONG_VERSION = "--version";
static const string SHORT_VERSION = "-v";
+// ------------------------------------------
+// Subtool factory method
+AbstractTool* CreateTool(const string& arg) {
+
+ // determine tool type based on arg
+ if ( arg == CONVERT ) return new ConvertTool;
+ if ( arg == COUNT ) return new CountTool;
+ if ( arg == COVERAGE ) return new CoverageTool;
+ if ( arg == FILTER ) return new FilterTool;
+ if ( arg == HEADER ) return new HeaderTool;
+ if ( arg == INDEX ) return new IndexTool;
+ if ( arg == MERGE ) return new MergeTool;
+ if ( arg == RANDOM ) return new RandomTool;
+ if ( arg == SORT ) return new SortTool;
+ if ( arg == SPLIT ) return new SplitTool;
+ if ( arg == STATS ) return new StatsTool;
+
+ // unknown arg
+ return 0;
+}
+
// ------------------------------------------
// Print help info
int Help(int argc, char* argv[]) {
// 'bamtools help COMMAND'
if (argc > 2) {
- AbstractTool* tool(0);
- if ( argv[2] == CONVERT ) tool = new ConvertTool;
- if ( argv[2] == COUNT ) tool = new CountTool;
- if ( argv[2] == COVERAGE ) tool = new CoverageTool;
- if ( argv[2] == FILTER ) tool = new FilterTool;
- if ( argv[2] == HEADER ) tool = new HeaderTool;
- if ( argv[2] == INDEX ) tool = new IndexTool;
- if ( argv[2] == MERGE ) tool = new MergeTool;
- if ( argv[2] == RANDOM ) tool = new RandomTool;
- if ( argv[2] == SORT ) tool = new SortTool;
- if ( argv[2] == STATS ) tool = new StatsTool;
+ AbstractTool* tool = CreateTool( argv[2] );
+// if ( argv[2] == CONVERT ) tool = new ConvertTool;
+// if ( argv[2] == COUNT ) tool = new CountTool;
+// if ( argv[2] == COVERAGE ) tool = new CoverageTool;
+// if ( argv[2] == FILTER ) tool = new FilterTool;
+// if ( argv[2] == HEADER ) tool = new HeaderTool;
+// if ( argv[2] == INDEX ) tool = new IndexTool;
+// if ( argv[2] == MERGE ) tool = new MergeTool;
+// if ( argv[2] == RANDOM ) tool = new RandomTool;
+// if ( argv[2] == SORT ) tool = new SortTool;
+// if ( argv[2] == SPLIT ) tool = new SplitTool;
+// if ( argv[2] == STATS ) tool = new StatsTool;
// if tool known, print its help screen
if ( tool ) return tool->Help();
cerr << "\tmerge Merge multiple BAM files into single file" << endl;
cerr << "\trandom Select random alignments from existing BAM file(s)" << endl;
cerr << "\tsort Sorts the BAM file according to some criteria" << endl;
+ cerr << "\tsplit Splits a BAM file on user-specified property, creating a new BAM output file for each value found" << endl;
cerr << "\tstats Prints some basic statistics from input BAM file(s)" << endl;
cerr << endl;
cerr << "See 'bamtools help COMMAND' for more information on a specific command." << endl;
if ( (argv[1] == VERSION) || (argv[1] == LONG_VERSION) || (argv[1] == SHORT_VERSION) ) return Version();
// determine desired sub-tool
- AbstractTool* tool(0);
- if ( argv[1] == CONVERT ) tool = new ConvertTool;
- if ( argv[1] == COUNT ) tool = new CountTool;
- if ( argv[1] == COVERAGE ) tool = new CoverageTool;
- if ( argv[1] == FILTER ) tool = new FilterTool;
- if ( argv[1] == HEADER ) tool = new HeaderTool;
- if ( argv[1] == INDEX ) tool = new IndexTool;
- if ( argv[1] == MERGE ) tool = new MergeTool;
- if ( argv[1] == RANDOM ) tool = new RandomTool;
- if ( argv[1] == SORT ) tool = new SortTool;
- if ( argv[1] == STATS ) tool = new StatsTool;
+ AbstractTool* tool = CreateTool( argv[1] );
+// if ( argv[1] == CONVERT ) tool = new ConvertTool;
+// if ( argv[1] == COUNT ) tool = new CountTool;
+// if ( argv[1] == COVERAGE ) tool = new CoverageTool;
+// if ( argv[1] == FILTER ) tool = new FilterTool;
+// if ( argv[1] == HEADER ) tool = new HeaderTool;
+// if ( argv[1] == INDEX ) tool = new IndexTool;
+// if ( argv[1] == MERGE ) tool = new MergeTool;
+// if ( argv[1] == RANDOM ) tool = new RandomTool;
+// if ( argv[1] == SORT ) tool = new SortTool;
+// if ( argv[1] == SPLIT ) tool = new SplitTool;
+// if ( argv[1] == STATS ) tool = new StatsTool;
// if found, run tool
if ( tool ) return tool->Run(argc, argv);
--- /dev/null
+// ***************************************************************************
+// bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 September 2010 (DB)
+// ---------------------------------------------------------------------------
+//
+// ***************************************************************************
+
+#include <ctime>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "bamtools_split.h"
+#include "bamtools_options.h"
+#include "bamtools_variant.h"
+#include "BamReader.h"
+#include "BamWriter.h"
+using namespace std;
+using namespace BamTools;
+
+namespace BamTools {
+
+ // string constants
+ static const string SPLIT_MAPPED_TOKEN = ".MAPPED";
+ static const string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED";
+ static const string SPLIT_PAIRED_TOKEN = ".PAIRED_END";
+ static const string SPLIT_SINGLE_TOKEN = ".SINGLE_END";
+ static const string SPLIT_REFERENCE_TOKEN = ".REF_";
+
+ string GetTimestampString(void) {
+
+ // get human readable timestamp
+ time_t currentTime;
+ time(¤tTime);
+ stringstream timeStream("");
+ timeStream << ctime(¤tTime);
+
+ // convert whitespace to '_'
+ string timeString = timeStream.str();
+ size_t found = timeString.find(" ");
+ while (found != string::npos) {
+ timeString.replace(found, 1, "_");
+ found = timeString.find(" ", found+1);
+ }
+ return timeString;
+ }
+
+ // remove copy of filename without extension
+ // (so /path/to/file.txt becomes /path/to/file )
+ string RemoveFilenameExtension(const string& filename) {
+ size_t found = filename.rfind(".");
+ return filename.substr(0, found);
+ }
+
+} // namespace BamTools
+
+// ---------------------------------------------
+// SplitSettings implementation
+
+struct SplitTool::SplitSettings {
+
+ // flags
+ bool HasInputFilename;
+ bool HasCustomOutputStub;
+ bool IsSplittingMapped;
+ bool IsSplittingPaired;
+ bool IsSplittingReference;
+ bool IsSplittingTag;
+
+ // string args
+ string CustomOutputStub;
+ string InputFilename;
+ string TagToSplit;
+
+ // constructor
+ SplitSettings(void)
+ : HasInputFilename(false)
+ , HasCustomOutputStub(false)
+ , IsSplittingMapped(false)
+ , IsSplittingPaired(false)
+ , IsSplittingReference(false)
+ , IsSplittingTag(false)
+ , CustomOutputStub("")
+ , InputFilename(Options::StandardIn())
+ , TagToSplit("")
+ { }
+};
+
+// ---------------------------------------------
+// SplitToolPrivate declaration
+
+class SplitTool::SplitToolPrivate {
+
+ // ctor & dtor
+ public:
+ SplitToolPrivate(SplitTool::SplitSettings* settings);
+ ~SplitToolPrivate(void);
+
+ // 'public' interface
+ public:
+ bool Run(void);
+
+ // internal methods
+ private:
+ void DetermineOutputFilenameStub(void);
+ bool OpenReader(void);
+ bool SplitMapped(void);
+ bool SplitPaired(void);
+ bool SplitReference(void);
+ bool SplitTag(void);
+ bool SplitTag_Int(BamAlignment& al);
+ bool SplitTag_UInt(BamAlignment& al);
+ bool SplitTag_Real(BamAlignment& al);
+ bool SplitTag_String(BamAlignment& al);
+
+ // data members
+ private:
+ SplitTool::SplitSettings* m_settings;
+ string m_outputFilenameStub;
+ BamReader m_reader;
+ string m_header;
+ RefVector m_references;
+};
+
+// constructor
+SplitTool::SplitToolPrivate::SplitToolPrivate(SplitTool::SplitSettings* settings)
+ : m_settings(settings)
+{ }
+
+// destructor
+SplitTool::SplitToolPrivate::~SplitToolPrivate(void) {
+ m_reader.Close();
+}
+
+void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) {
+
+ // if user supplied output filename stub, use that
+ if ( m_settings->HasCustomOutputStub )
+ m_outputFilenameStub = m_settings->CustomOutputStub;
+
+ // else if user supplied input BAM filename, use that (minus ".bam" extension) as stub
+ else if ( m_settings->HasInputFilename )
+ m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename);
+
+ // otherwise, user did not specify -stub, and input is coming from STDIN
+ // generate stub from timestamp
+ else m_outputFilenameStub = GetTimestampString();
+}
+
+bool SplitTool::SplitToolPrivate::OpenReader(void) {
+ if ( !m_reader.Open(m_settings->InputFilename) ) {
+ cerr << "ERROR: SplitTool could not open BAM file: " << m_settings->InputFilename << endl;
+ return false;
+ }
+ m_header = m_reader.GetHeaderText();
+ m_references = m_reader.GetReferenceData();
+ return true;
+}
+
+bool SplitTool::SplitToolPrivate::Run(void) {
+
+ // determine output stub
+ DetermineOutputFilenameStub();
+
+ // open up BamReader
+ if ( !OpenReader() ) return false;
+
+ // determine split type from settings
+ if ( m_settings->IsSplittingMapped ) return SplitMapped();
+ if ( m_settings->IsSplittingPaired ) return SplitPaired();
+ if ( m_settings->IsSplittingReference ) return SplitReference();
+ if ( m_settings->IsSplittingTag ) return SplitTag();
+
+ // if we get here, no property was specified
+ cerr << "No property given to split on... Please use -mapped, -paired, -reference, or -tag TAG to specifiy split behavior." << endl;
+ return false;
+}
+
+bool SplitTool::SplitToolPrivate::SplitMapped(void) {
+
+ // set up splitting data structure
+ map<bool, BamWriter*> outputFiles;
+ map<bool, BamWriter*>::iterator writerIter;
+
+ // iterate through alignments
+ BamAlignment al;
+ BamWriter* writer;
+ bool isCurrentAlignmentMapped;
+ while ( m_reader.GetNextAlignment(al) ) {
+
+ // see if bool value exists
+ isCurrentAlignmentMapped = al.IsMapped();
+ writerIter = outputFiles.find(isCurrentAlignmentMapped);
+
+ // if no writer associated with this value
+ if ( writerIter == outputFiles.end() ) {
+
+ // open new BamWriter
+ writer = new BamWriter;
+ const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped ? SPLIT_MAPPED_TOKEN : SPLIT_UNMAPPED_TOKEN ) + ".bam";
+ writer->Open(outputFilename, m_header, m_references);
+
+ // store in map
+ outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) );
+ }
+
+ // else grab corresponding writer
+ else writer = (*writerIter).second;
+
+ // store alignment in proper BAM output file
+ if ( writer )
+ writer->SaveAlignment(al);
+ }
+
+ // clean up BamWriters
+ map<bool, BamWriter*>::iterator writerEnd = outputFiles.end();
+ for ( writerIter = outputFiles.begin() ; writerIter != writerEnd; ++writerIter ) {
+ BamWriter* writer = (*writerIter).second;
+ if ( writer == 0 ) continue;
+ writer->Close();
+ delete writer;
+ writer = 0;
+ }
+
+ // return success
+ return true;
+}
+
+bool SplitTool::SplitToolPrivate::SplitPaired(void) {
+
+ // set up splitting data structure
+ map<bool, BamWriter*> outputFiles;
+ map<bool, BamWriter*>::iterator writerIter;
+
+ // iterate through alignments
+ BamAlignment al;
+ BamWriter* writer;
+ bool isCurrentAlignmentPaired;
+ while ( m_reader.GetNextAlignment(al) ) {
+
+ // see if bool value exists
+ isCurrentAlignmentPaired = al.IsPaired();
+ writerIter = outputFiles.find(isCurrentAlignmentPaired);
+
+ // if no writer associated with this value
+ if ( writerIter == outputFiles.end() ) {
+
+ // open new BamWriter
+ writer = new BamWriter;
+ const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired ? SPLIT_PAIRED_TOKEN : SPLIT_SINGLE_TOKEN ) + ".bam";
+ writer->Open(outputFilename, m_header, m_references);
+
+ // store in map
+ outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) );
+ }
+
+ // else grab corresponding writer
+ else writer = (*writerIter).second;
+
+ // store alignment in proper BAM output file
+ if ( writer )
+ writer->SaveAlignment(al);
+ }
+
+ // clean up BamWriters
+ map<bool, BamWriter*>::iterator writerEnd = outputFiles.end();
+ for ( writerIter = outputFiles.begin() ; writerIter != writerEnd; ++writerIter ) {
+ BamWriter* writer = (*writerIter).second;
+ if (writer == 0 ) continue;
+ writer->Close();
+ delete writer;
+ writer = 0;
+ }
+
+ // return success
+ return true;
+}
+
+bool SplitTool::SplitToolPrivate::SplitReference(void) {
+
+ // set up splitting data structure
+ map<int32_t, BamWriter*> outputFiles;
+ map<int32_t, BamWriter*>::iterator writerIter;
+
+ // iterate through alignments
+ BamAlignment al;
+ BamWriter* writer;
+ int32_t currentRefId;
+ while ( m_reader.GetNextAlignment(al) ) {
+
+ // see if bool value exists
+ currentRefId = al.RefID;
+ writerIter = outputFiles.find(currentRefId);
+
+ // if no writer associated with this value
+ if ( writerIter == outputFiles.end() ) {
+
+ // open new BamWriter
+ writer = new BamWriter;
+ const string refName = m_references.at(currentRefId).RefName;
+ const string outputFilename = m_outputFilenameStub + SPLIT_REFERENCE_TOKEN + refName + ".bam";
+ writer->Open(outputFilename, m_header, m_references);
+
+ // store in map
+ outputFiles.insert( make_pair(currentRefId, writer) );
+ }
+
+ // else grab corresponding writer
+ else writer = (*writerIter).second;
+
+ // store alignment in proper BAM output file
+ if ( writer )
+ writer->SaveAlignment(al);
+ }
+
+ // clean up BamWriters
+ map<int32_t, BamWriter*>::iterator writerEnd = outputFiles.end();
+ for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
+ BamWriter* writer = (*writerIter).second;
+ if (writer == 0 ) continue;
+ writer->Close();
+ delete writer;
+ writer = 0;
+ }
+
+ // return success
+ return true;
+}
+
+bool SplitTool::SplitToolPrivate::SplitTag(void) {
+
+ // iterate through alignments, until we hit TAG
+ BamAlignment al;
+ while ( m_reader.GetNextAlignment(al) ) {
+
+ // look for tag in this alignment and get tag type
+ char tagType(0);
+ if ( !al.GetTagType(m_settings->TagToSplit, tagType) ) continue;
+
+ // request split method based on tag type
+ // pass it the current alignment found
+ switch (tagType) {
+
+ case 'c' :
+ case 's' :
+ case 'i' :
+ return SplitTag_Int(al);
+
+ case 'C' :
+ case 'S' :
+ case 'I' :
+ return SplitTag_UInt(al);
+
+ case 'f' :
+ return SplitTag_Real(al);
+
+ case 'A':
+ case 'Z':
+ case 'H':
+ return SplitTag_String(al);
+
+ default:
+ fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", tagType);
+ return false;
+ }
+ }
+
+ // tag not found, but that's not an error - return success
+ return true;
+}
+
+bool SplitTool::SplitToolPrivate::SplitTag_Int(BamAlignment& al) {
+
+ // set up splitting data structure
+ map<int32_t, BamWriter*> outputFiles;
+ map<int32_t, BamWriter*>::iterator writerIter;
+
+ // local variables
+ const string tag = m_settings->TagToSplit;
+ BamWriter* writer;
+ stringstream outputFilenameStream("");
+ int32_t currentValue;
+
+ // retrieve first alignment tag value
+ if ( al.GetTag(tag, currentValue) ) {
+
+ // open new BamWriter, save first alignment
+ writer = new BamWriter;
+ outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
+ writer->Open(outputFilenameStream.str(), m_header, m_references);
+ writer->SaveAlignment(al);
+
+ // store in map
+ outputFiles.insert( make_pair(currentValue, writer) );
+
+ // reset stream
+ outputFilenameStream.str("");
+ }
+
+ // iterate through remaining alignments
+ while ( m_reader.GetNextAlignment(al) ) {
+
+ // skip if this alignment doesn't have TAG
+ if ( !al.GetTag(tag, currentValue) ) continue;
+
+ // look up tag value in map
+ writerIter = outputFiles.find(currentValue);
+
+ // if no writer associated with this value
+ if ( writerIter == outputFiles.end() ) {
+
+ // open new BamWriter
+ writer = new BamWriter;
+ outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
+ writer->Open(outputFilenameStream.str(), m_header, m_references);
+
+ // store in map
+ outputFiles.insert( make_pair(currentValue, writer) );
+
+ // reset stream
+ outputFilenameStream.str("");
+ }
+
+ // else grab corresponding writer
+ else writer = (*writerIter).second;
+
+ // store alignment in proper BAM output file
+ if ( writer )
+ writer->SaveAlignment(al);
+ }
+
+ // clean up BamWriters
+ map<int32_t, BamWriter*>::iterator writerEnd = outputFiles.end();
+ for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
+ BamWriter* writer = (*writerIter).second;
+ if (writer == 0 ) continue;
+ writer->Close();
+ delete writer;
+ writer = 0;
+ }
+
+ // return success
+ return true;
+}
+
+bool SplitTool::SplitToolPrivate::SplitTag_UInt(BamAlignment& al) {
+
+ // set up splitting data structure
+ map<uint32_t, BamWriter*> outputFiles;
+ map<uint32_t, BamWriter*>::iterator writerIter;
+
+ // local variables
+ const string tag = m_settings->TagToSplit;
+ BamWriter* writer;
+ stringstream outputFilenameStream("");
+ uint32_t currentValue;
+
+ int alignmentsIgnored = 0;
+
+ // retrieve first alignment tag value
+ if ( al.GetTag(tag, currentValue) ) {
+
+ // open new BamWriter, save first alignment
+ writer = new BamWriter;
+ outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
+ writer->Open(outputFilenameStream.str(), m_header, m_references);
+ writer->SaveAlignment(al);
+
+ // store in map
+ outputFiles.insert( make_pair(currentValue, writer) );
+
+ // reset stream
+ outputFilenameStream.str("");
+ } else ++alignmentsIgnored;
+
+ // iterate through remaining alignments
+ while ( m_reader.GetNextAlignment(al) ) {
+
+ // skip if this alignment doesn't have TAG
+ if ( !al.GetTag(tag, currentValue) ) { ++alignmentsIgnored; continue; }
+
+ // look up tag value in map
+ writerIter = outputFiles.find(currentValue);
+
+ // if no writer associated with this value
+ if ( writerIter == outputFiles.end() ) {
+
+ // open new BamWriter
+ writer = new BamWriter;
+ outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
+ writer->Open(outputFilenameStream.str(), m_header, m_references);
+
+ // store in map
+ outputFiles.insert( make_pair(currentValue, writer) );
+
+ // reset stream
+ outputFilenameStream.str("");
+ }
+
+ // else grab corresponding writer
+ else writer = (*writerIter).second;
+
+ // store alignment in proper BAM output file
+ if ( writer )
+ writer->SaveAlignment(al);
+ }
+
+ // clean up BamWriters
+ map<uint32_t, BamWriter*>::iterator writerEnd = outputFiles.end();
+ for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
+ BamWriter* writer = (*writerIter).second;
+ if (writer == 0 ) continue;
+ writer->Close();
+ delete writer;
+ writer = 0;
+ }
+
+ // return success
+ return true;
+}
+
+bool SplitTool::SplitToolPrivate::SplitTag_Real(BamAlignment& al) {
+
+ // set up splitting data structure
+ map<float, BamWriter*> outputFiles;
+ map<float, BamWriter*>::iterator writerIter;
+
+ // local variables
+ const string tag = m_settings->TagToSplit;
+ BamWriter* writer;
+ stringstream outputFilenameStream("");
+ float currentValue;
+
+ // retrieve first alignment tag value
+ if ( al.GetTag(tag, currentValue) ) {
+
+ // open new BamWriter, save first alignment
+ writer = new BamWriter;
+ outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
+ writer->Open(outputFilenameStream.str(), m_header, m_references);
+ writer->SaveAlignment(al);
+
+ // store in map
+ outputFiles.insert( make_pair(currentValue, writer) );
+
+ // reset stream
+ outputFilenameStream.str("");
+ }
+
+ // iterate through remaining alignments
+ while ( m_reader.GetNextAlignment(al) ) {
+
+ // skip if this alignment doesn't have TAG
+ if ( !al.GetTag(tag, currentValue) ) continue;
+
+ // look up tag value in map
+ writerIter = outputFiles.find(currentValue);
+
+ // if no writer associated with this value
+ if ( writerIter == outputFiles.end() ) {
+
+ // open new BamWriter
+ writer = new BamWriter;
+ outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
+ writer->Open(outputFilenameStream.str(), m_header, m_references);
+
+ // store in map
+ outputFiles.insert( make_pair(currentValue, writer) );
+
+ // reset stream
+ outputFilenameStream.str("");
+ }
+
+ // else grab corresponding writer
+ else writer = (*writerIter).second;
+
+ // store alignment in proper BAM output file
+ if ( writer )
+ writer->SaveAlignment(al);
+ }
+
+ // clean up BamWriters
+ map<float, BamWriter*>::iterator writerEnd = outputFiles.end();
+ for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
+ BamWriter* writer = (*writerIter).second;
+ if (writer == 0 ) continue;
+ writer->Close();
+ delete writer;
+ writer = 0;
+ }
+
+ // return success
+ return true;
+}
+
+bool SplitTool::SplitToolPrivate::SplitTag_String(BamAlignment& al) {
+
+ // set up splitting data structure
+ map<string, BamWriter*> outputFiles;
+ map<string, BamWriter*>::iterator writerIter;
+
+ // local variables
+ const string tag = m_settings->TagToSplit;
+ BamWriter* writer;
+ stringstream outputFilenameStream("");
+ string currentValue;
+
+ // retrieve first alignment tag value
+ if ( al.GetTag(tag, currentValue) ) {
+
+ // open new BamWriter, save first alignment
+ writer = new BamWriter;
+ outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
+ writer->Open(outputFilenameStream.str(), m_header, m_references);
+ writer->SaveAlignment(al);
+
+ // store in map
+ outputFiles.insert( make_pair(currentValue, writer) );
+
+ // reset stream
+ outputFilenameStream.str("");
+ }
+
+ // iterate through remaining alignments
+ while ( m_reader.GetNextAlignment(al) ) {
+
+ // skip if this alignment doesn't have TAG
+ if ( !al.GetTag(tag, currentValue) ) continue;
+
+ // look up tag value in map
+ writerIter = outputFiles.find(currentValue);
+
+ // if no writer associated with this value
+ if ( writerIter == outputFiles.end() ) {
+
+ // open new BamWriter
+ writer = new BamWriter;
+ outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
+ writer->Open(outputFilenameStream.str(), m_header, m_references);
+
+ // store in map
+ outputFiles.insert( make_pair(currentValue, writer) );
+
+ // reset stream
+ outputFilenameStream.str("");
+ }
+
+ // else grab corresponding writer
+ else writer = (*writerIter).second;
+
+ // store alignment in proper BAM output file
+ if ( writer )
+ writer->SaveAlignment(al);
+ }
+
+ // clean up BamWriters
+ map<string, BamWriter*>::iterator writerEnd = outputFiles.end();
+ for ( writerIter = outputFiles.begin(); writerIter != writerEnd; ++writerIter ) {
+ BamWriter* writer = (*writerIter).second;
+ if (writer == 0 ) continue;
+ writer->Close();
+ delete writer;
+ writer = 0;
+ }
+
+ // return success
+ return true;
+}
+
+// ---------------------------------------------
+// SplitTool implementation
+
+SplitTool::SplitTool(void)
+ : AbstractTool()
+ , m_settings(new SplitSettings)
+ , m_impl(0)
+{
+ // set program details
+ Options::SetProgramInfo("bamtools split", "splits a BAM file on user-specified property, creating a new BAM output file for each value found", "[-in <filename>] [-stub <filename>] < -mapped | -paired | -reference | -tag <TAG> > ");
+
+ // set up options
+ OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+ Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
+ Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts);
+
+ OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options");
+ Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped, SplitOpts);
+ Options::AddOption("-paired", "split single-end/paired-end alignments", m_settings->IsSplittingPaired, SplitOpts);
+ Options::AddOption("-reference", "split alignments by reference", m_settings->IsSplittingReference, SplitOpts);
+ Options::AddValueOption("-tag", "tag name", "splits alignments based on all values of TAG encountered (i.e. -tag RG creates a BAM file for each read group in original BAM file)", "",
+ m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts);
+}
+
+SplitTool::~SplitTool(void) {
+
+ delete m_settings;
+ m_settings = 0;
+
+ delete m_impl;
+ m_impl = 0;
+}
+
+int SplitTool::Help(void) {
+ Options::DisplayHelp();
+ return 0;
+}
+
+int SplitTool::Run(int argc, char* argv[]) {
+
+ // parse command line arguments
+ Options::Parse(argc, argv, 1);
+
+ // initialize internal implementation
+ m_impl = new SplitToolPrivate(m_settings);
+
+ // run tool, return success/fail
+ if ( m_impl->Run() )
+ return 0;
+ else
+ return 1;
+}
--- /dev/null
+// ***************************************************************************
+// bamtools_split.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 18 September 2010 (DB)
+// ---------------------------------------------------------------------------
+//
+// ***************************************************************************
+
+#ifndef BAMTOOLS_SPLIT_H
+#define BAMTOOLS_SPLIT_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+class SplitTool : public AbstractTool {
+
+ public:
+ SplitTool(void);
+ ~SplitTool(void);
+
+ public:
+ int Help(void);
+ int Run(int argc, char* argv[]);
+
+ private:
+ struct SplitSettings;
+ SplitSettings* m_settings;
+
+ struct SplitToolPrivate;
+ SplitToolPrivate* m_impl;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_SPLIT_H
\ No newline at end of file
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 30 August 2010
+// Last modified: 18 September 2010
// ---------------------------------------------------------------------------
//
// ***************************************************************************
#include <iostream>
#include "bamtools_filter_engine.h"
-#include "BamAux.h"
using namespace std;
using namespace BamTools;
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 16 September 2010
+// Last modified: 18 September 2010
// ---------------------------------------------------------------------------
// Provides pileup at position functionality for various tools.
// ***************************************************************************
#define BAMTOOLS_PILEUP_ENGINE_H
#include <vector>
-#include "BamAux.h"
+#include "BamAlignment.h"
namespace BamTools {