From 049d6b2501ea9c9afeb8588013d5f536246a2ca8 Mon Sep 17 00:00:00 2001 From: Derek Date: Sat, 18 Sep 2010 16:43:37 -0400 Subject: [PATCH] Moved BamAlignment data structure out to its own .h/.cpp. BamAux.h was getting over-crowded. *NOTE - This means that if you were using the BamAlignment data structure in code without a reader/writer, you need to include BamAlignment.h instead of BamAux.h. If your code was using reader/writer, no changes should be necessary on your end. --- src/api/BamAlignment.cpp | 645 ++++++++++++++++++++++++++++ src/api/BamAlignment.h | 191 +++++++++ src/api/BamAux.h | 832 +----------------------------------- src/api/BamIndex.h | 4 +- src/api/BamMultiReader.cpp | 22 +- src/api/BamMultiReader.cpp~ | 450 +++++++++++++++++++ src/api/BamMultiReader.h | 25 +- src/api/BamReader.h | 14 +- src/api/BamWriter.h | 7 +- 9 files changed, 1332 insertions(+), 858 deletions(-) create mode 100644 src/api/BamAlignment.cpp create mode 100644 src/api/BamAlignment.h create mode 100644 src/api/BamMultiReader.cpp~ diff --git a/src/api/BamAlignment.cpp b/src/api/BamAlignment.cpp new file mode 100644 index 0000000..3dbc7e9 --- /dev/null +++ b/src/api/BamAlignment.cpp @@ -0,0 +1,645 @@ +// *************************************************************************** +// BamAlignment.cpp (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 September 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the BamAlignment data structure +// *************************************************************************** + +#include +#include +#include +#include +#include +#include +#include +#include "BamAlignment.h" +using namespace BamTools; + +// default ctor +BamAlignment::BamAlignment(void) + : RefID(-1) + , Position(-1) + , MateRefID(-1) + , MatePosition(-1) + , InsertSize(0) +{ } + +// copy ctor +BamAlignment::BamAlignment(const BamAlignment& other) + : Name(other.Name) + , Length(other.Length) + , QueryBases(other.QueryBases) + , AlignedBases(other.AlignedBases) + , Qualities(other.Qualities) + , TagData(other.TagData) + , RefID(other.RefID) + , Position(other.Position) + , Bin(other.Bin) + , MapQuality(other.MapQuality) + , AlignmentFlag(other.AlignmentFlag) + , CigarData(other.CigarData) + , MateRefID(other.MateRefID) + , MatePosition(other.MatePosition) + , InsertSize(other.InsertSize) + , SupportData(other.SupportData) +{ } + +// dtor +BamAlignment::~BamAlignment(void) { } + +// Queries against alignment flags +bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } +bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } +bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } +bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } +bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } +bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } +bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } +bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } +bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } +bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } +bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } + +// Manipulate alignment flags +void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; } +void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; } +void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; } +void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; } +void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; } +void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; } +void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; } +void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; } +void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; } +void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; } +void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; } + +// calculates alignment end position, based on starting position and CIGAR operations +int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { + + // initialize alignment end to starting position + int alignEnd = Position; + + // iterate over cigar operations + std::vector::const_iterator cigarIter = CigarData.begin(); + std::vector::const_iterator cigarEnd = CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + const char cigarType = (*cigarIter).Type; + if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) + alignEnd += (*cigarIter).Length; + else if ( usePadded && cigarType == 'I' ) + alignEnd += (*cigarIter).Length; + } + + // adjust for zeroBased, if necessary + if (zeroBased) + return alignEnd - 1; + else + return alignEnd; +} + +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type != "Z" && type != "H" ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; + + // otherwise, copy tag data to temp buffer + std::string newTag = tag + type + value; + const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "f" || type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; + + // otherwise, convert value to string + union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; + un.value = value; + + // copy original tag data to temp buffer + std::string newTag = tag + type; + const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); + memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int)); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) { + return AddTag(tag, type, (const uint32_t&)value); +} + +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; + + // otherwise, convert value to string + union { float value; char valueBuffer[sizeof(float)]; } un; + un.value = value; + + // copy original tag data to temp buffer + std::string newTag = tag + type; + const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); + memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type != "Z" && type != "H" ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + value.size()]; + + // copy original tag data up til desired tag + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new VALUE in place of current tag data + const unsigned int dataLength = strlen(value.c_str()); + memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 ); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1; + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; + } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} + +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "f" || type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + sizeof(value)]; + + // copy original tag data up til desired tag + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new VALUE in place of current tag data + union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; + un.value = value; + memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int)); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; + } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} + +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) { + return EditTag(tag, type, (const uint32_t&)value); +} + +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + sizeof(value)]; + + // copy original tag data up til desired tag + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new VALUE in place of current tag data + union { float value; char valueBuffer[sizeof(float)]; } un; + un.value = value; + memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + sizeof(float); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; + } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} + +// get "NM" tag data - originally contributed by Aaron Quinlan +// stores data in 'editDistance', returns success/fail +bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { + return GetTag("NM", (uint32_t&)editDistance); +} + +// get "RG" tag data +// stores data in 'readGroup', returns success/fail +bool BamAlignment::GetReadGroup(std::string& readGroup) const { + return GetTag("RG", readGroup); +} + +bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + const unsigned int dataLength = strlen(pTagData); + destination.clear(); + destination.resize(dataLength); + memcpy( (char*)destination.data(), pTagData, dataLength ); + return true; + } + + // tag not found, return failure + return false; +} + +bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag found, determine data byte-length, store data in readGroup, return success + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + + // determine data byte-length + const char type = *(pTagData - 1); + int destinationLength = 0; + switch (type) { + // 1 byte data + case 'A': + case 'c': + case 'C': + destinationLength = 1; + break; + + // 2 byte data + case 's': + case 'S': + destinationLength = 2; + break; + + // 4 byte data + case 'i': + case 'I': + destinationLength = 4; + break; + + // unsupported type for integer destination (float or var-length strings) + case 'f': + case 'Z': + case 'H': + fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type); + return false; + + // unknown tag type + default: + fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); + return false; + } + + // store in destination + destination = 0; + memcpy(&destination, pTagData, destinationLength); + return true; + } + + // tag not found, return failure + return false; +} + +bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const { + return GetTag(tag, (uint32_t&)destination); +} + +bool BamAlignment::GetTag(const std::string& tag, float& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag found, determine data byte-length, store data in readGroup, return success + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + //pTagData += numBytesParsed; + + // determine data byte-length + const char type = *(pTagData - 1); + int destinationLength = 0; + switch(type) { + + // 1 byte data + case 'A': + case 'c': + case 'C': + destinationLength = 1; + break; + + // 2 byte data + case 's': + case 'S': + destinationLength = 2; + break; + + // 4 byte data + case 'f': + case 'i': + case 'I': + destinationLength = 4; + break; + + // unsupported type (var-length strings) + case 'Z': + case 'H': + fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type); + return false; + + // unknown tag type + default: + fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); + return false; + } + + // store in destination + destination = 0.0; + memcpy(&destination, pTagData, destinationLength); + return true; + } + + // tag not found, return failure + return false; +} + +bool BamAlignment::RemoveTag(const std::string& tag) { + + // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed + // also, return false if no data present to remove + if ( SupportData.HasCoreOnly || TagData.empty() ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + char newTagData[originalTagDataLength]; + + // copy original tag data up til desired tag + pTagData -= 3; + numBytesParsed -= 3; + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); + + // save new tag data + TagData.assign(newTagData, beginningTagDataLength + endTagDataLength); + return true; + } + + // tag not found, no removal - return failure + return false; +} + +bool BamAlignment::FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) { + + while ( numBytesParsed < tagDataLength ) { + + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + + // check the current tag, return true on match + if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 ) + return true; + + // get the storage class and find the next tag + if ( *pTagStorageType == '\0' ) return false; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; + if ( *pTagData == '\0' ) return false; + } + + // checked all tags, none match + return false; +} + +bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { + + switch(storageType) { + + case 'A': + case 'c': + case 'C': + ++numBytesParsed; + ++pTagData; + break; + + case 's': + case 'S': + numBytesParsed += 2; + pTagData += 2; + break; + + case 'f': + case 'i': + case 'I': + numBytesParsed += 4; + pTagData += 4; + break; + + case 'Z': + case 'H': + while(*pTagData) { + ++numBytesParsed; + ++pTagData; + } + // increment for null-terminator + ++numBytesParsed; + ++pTagData; + break; + + default: + // error case + fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType); + return false; + } + + // return success + return true; +} \ No newline at end of file diff --git a/src/api/BamAlignment.h b/src/api/BamAlignment.h new file mode 100644 index 0000000..ac50552 --- /dev/null +++ b/src/api/BamAlignment.h @@ -0,0 +1,191 @@ +// *************************************************************************** +// BamAlignment.h (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 September 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the BamAlignment data structure +// *************************************************************************** + +#ifndef BAMALIGNMENT_H +#define BAMALIGNMENT_H + +#include +#include +#include "BamAux.h" + +namespace BamTools { + +// BamAlignment data structure +// explicitly labeled as 'struct' to indicate that (most of) its fields are public +struct BamAlignment { + + // constructors & destructor + public: + BamAlignment(void); + BamAlignment(const BamAlignment& other); + ~BamAlignment(void); + + // Queries against alignment flags + public: + bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate + bool IsFailedQC(void) const; // Returns true if this read failed quality control + bool IsFirstMate(void) const; // Returns true if alignment is first mate on read + bool IsMapped(void) const; // Returns true if alignment is mapped + bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped + bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand + bool IsPaired(void) const; // Returns true if alignment part of paired-end read + bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment + bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution + bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand + bool IsSecondMate(void) const; // Returns true if alignment is second mate on read + + // Manipulate alignment flags + public: + void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag + void SetIsFailedQC(bool ok); // Sets "failed quality control" flag + void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag + void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is mapped" flag + void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag + void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag + void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag + void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag + void SetIsSecondaryAlignment(bool ok); // Sets "position is primary alignment" flag + void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag + void SetIsUnmapped(bool ok); // Sets "alignment is mapped" flag + + // Tag data access methods + public: + // ------------------------------------------------------------------------------------- + // N.B. - The following tag access methods may not be used on BamAlignments fetched + // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in + // error message (to keep output clean) but will ALWAYS return false. Only user-created + // BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid here. + + // add tag data (create new TAG entry with TYPE and VALUE) + // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details + // returns true if new data added, false if error or TAG already exists + // N.B. - will NOT modify existing tag. Use EditTag() instead + // @tag - two character tag name + // @type - single character tag type (see SAM/BAM spec for details) + // @value - value to associate with tag + bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H + bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i + bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i + bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f + + // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present) + // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details + // returns true if edit was successfaul, false if error + // @tag - two character tag name + // @type - single character tag type (see SAM/BAM spec for details) + // @value - new value for tag + bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H + bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i + bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i + bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f + + // specific tag data access methods - these only remain for legacy support + // returns whether specific tag could be retrieved + bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (equivalent to GetTag("NM", editDistance)) + bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (equivalent to GetTag("RG", readGroup)) + + // generic tag data access methods + // returns whether tag is found & tag type is compatible with DESTINATION + // @tag - two character tag name + // @destination - if found, tag value is stored here + bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings + bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data + bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data + bool GetTag(const std::string& tag, float& destination) const; // access floating point data + + // remove tag data + // returns true if removal was successful, false if error + // N.B. - returns false if TAG does not exist (no removal can occur) + // @tag - two character tag name + bool RemoveTag(const std::string& tag); + + // Additional data access methods + public: + // calculates & returns alignment end position, based on starting position and CIGAR operations + // @usePadded - if true, counts inserted bases. Default is false, so that alignment end position matches the last base's position in reference + // @zeroBased - if true, returns 0-based coordinate; else returns 1-based. Setting this to false is useful when using BAM data along with other, half-open formats. + int GetEndPosition(bool usePadded = false, bool zeroBased = true) const; + + // 'internal' utility methods + private: + static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed); + static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed); + + // Data members + public: + std::string Name; // Read name + int32_t Length; // Query length + std::string QueryBases; // 'Original' sequence (as reported from sequencing machine) + std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping) + std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values) + std::string TagData; // Tag data (accessor methods will pull the requested information out) + int32_t RefID; // ID number for reference sequence + int32_t Position; // Position (0-based) where alignment starts + uint16_t Bin; // Bin in BAM file where this alignment resides + uint16_t MapQuality; // Mapping quality score + uint32_t AlignmentFlag; // Alignment bit-flag - see Is() methods to query this value, SetIs() methods to manipulate + std::vector CigarData; // CIGAR operations for this alignment + int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned + int32_t MatePosition; // Position (0-based) where alignment's mate starts + int32_t InsertSize; // Mate-pair insert size + + // internal data + private: + struct BamAlignmentSupportData { + + // data members + std::string AllCharData; + uint32_t BlockLength; + uint32_t NumCigarOperations; + uint32_t QueryNameLength; + uint32_t QuerySequenceLength; + bool HasCoreOnly; + + // constructor + BamAlignmentSupportData(void) + : BlockLength(0) + , NumCigarOperations(0) + , QueryNameLength(0) + , QuerySequenceLength(0) + , HasCoreOnly(false) + { } + }; + + // contains raw character data & lengths + BamAlignmentSupportData SupportData; + + // allow these classes access to BamAlignment private members (SupportData) + // but client code should not need to touch this data + friend class BamReader; + friend class BamWriter; + + // Alignment flag query constants + // Use the get/set methods above instead + private: + enum { PAIRED = 1 + , PROPER_PAIR = 2 + , UNMAPPED = 4 + , MATE_UNMAPPED = 8 + , REVERSE = 16 + , MATE_REVERSE = 32 + , READ_1 = 64 + , READ_2 = 128 + , SECONDARY = 256 + , QC_FAILED = 512 + , DUPLICATE = 1024 + }; +}; + +// convenience typedef(s) +typedef std::vector BamAlignmentVector; + +} // namespace BamTools + +#endif // BAMALIGNMENT_H diff --git a/src/api/BamAux.h b/src/api/BamAux.h index 84223a6..3eff2d7 100644 --- a/src/api/BamAux.h +++ b/src/api/BamAux.h @@ -3,30 +3,24 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 16 September 2010 (DB) +// Last modified: 18 September 2010 (DB) // --------------------------------------------------------------------------- -// Provides the basic constants, data structures, etc. for using BAM files +// Provides the basic constants, data structures, utilities etc. +// used throughout the API for handling BAM files // *************************************************************************** #ifndef BAMAUX_H #define BAMAUX_H -// C inclues -#include -#include -#include -#include - -// C++ includes -#include -#include +#include #include -#include #include -#include #include +// ---------------------------------------------------------------- +// ---------------------------------------------------------------- // Platform-specific type definitions + #ifndef BAMTOOLS_TYPES #define BAMTOOLS_TYPES #ifdef _MSC_VER @@ -45,7 +39,10 @@ namespace BamTools { +// ---------------------------------------------------------------- +// ---------------------------------------------------------------- // BAM constants + const int BAM_CMATCH = 0; const int BAM_CINS = 1; const int BAM_CDEL = 2; @@ -58,163 +55,11 @@ const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); const int BAM_CORE_SIZE = 32; const int BT_SIZEOF_INT = 4; -struct CigarOp; - -struct BamAlignment { - - // constructors & destructor - public: - BamAlignment(void); - BamAlignment(const BamAlignment& other); - ~BamAlignment(void); - - // Queries against alignment flags - public: - bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate - bool IsFailedQC(void) const; // Returns true if this read failed quality control - bool IsFirstMate(void) const; // Returns true if alignment is first mate on read - bool IsMapped(void) const; // Returns true if alignment is mapped - bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped - bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand - bool IsPaired(void) const; // Returns true if alignment part of paired-end read - bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment - bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution - bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand - bool IsSecondMate(void) const; // Returns true if alignment is second mate on read - - // Manipulate alignment flags - public: - void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag - void SetIsFailedQC(bool ok); // Sets "failed quality control" flag - void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag - void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is mapped" flag - void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag - void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag - void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag - void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag - void SetIsSecondaryAlignment(bool ok); // Sets "position is primary alignment" flag - void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag - void SetIsUnmapped(bool ok); // Sets "alignment is mapped" flag - - // Tag data access methods - public: - // ------------------------------------------------------------------------------------- - // N.B. - The following tag-modifying methods may not be used on BamAlignments fetched - // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in - // error message (to keep output clean) but will ALWAYS return false. Only user- - // generated BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid. - - // add tag data (create new TAG entry with TYPE and VALUE) - // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details - // returns true if new data added, false if error or TAG already exists - // N.B. - will NOT modify existing tag. Use EditTag() instead - bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H - bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i - bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i - bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f - - // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present) - // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details - // returns true if edit was successfaul, false if error - bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H - bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i - bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i - bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f - - // specific tag data access methods - these only remain for legacy support - bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (implemented as GetTag("NM", editDistance)) - bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (implemented as GetTag("RG", readGroup)) - - // generic tag data access methods - bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings - bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data - bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data - bool GetTag(const std::string& tag, float& destination) const; // access floating point data - - // remove tag data - // returns true if removal was successful, false if error - // N.B. - returns false if TAG does not exist (no removal can occur) - bool RemoveTag(const std::string& tag); - - // Additional data access methods - public: - // calculates alignment end position, based on starting position and CIGAR operations - // @zeroBased - if true, returns 0-based coordinate; else returns 1-based - int GetEndPosition(bool usePadded = false, bool zeroBased = true) const; - - // 'internal' utility methods - private: - static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed); - static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed); - - // Data members - public: - std::string Name; // Read name - int32_t Length; // Query length - std::string QueryBases; // 'Original' sequence (as reported from sequencing machine) - std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping) - std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values) - std::string TagData; // Tag data (accessor methods will pull the requested information out) - int32_t RefID; // ID number for reference sequence - int32_t Position; // Position (0-based) where alignment starts - uint16_t Bin; // Bin in BAM file where this alignment resides - uint16_t MapQuality; // Mapping quality score - uint32_t AlignmentFlag; // Alignment bit-flag - see Is() methods to query this value, SetIs() methods to manipulate - std::vector CigarData; // CIGAR operations for this alignment - int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned - int32_t MatePosition; // Position (0-based) where alignment's mate starts - int32_t InsertSize; // Mate-pair insert size - - // internal data - private: - struct BamAlignmentSupportData { - - // data members - std::string AllCharData; - uint32_t BlockLength; - uint32_t NumCigarOperations; - uint32_t QueryNameLength; - uint32_t QuerySequenceLength; - bool HasCoreOnly; - - // constructor - BamAlignmentSupportData(void) - : BlockLength(0) - , NumCigarOperations(0) - , QueryNameLength(0) - , QuerySequenceLength(0) - , HasCoreOnly(false) - { } - }; - - // contains raw character data & lengths - BamAlignmentSupportData SupportData; - - // allow these classes access to BamAlignment private members (SupportData) - // but client code should not need to touch this data - friend class BamReader; - friend class BamWriter; - - // Alignment flag query constants - // Use the get/set methods above instead - private: - enum { PAIRED = 1 - , PROPER_PAIR = 2 - , UNMAPPED = 4 - , MATE_UNMAPPED = 8 - , REVERSE = 16 - , MATE_REVERSE = 32 - , READ_1 = 64 - , READ_2 = 128 - , SECONDARY = 256 - , QC_FAILED = 512 - , DUPLICATE = 1024 - }; -}; - // ---------------------------------------------------------------- -// Auxiliary data structs & typedefs +// ---------------------------------------------------------------- +// Data structs & typedefs +// CIGAR operation data structure struct CigarOp { // data members @@ -229,6 +74,7 @@ struct CigarOp { { } }; +// Reference data entry struct RefData { // data members @@ -243,10 +89,9 @@ struct RefData { , RefHasAlignments(ok) { } }; +typedef std::vector RefVector; -typedef std::vector RefVector; -typedef std::vector BamAlignmentVector; - +// General (sequential) genome region struct BamRegion { // data members @@ -274,9 +119,8 @@ struct BamRegion { }; // ---------------------------------------------------------------- -// Added: 3-35-2010 DWB -// Fixed: Routines to provide endian-correctness // ---------------------------------------------------------------- +// General utilities // returns true if system is big endian inline bool SystemIsBigEndian(void) { @@ -353,652 +197,12 @@ inline void SwapEndian_64p(char* data) { SwapEndian_64(value); } +// returns whether file exists (can be opened OK) inline bool FileExists(const std::string& filename) { std::ifstream f(filename.c_str(), std::ifstream::in); return !f.fail(); } -// ---------------------------------------------------------------- -// BamAlignment member methods - -// constructors & destructor -inline BamAlignment::BamAlignment(void) { } - -inline BamAlignment::BamAlignment(const BamAlignment& other) - : Name(other.Name) - , Length(other.Length) - , QueryBases(other.QueryBases) - , AlignedBases(other.AlignedBases) - , Qualities(other.Qualities) - , TagData(other.TagData) - , RefID(other.RefID) - , Position(other.Position) - , Bin(other.Bin) - , MapQuality(other.MapQuality) - , AlignmentFlag(other.AlignmentFlag) - , CigarData(other.CigarData) - , MateRefID(other.MateRefID) - , MatePosition(other.MatePosition) - , InsertSize(other.InsertSize) - , SupportData(other.SupportData) -{ } - -inline BamAlignment::~BamAlignment(void) { } - -// Queries against alignment flags -inline bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } -inline bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } -inline bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } -inline bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } -inline bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } -inline bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } -inline bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } -inline bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } -inline bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } -inline bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } -inline bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } - -// Manipulate alignment flags -inline void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; } -inline void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; } -inline void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; } -inline void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; } -inline void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; } -inline void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; } -inline void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; } -inline void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; } -inline void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; } -inline void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; } -inline void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; } - -// calculates alignment end position, based on starting position and CIGAR operations -inline -int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { - - // initialize alignment end to starting position - int alignEnd = Position; - - // iterate over cigar operations - std::vector::const_iterator cigarIter = CigarData.begin(); - std::vector::const_iterator cigarEnd = CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter) { - const char cigarType = (*cigarIter).Type; - if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) { - alignEnd += (*cigarIter).Length; - } - else if ( usePadded && cigarType == 'I' ) { - alignEnd += (*cigarIter).Length; - } - } - - // adjust for zeroBased, if necessary - if (zeroBased) - return alignEnd - 1; - else - return alignEnd; -} - -inline -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type != "Z" && type != "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, copy tag data to temp buffer - std::string newTag = tag + type + value; - const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -inline -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "f" || type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, convert value to string - union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; - un.value = value; - - // copy original tag data to temp buffer - std::string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -inline -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) { - return AddTag(tag, type, (const uint32_t&)value); -} - -inline -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, convert value to string - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - - // copy original tag data to temp buffer - std::string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -inline -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type != "Z" && type != "H" ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + value.size()]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - const unsigned int dataLength = strlen(value.c_str()); - memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 ); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1; - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -inline -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "f" || type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; - un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int)); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -inline -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) { - return EditTag(tag, type, (const uint32_t&)value); -} - -inline -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(float); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -// get "NM" tag data - originally contributed by Aaron Quinlan -// stores data in 'editDistance', returns success/fail -inline -bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { - return GetTag("NM", (uint32_t&)editDistance); -} - -// get "RG" tag data -// stores data in 'readGroup', returns success/fail -inline -bool BamAlignment::GetReadGroup(std::string& readGroup) const { - return GetTag("RG", readGroup); -} - -inline -bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - const unsigned int dataLength = strlen(pTagData); - destination.clear(); - destination.resize(dataLength); - memcpy( (char*)destination.data(), pTagData, dataLength ); - return true; - } - - // tag not found, return failure - return false; -} - -inline -bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, determine data byte-length, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - - // determine data byte-length - const char type = *(pTagData - 1); - int destinationLength = 0; - switch (type) { - // 1 byte data - case 'A': - case 'c': - case 'C': - destinationLength = 1; - break; - - // 2 byte data - case 's': - case 'S': - destinationLength = 2; - break; - - // 4 byte data - case 'i': - case 'I': - destinationLength = 4; - break; - - // unsupported type for integer destination (float or var-length strings) - case 'f': - case 'Z': - case 'H': - fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type); - return false; - - // unknown tag type - default: - fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); - return false; - } - - // store in destination - destination = 0; - memcpy(&destination, pTagData, destinationLength); - return true; - } - - // tag not found, return failure - return false; -} - -inline -bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const { - return GetTag(tag, (uint32_t&)destination); -} - -inline -bool BamAlignment::GetTag(const std::string& tag, float& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, determine data byte-length, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - //pTagData += numBytesParsed; - - // determine data byte-length - const char type = *(pTagData - 1); - int destinationLength = 0; - switch(type) { - - // 1 byte data - case 'A': - case 'c': - case 'C': - destinationLength = 1; - break; - - // 2 byte data - case 's': - case 'S': - destinationLength = 2; - break; - - // 4 byte data - case 'f': - case 'i': - case 'I': - destinationLength = 4; - break; - - // unsupported type (var-length strings) - case 'Z': - case 'H': - fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type); - return false; - - // unknown tag type - default: - fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); - return false; - } - - // store in destination - destination = 0.0; - memcpy(&destination, pTagData, destinationLength); - return true; - } - - // tag not found, return failure - return false; -} - -inline -bool BamAlignment::RemoveTag(const std::string& tag) { - - // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed - // also, return false if no data present to remove - if ( SupportData.HasCoreOnly || TagData.empty() ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - char newTagData[originalTagDataLength]; - - // copy original tag data up til desired tag - pTagData -= 3; - numBytesParsed -= 3; - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); - - // save new tag data - TagData.assign(newTagData, beginningTagDataLength + endTagDataLength); - return true; - } - - // tag not found, no removal - return failure - return false; -} - -inline -bool BamAlignment::FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) { - - while ( numBytesParsed < tagDataLength ) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - - // check the current tag, return true on match - if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 ) - return true; - - // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; - if ( *pTagData == '\0' ) return false; - } - - // checked all tags, none match - return false; -} - -inline -bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { - - switch(storageType) { - - case 'A': - case 'c': - case 'C': - ++numBytesParsed; - ++pTagData; - break; - - case 's': - case 'S': - numBytesParsed += 2; - pTagData += 2; - break; - - case 'f': - case 'i': - case 'I': - numBytesParsed += 4; - pTagData += 4; - break; - - case 'Z': - case 'H': - while(*pTagData) { - ++numBytesParsed; - ++pTagData; - } - // increment for null-terminator - ++numBytesParsed; - ++pTagData; - break; - - default: - // error case - fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType); - return false; - } - - // return success - return true; -} - } // namespace BamTools #endif // BAMAUX_H diff --git a/src/api/BamIndex.h b/src/api/BamIndex.h index fdd0d13..710893a 100644 --- a/src/api/BamIndex.h +++ b/src/api/BamIndex.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 17 September 2010 (DB) +// Last modified: 18 September 2010 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the standardized BAM index format // (".bai") as well as a BamTools-specific (nonstandard) index format (".bti"). @@ -15,7 +15,7 @@ #include #include #include -#include "BamAux.h" +#include "BamAlignment.h" namespace BamTools { diff --git a/src/api/BamMultiReader.cpp b/src/api/BamMultiReader.cpp index 8590516..a81ac87 100644 --- a/src/api/BamMultiReader.cpp +++ b/src/api/BamMultiReader.cpp @@ -1,9 +1,9 @@ // *************************************************************************** -// BamMultiReader.cpp (c) 2010 Erik Garrison +// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 3 September 2010 (DB) +// Last modified: 18 September 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -16,7 +16,6 @@ // precludes the need to sort merged files. // *************************************************************************** -// C++ includes #include #include #include @@ -24,8 +23,6 @@ #include #include #include - -// BamTools includes #include "BGZF.h" #include "BamMultiReader.h" using namespace BamTools; @@ -74,11 +71,11 @@ void BamMultiReader::Close(void) { } // saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail -bool BamMultiReader::CreateIndexes(bool useDefaultIndex) { +bool BamMultiReader::CreateIndexes(bool useStandardIndex) { bool result = true; for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { BamReader* reader = it->first; - result &= reader->CreateIndex(useDefaultIndex); + result &= reader->CreateIndex(useStandardIndex); } return result; } @@ -99,11 +96,12 @@ const string BamMultiReader::GetHeaderText(void) const { // foreach extraction entry (each BAM file) for (vector >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) { - map currentFileReadGroups; - BamReader* reader = rs->first; - - stringstream header(reader->GetHeaderText()); + string headerText = reader->GetHeaderText(); + if ( headerText.empty() ) continue; + + map currentFileReadGroups; + stringstream header(headerText); vector lines; string item; while (getline(header, item)) @@ -290,7 +288,7 @@ bool BamMultiReader::Jump(int refID, int position) { } // opens BAM files -bool BamMultiReader::Open(const vector filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) { +bool BamMultiReader::Open(const vector& filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) { // for filename in filenames fileNames = filenames; // save filenames in our multireader diff --git a/src/api/BamMultiReader.cpp~ b/src/api/BamMultiReader.cpp~ new file mode 100644 index 0000000..b5c4002 --- /dev/null +++ b/src/api/BamMultiReader.cpp~ @@ -0,0 +1,450 @@ +// *************************************************************************** +// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 September 2010 (DB) +// --------------------------------------------------------------------------- +// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- +// Functionality for simultaneously reading multiple BAM files. +// +// This functionality allows applications to work on very large sets of files +// without requiring intermediate merge, sort, and index steps for each file +// subset. It also improves the performance of our merge system as it +// precludes the need to sort merged files. +// *************************************************************************** + +#include +#include +#include +#include +#include +#include +#include +#include "BGZF.h" +#include "BamMultiReader.h" +using namespace BamTools; +using namespace std; + +// ----------------------------------------------------- +// BamMultiReader implementation +// ----------------------------------------------------- + +// constructor +BamMultiReader::BamMultiReader(void) + : CurrentRefID(0) + , CurrentLeft(0) +{ } + +// destructor +BamMultiReader::~BamMultiReader(void) { + Close(); +} + +// close the BAM files +void BamMultiReader::Close(void) { + + // close all BAM readers and clean up pointers + vector >::iterator readerIter = readers.begin(); + vector >::iterator readerEnd = readers.end(); + for ( ; readerIter != readerEnd; ++readerIter) { + + BamReader* reader = (*readerIter).first; + BamAlignment* alignment = (*readerIter).second; + + // close the reader + if ( reader) reader->Close(); + + // delete reader pointer + delete reader; + reader = 0; + + // delete alignment pointer + delete alignment; + alignment = 0; + } + + // clear out the container + readers.clear(); +} + +// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail +bool BamMultiReader::CreateIndexes(bool useStandardIndex) { + bool result = true; + for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* reader = it->first; + result &= reader->CreateIndex(useStandardIndex); + } + return result; +} + +// for debugging +void BamMultiReader::DumpAlignmentIndex(void) { + for (AlignmentIndex::const_iterator it = alignments.begin(); it != alignments.end(); ++it) { + cerr << it->first.first << ":" << it->first.second << " " << it->second.first->GetFilename() << endl; + } +} + +// makes a virtual, unified header for all the bam files in the multireader +const string BamMultiReader::GetHeaderText(void) const { + + string mergedHeader = ""; + map readGroups; + + // foreach extraction entry (each BAM file) + for (vector >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) { + + BamReader* reader = rs->first; + const string headerText = reader->GetHeaderText(); + if ( headerText.empty() ) continue; + + map currentFileReadGroups; + stringstream header(headerText); + vector lines; + string item; + while (getline(header, item)) + lines.push_back(item); + + for (vector::const_iterator it = lines.begin(); it != lines.end(); ++it) { + + // get next line from header, skip if empty + string headerLine = *it; + if ( headerLine.empty() ) { continue; } + + // if first file, save HD & SQ entries + if ( rs == readers.begin() ) { + if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { + mergedHeader.append(headerLine.c_str()); + mergedHeader.append(1, '\n'); + } + } + + // (for all files) append RG entries if they are unique + if ( headerLine.find("@RG") == 0 ) { + stringstream headerLineSs(headerLine); + string part, readGroupPart, readGroup; + while(std::getline(headerLineSs, part, '\t')) { + stringstream partSs(part); + string subtag; + std::getline(partSs, subtag, ':'); + if (subtag == "ID") { + std::getline(partSs, readGroup, ':'); + break; + } + } + if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries + mergedHeader.append(headerLine.c_str() ); + mergedHeader.append(1, '\n'); + readGroups[readGroup] = true; + currentFileReadGroups[readGroup] = true; + } else { + // warn iff we are reading one file and discover duplicated @RG tags in the header + // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags + if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) { + cerr << "WARNING: duplicate @RG tag " << readGroup + << " entry in header of " << reader->GetFilename() << endl; + } + } + } + } + } + + // return merged header text + return mergedHeader; +} + +// get next alignment among all files +bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { + + // bail out if we are at EOF in all files, means no more alignments to process + if (!HasOpenReaders()) + return false; + + // when all alignments have stepped into a new target sequence, update our + // current reference sequence id + UpdateReferenceID(); + + // our lowest alignment and reader will be at the front of our alignment index + BamAlignment* alignment = alignments.begin()->second.second; + BamReader* reader = alignments.begin()->second.first; + + // now that we have the lowest alignment in the set, save it by copy to our argument + nextAlignment = BamAlignment(*alignment); + + // remove this alignment index entry from our alignment index + alignments.erase(alignments.begin()); + + // and add another entry if we can get another alignment from the reader + if (reader->GetNextAlignment(*alignment)) { + alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), + make_pair(reader, alignment))); + } else { // do nothing + //cerr << "reached end of file " << lowestReader->GetFilename() << endl; + } + + return true; + +} + +// get next alignment among all files without parsing character data from alignments +bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { + + // bail out if we are at EOF in all files, means no more alignments to process + if (!HasOpenReaders()) + return false; + + // when all alignments have stepped into a new target sequence, update our + // current reference sequence id + UpdateReferenceID(); + + // our lowest alignment and reader will be at the front of our alignment index + BamAlignment* alignment = alignments.begin()->second.second; + BamReader* reader = alignments.begin()->second.first; + + // now that we have the lowest alignment in the set, save it by copy to our argument + nextAlignment = BamAlignment(*alignment); + //memcpy(&nextAlignment, alignment, sizeof(BamAlignment)); + + // remove this alignment index entry from our alignment index + alignments.erase(alignments.begin()); + + // and add another entry if we can get another alignment from the reader + if (reader->GetNextAlignmentCore(*alignment)) { + alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), + make_pair(reader, alignment))); + } else { // do nothing + //cerr << "reached end of file " << lowestReader->GetFilename() << endl; + } + + return true; + +} + +// --------------------------------------------------------------------------------------- +// +// NB: The following GetReferenceX() functions assume that we have identical +// references for all BAM files. We enforce this by invoking the above +// validation function (ValidateReaders) to verify that our reference data +// is the same across all files on Open, so we will not encounter a situation +// in which there is a mismatch and we are still live. +// +// --------------------------------------------------------------------------------------- + +// returns the number of reference sequences +const int BamMultiReader::GetReferenceCount(void) const { + return readers.front().first->GetReferenceCount(); +} + +// returns vector of reference objects +const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { + return readers.front().first->GetReferenceData(); +} + +// returns refID from reference name +const int BamMultiReader::GetReferenceID(const string& refName) const { + return readers.front().first->GetReferenceID(refName); +} + +// --------------------------------------------------------------------------------------- + +// checks if any readers still have alignments +bool BamMultiReader::HasOpenReaders() { + return alignments.size() > 0; +} + +// returns whether underlying BAM readers ALL have an index loaded +// this is useful to indicate whether Jump() or SetRegion() are possible +bool BamMultiReader::IsIndexLoaded(void) const { + bool ok = true; + vector >::const_iterator readerIter = readers.begin(); + vector >::const_iterator readerEnd = readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + const BamReader* reader = (*readerIter).first; + if ( reader ) ok &= reader->IsIndexLoaded(); + } + return ok; +} + +// jumps to specified region(refID, leftBound) in BAM files, returns success/fail +bool BamMultiReader::Jump(int refID, int position) { + + //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { + CurrentRefID = refID; + CurrentLeft = position; + + bool result = true; + for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* reader = it->first; + result &= reader->Jump(refID, position); + if (!result) { + cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl; + exit(1); + } + } + if (result) UpdateAlignments(); + return result; +} + +// opens BAM files +bool BamMultiReader::Open(const vector& filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) { + + // for filename in filenames + fileNames = filenames; // save filenames in our multireader + for (vector::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { + + const string filename = *it; + BamReader* reader = new BamReader; + + bool openedOK = true; + if (openIndexes) { + + // leave index filename empty + // this allows BamReader & BamIndex to search for any available + // useDefaultIndex gives hint to prefer BAI over BTI + openedOK = reader->Open(filename, "", true, useDefaultIndex); + } + + // ignoring index file(s) + else openedOK = reader->Open(filename); + + // if file opened ok, check that it can be read + if ( openedOK ) { + + bool fileOK = true; + BamAlignment* alignment = new BamAlignment; + fileOK &= ( coreMode ? reader->GetNextAlignmentCore(*alignment) : reader->GetNextAlignment(*alignment) ); + + if (fileOK) { + readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup + alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), + make_pair(reader, alignment))); + } else { + cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl; + // if only file available & could not be read, return failure + if ( filenames.size() == 1 ) return false; + } + } + + // TODO; any further error handling when openedOK is false ?? + else + return false; + } + + // files opened ok, at least one alignment could be read, + // now need to check that all files use same reference data + ValidateReaders(); + return true; +} + +void BamMultiReader::PrintFilenames(void) { + for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* reader = it->first; + cout << reader->GetFilename() << endl; + } +} + +// returns BAM file pointers to beginning of alignment data +bool BamMultiReader::Rewind(void) { + bool result = true; + for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* reader = it->first; + result &= reader->Rewind(); + } + return result; +} + +bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) { + BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition); + return SetRegion(region); +} + +bool BamMultiReader::SetRegion(const BamRegion& region) { + + Region = region; + + // NB: While it may make sense to track readers in which we can + // successfully SetRegion, In practice a failure of SetRegion means "no + // alignments here." It makes sense to simply accept the failure, + // UpdateAlignments(), and continue. + + for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { + if (!it->first->SetRegion(region)) { + cerr << "ERROR: could not jump " << it->first->GetFilename() << " to " + << region.LeftRefID << ":" << region.LeftPosition + << ".." << region.RightRefID << ":" << region.RightPosition << endl; + } + } + + UpdateAlignments(); + return true; +} + +void BamMultiReader::UpdateAlignments(void) { + // Update Alignments + alignments.clear(); + for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* br = it->first; + BamAlignment* ba = it->second; + if (br->GetNextAlignment(*ba)) { + alignments.insert(make_pair(make_pair(ba->RefID, ba->Position), + make_pair(br, ba))); + } else { + // assume BamReader end of region / EOF + } + } +} + +// updates the reference id stored in the BamMultiReader +// to reflect the current state of the readers +void BamMultiReader::UpdateReferenceID(void) { + // the alignments are sorted by position, so the first alignment will always have the lowest reference ID + if (alignments.begin()->second.second->RefID != CurrentRefID) { + // get the next reference id + // while there aren't any readers at the next ref id + // increment the ref id + int nextRefID = CurrentRefID; + while (alignments.begin()->second.second->RefID != nextRefID) { + ++nextRefID; + } + //cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl; + CurrentRefID = nextRefID; + } +} + +// ValidateReaders checks that all the readers point to BAM files representing +// alignments against the same set of reference sequences, and that the +// sequences are identically ordered. If these checks fail the operation of +// the multireader is undefined, so we force program exit. +void BamMultiReader::ValidateReaders(void) const { + int firstRefCount = readers.front().first->GetReferenceCount(); + BamTools::RefVector firstRefData = readers.front().first->GetReferenceData(); + for (vector >::const_iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* reader = it->first; + BamTools::RefVector currentRefData = reader->GetReferenceData(); + BamTools::RefVector::const_iterator f = firstRefData.begin(); + BamTools::RefVector::const_iterator c = currentRefData.begin(); + if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) { + cerr << "ERROR: mismatched number of references in " << reader->GetFilename() + << " expected " << firstRefCount + << " reference sequences but only found " << reader->GetReferenceCount() << endl; + exit(1); + } + // this will be ok; we just checked above that we have identically-sized sets of references + // here we simply check if they are all, in fact, equal in content + while (f != firstRefData.end()) { + if (f->RefName != c->RefName || f->RefLength != c->RefLength) { + cerr << "ERROR: mismatched references found in " << reader->GetFilename() + << " expected: " << endl; + for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a) + cerr << a->RefName << " " << a->RefLength << endl; + cerr << "but found: " << endl; + for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a) + cerr << a->RefName << " " << a->RefLength << endl; + exit(1); + } + ++f; ++c; + } + } +} diff --git a/src/api/BamMultiReader.h b/src/api/BamMultiReader.h index e2c1dad..e1a6e94 100644 --- a/src/api/BamMultiReader.h +++ b/src/api/BamMultiReader.h @@ -1,9 +1,9 @@ // *************************************************************************** -// BamMultiReader.h (c) 2010 Erik Garrison +// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 3 September 2010 (DB) +// Last modified: 18 September 2010 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // *************************************************************************** @@ -11,23 +11,16 @@ #ifndef BAMMULTIREADER_H #define BAMMULTIREADER_H -// C++ includes #include #include -#include // for pair +#include #include - -using namespace std; - -// BamTools includes -#include "BamAux.h" #include "BamReader.h" namespace BamTools { // index mapping reference/position pairings to bamreaders and their alignments -typedef multimap, pair > AlignmentIndex; - +typedef std::multimap, std::pair > AlignmentIndex; class BamMultiReader { @@ -61,7 +54,7 @@ class BamMultiReader { // also useful for merging // @preferStandardIndex - look for standard BAM index ".bai" first. If false, // will look for BamTools index ".bti". - bool Open(const vector filenames, bool openIndexes = true, bool coreMode = false, bool preferStandardIndex = false); + bool Open(const std::vector& filenames, bool openIndexes = true, bool coreMode = false, bool preferStandardIndex = false); // returns whether underlying BAM readers ALL have an index loaded // this is useful to indicate whether Jump() or SetRegion() are possible @@ -97,7 +90,7 @@ class BamMultiReader { // ---------------------- // returns unified SAM header text for all files - const string GetHeaderText(void) const; + const std::string GetHeaderText(void) const; // returns number of reference sequences const int GetReferenceCount(void) const; // returns vector of reference objects @@ -112,7 +105,7 @@ class BamMultiReader { // ---------------------- // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai") - bool CreateIndexes(bool useDefaultIndex = true); + bool CreateIndexes(bool useStandardIndex = true); //const int GetReferenceID(const string& refName) const; @@ -125,13 +118,13 @@ class BamMultiReader { private: // the set of readers and alignments which we operate on, maintained throughout the life of this class - vector > readers; + std::vector > readers; // readers and alignments sorted by reference id and position, to keep track of the lowest (next) alignment // when a reader reaches EOF, its entry is removed from this index AlignmentIndex alignments; - vector fileNames; + std::vector fileNames; }; } // namespace BamTools diff --git a/src/api/BamReader.h b/src/api/BamReader.h index 6e863b6..132c95b 100644 --- a/src/api/BamReader.h +++ b/src/api/BamReader.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 3 September 2010 (DB) +// Last modified: 18 September 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -14,11 +14,8 @@ #ifndef BAMREADER_H #define BAMREADER_H -// C++ includes #include - -// BamTools includes -#include "BamAux.h" +#include "BamAlignment.h" namespace BamTools { @@ -42,7 +39,7 @@ class BamReader { bool IsIndexLoaded(void) const; // returns whether reader is open for reading or not bool IsOpen(void) const; - // performs random-access jump to reference, position + // performs random-access jump using (reference, position) as a left-bound bool Jump(int refID, int position = 0); // opens BAM file (and optional BAM index file, if provided) // @lookForIndex - if no indexFilename provided, look for an existing index file @@ -67,9 +64,8 @@ class BamReader { bool GetNextAlignment(BamAlignment& bAlignment); // retrieves next available alignment core data (returns success/fail) - // ** DOES NOT parse any character data (read name, bases, qualities, tag data) - // these can be accessed, if necessary, from the supportData - // useful for operations requiring ONLY positional or other alignment-related information + // ** DOES NOT parse any character data (read name, bases, qualities, tag data) ** + // useful for operations requiring ONLY aligner-related information (refId/position, alignment flags, CIGAR, mapQuality, etc) bool GetNextAlignmentCore(BamAlignment& bAlignment); // ---------------------- diff --git a/src/api/BamWriter.h b/src/api/BamWriter.h index b0cb6ce..106e3e9 100644 --- a/src/api/BamWriter.h +++ b/src/api/BamWriter.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 17 August 2010 (DB) +// Last modified: 18 September 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -14,11 +14,8 @@ #ifndef BAMWRITER_H #define BAMWRITER_H -// C++ includes #include - -// BamTools includes -#include "BamAux.h" +#include "BamAlignment.h" namespace BamTools { -- 2.39.2