// ***************************************************************************
// BamAlignment.h (c) 2009 Derek Barnett
// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 22 December 2010 (DB)
+// Last modified: 22 April 2011 (DB)
// ---------------------------------------------------------------------------
// Provides the BamAlignment data structure
// ***************************************************************************
namespace BamTools {
-// forward declare BamAlignment's friend classes
+// forward declaration of BamAlignment's friend classes
namespace Internal {
class BamReaderPrivate;
class BamWriterPrivate;
BamAlignment(const BamAlignment& other);
~BamAlignment(void);
- // Queries against alignment flags
+ // queries against alignment flags
public:
- bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate
- bool IsFailedQC(void) const; // Returns true if this read failed quality control
- bool IsFirstMate(void) const; // Returns true if alignment is first mate on read
- bool IsMapped(void) const; // Returns true if alignment is mapped
- bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped
- bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand
- bool IsPaired(void) const; // Returns true if alignment part of paired-end read
- bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment
- bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution
- bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand
- bool IsSecondMate(void) const; // Returns true if alignment is second mate on read
-
- // Manipulate alignment flags
+ bool IsDuplicate(void) const; // returns true if this read is a PCR duplicate
+ bool IsFailedQC(void) const; // returns true if this read failed quality control
+ bool IsFirstMate(void) const; // returns true if alignment is the first mate on the read
+ bool IsMapped(void) const; // returns true if alignment is mapped
+ bool IsMateMapped(void) const; // returns true if alignment's mate is mapped
+ bool IsMateReverseStrand(void) const; // returns true if alignment's mate is mapped to the reverse strand
+ bool IsPaired(void) const; // returns true if alignment is part of a paired-end read
+ bool IsPrimaryAlignment(void) const; // returns true if reported position is the primary alignment
+ bool IsProperPair(void) const; // returns true if alignment is part of a read that satisfied paired-end resolution
+ bool IsReverseStrand(void) const; // returns true if alignment is mapped to the reverse strand
+ bool IsSecondMate(void) const; // returns true if alignment is the second mate on the read
+
+ // manipulate alignment flags
public:
- void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag
- void SetIsFailedQC(bool ok); // Sets "failed quality control" flag
- void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag
- void SetIsMapped(bool ok); // Sets "alignment is mapped" flag
- void SetIsMateMapped(bool ok); // Sets "alignment's mate is mapped" flag
- void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag
- void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag
- void SetIsPrimaryAlignment(bool ok); // Sets "position is primary alignment" flag
- void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag
- void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag
- void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag
-
- // legacy methods (deprecated, but available)
- void SetIsMateUnmapped(bool ok); // Complement of IsMateMapped() flag
- void SetIsSecondaryAlignment(bool ok); // Complement of IsPrimaryAlignment() flag
- void SetIsUnmapped(bool ok); // Complement of IsMapped() flag
-
- // Tag data access methods
+ void SetIsDuplicate(bool ok); // sets value of the "PCR duplicate" flag
+ void SetIsFailedQC(bool ok); // sets value of the "failed quality control" flag
+ void SetIsFirstMate(bool ok); // sets value of the "alignment is first mate" flag
+ void SetIsMapped(bool ok); // sets value of the "alignment is mapped" flag
+ void SetIsMateMapped(bool ok); // sets value of the "alignment's mate is mapped" flag
+ void SetIsMateReverseStrand(bool ok); // sets value of the "alignment's mate is mapped to reverse strand" flag
+ void SetIsPaired(bool ok); // sets value of the "alignment is part of a paired-end read" flag
+ void SetIsPrimaryAlignment(bool ok); // sets value of the "position is primary alignment" flag
+ void SetIsProperPair(bool ok); // sets value of the "alignment is part of a read that satisfied paired-end resolution" flag
+ void SetIsReverseStrand(bool ok); // sets value of the "alignment is mapped to reverse strand" flag
+ void SetIsSecondMate(bool ok); // sets value of the "alignment is second mate on read" flag
+
+ // legacy methods (consider deprecated, but still available); each is the inverse-sense counterpart of a setter above
+ void SetIsMateUnmapped(bool ok); // complement of SetIsMateMapped() - NOTE(review): presumably same as SetIsMateMapped(!ok); confirm in implementation
+ void SetIsSecondaryAlignment(bool ok); // complement of SetIsPrimaryAlignment() - NOTE(review): presumably same as SetIsPrimaryAlignment(!ok); confirm
+ void SetIsUnmapped(bool ok); // complement of SetIsMapped() - NOTE(review): presumably same as SetIsMapped(!ok); confirm
+
+ // tag data access methods
public:
+
// -------------------------------------------------------------------------------------
// N.B. - The following tag access methods may not be used on BamAlignments fetched
// using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in
// error message (to keep output clean) but will ALWAYS return false. Only user-created
// BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid here.
+ //
+ // You can call BuildCharData() on such an alignment retrieved by GetNextAlignmentCore().
+ // This populates all the character data, and will enable subsequent queries on tag data.
+ // -------------------------------------------------------------------------------------
- // add tag data (create new TAG entry with TYPE and VALUE)
- // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
- // returns true if new data added, false if error or TAG already exists
- // N.B. - will NOT modify existing tag. Use EditTag() instead
- // @tag - two character tag name
- // @type - single character tag type (see SAM/BAM spec for details)
- // @value - value to associate with tag
- bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
- bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i
- bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i
- bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f
-
- // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present)
- // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
- // returns true if edit was successfaul, false if error
- // @tag - two character tag name
- // @type - single character tag type (see SAM/BAM spec for details)
- // @value - new value for tag
- bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
- bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i
- bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i
- bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f
-
- // specific tag data access methods - these only remain for legacy support
- // returns whether specific tag could be retrieved
- bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (equivalent to GetTag("NM", editDistance))
- bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (equivalent to GetTag("RG", readGroup))
-
- // generic tag data access methods
- // returns whether tag is found & tag type is compatible with DESTINATION
- // @tag - two character tag name
- // @destination - if found, tag value is stored here
- bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings
- bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data
- bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data
- bool GetTag(const std::string& tag, float& destination) const; // access floating point data
-
- // retrieve the tag type code for TAG
- // returns true if tag could be found and type determined
+ // adds a tag holding a single value: takes the tag name, a type string and the value. NOTE(review): bool result presumably signals success - confirm in implementation
+ bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // std::string value
+ bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // unsigned 32-bit integer value
+ bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // signed 32-bit integer value
+ bool AddTag(const std::string& tag, const std::string& type, const float& value); // floating point value
+
+ // adds a "binary array" tag from a vector of values; unlike the single-value overloads, no type string is passed
+ bool AddTag(const std::string& tag, const std::vector<uint8_t>& values); // unsigned 8-bit elements
+ bool AddTag(const std::string& tag, const std::vector<int8_t>& values); // signed 8-bit elements
+ bool AddTag(const std::string& tag, const std::vector<uint16_t>& values); // unsigned 16-bit elements
+ bool AddTag(const std::string& tag, const std::vector<int16_t>& values); // signed 16-bit elements
+ bool AddTag(const std::string& tag, const std::vector<uint32_t>& values); // unsigned 32-bit elements
+ bool AddTag(const std::string& tag, const std::vector<int32_t>& values); // signed 32-bit elements
+ bool AddTag(const std::string& tag, const std::vector<float>& values); // floating point elements
+
+ // edits a single-value tag: takes the tag name, a type string and the new value. NOTE(review): presumably adds the tag when it is not already present - confirm in implementation
+ bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // std::string value
+ bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // unsigned 32-bit integer value
+ bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // signed 32-bit integer value
+ bool EditTag(const std::string& tag, const std::string& type, const float& value); // floating point value
+
+ // edits a "binary array" tag from a vector of new values; overload set mirrors the array AddTag() overloads (no type string is passed)
+ bool EditTag(const std::string& tag, const std::vector<uint8_t>& values); // unsigned 8-bit elements
+ bool EditTag(const std::string& tag, const std::vector<int8_t>& values); // signed 8-bit elements
+ bool EditTag(const std::string& tag, const std::vector<uint16_t>& values); // unsigned 16-bit elements
+ bool EditTag(const std::string& tag, const std::vector<int16_t>& values); // signed 16-bit elements
+ bool EditTag(const std::string& tag, const std::vector<uint32_t>& values); // unsigned 32-bit elements
+ bool EditTag(const std::string& tag, const std::vector<int32_t>& values); // signed 32-bit elements
+ bool EditTag(const std::string& tag, const std::vector<float>& values); // floating point elements
+
+ // retrieves data for a tag into 'destination'. NOTE(review): presumably returns true only if the tag is found and its stored type fits the destination type - confirm in implementation
+ bool GetTag(const std::string& tag, std::string& destination) const; // std::string destination
+ bool GetTag(const std::string& tag, uint32_t& destination) const; // unsigned 32-bit integer destination
+ bool GetTag(const std::string& tag, int32_t& destination) const; // signed 32-bit integer destination
+ bool GetTag(const std::string& tag, float& destination) const; // floating point destination
+
+ // retrieves data for a "binary array" tag into 'destination'; only 32-bit integer and float vectors are declared here, although AddTag()/EditTag() also accept 8- and 16-bit element vectors. NOTE(review): confirm how narrower arrays are read back
+ bool GetTag(const std::string& tag, std::vector<uint32_t>& destination) const; // unsigned 32-bit elements
+ bool GetTag(const std::string& tag, std::vector<int32_t>& destination) const; // signed 32-bit elements
+ bool GetTag(const std::string& tag, std::vector<float>& destination) const; // floating point elements
+
+ // retrieves the BAM tag-type character for a tag, storing it in 'type'. NOTE(review): presumably returns true only if the tag is found and its type could be determined - confirm
bool GetTagType(const std::string& tag, char& type) const;
+
+ // legacy methods (consider deprecated, but still available); shorthand getters for two specific tags
+ bool GetEditDistance(uint32_t& editDistance) const; // retrieves value of "NM" tag - NOTE(review): presumably equivalent to GetTag("NM", editDistance); confirm
+ bool GetReadGroup(std::string& readGroup) const; // retrieves value of "RG" tag - NOTE(review): presumably equivalent to GetTag("RG", readGroup); confirm
- // remove tag data
- // returns true if removal was successful, false if error
- // N.B. - returns false if TAG does not exist (no removal can occur)
- // @tag - two character tag name
+ // returns true if alignment has a record for this tag name (only the name is passed; no value or type is retrieved)
+ bool HasTag(const std::string& tag) const;
+
+ // removes the tag with this name. NOTE(review): presumably returns false when the tag does not exist (nothing to remove) or on error - confirm in implementation
bool RemoveTag(const std::string& tag);
- // Populate an alignment retrieved by BamAlignment::GetNextAlignmentCore() with full character data
- // (read name, bases, qualities, tag data)
+ // additional methods
public:
+ // populates alignment string fields (all character data); call this on an alignment retrieved by GetNextAlignmentCore() to enable tag queries
bool BuildCharData(void);
-
- // Additional data access methods
- public:
- // calculates & returns alignment end position, based on starting position and CIGAR operations
- // @usePadded - if true, counts inserted bases. Default is false, so that alignment end position matches the last base's position in reference
- // @zeroBased - if true, returns 0-based coordinate; else returns 1-based. Setting this to false is useful when using BAM data along with other, half-open formats.
+ // calculates alignment end position. NOTE(review): presumably derived from start position and CIGAR operations, with usePadded counting inserted bases and zeroBased choosing 0-based (true) vs 1-based (false) output - confirm
int GetEndPosition(bool usePadded = false, bool zeroBased = true) const;
- // 'internal' utility methods
- private:
- static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed);
- static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);
-
- // Data members
+ // public data fields
public:
- std::string Name; // Read name
- int32_t Length; // Query length
- std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)
- std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)
+ std::string Name; // read name
+ int32_t Length; // length of query sequence
+ std::string QueryBases; // 'original' sequence (as reported from sequencing machine)
+ std::string AlignedBases; // 'aligned' sequence (includes any indels, padding, clipping)
std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
- std::string TagData; // Tag data (accessor methods will pull the requested information out)
+ std::string TagData; // tag data (use provided methods to query/modify)
int32_t RefID; // ID number for reference sequence
- int32_t Position; // Position (0-based) where alignment starts
- uint16_t Bin; // Bin in BAM file where this alignment resides
- uint16_t MapQuality; // Mapping quality score
- uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate
+ int32_t Position; // position (0-based) where alignment starts
+ uint16_t Bin; // BAM (standard) index bin number for this alignment
+ uint16_t MapQuality; // mapping quality score
+ uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
- int32_t MatePosition; // Position (0-based) where alignment's mate starts
- int32_t InsertSize; // Mate-pair insert size
-
- // Internal data, inaccessible to client code
- // but available BamReaderPrivate & BamWriterPrivate
+ int32_t MatePosition; // position (0-based) where alignment's mate starts
+ int32_t InsertSize; // mate-pair insert size
+ std::string Filename; // name of BAM file which this alignment comes from
+
+ //! \cond
+ // internal utility methods
private:
+ bool FindTag(const std::string& tag, // tag name to search for
+ char*& pTagData, // cursor into the tag data, passed by reference - NOTE(review): presumably advanced to the matching tag; confirm
+ const unsigned int& tagDataLength, // length of the tag data being searched
+ unsigned int& numBytesParsed) const; // running count of bytes parsed, passed by reference so the callee can update it
+ bool IsValidSize(const std::string& tag, // NOTE(review): presumably checks that tag is two characters and type is one character; confirm
+ const std::string& type) const;
+ bool SkipToNextTag(const char storageType, // type code of the tag entry being skipped
+ char*& pTagData, // cursor into the tag data, passed by reference so the callee can advance it
+ unsigned int& numBytesParsed) const; // running count of bytes parsed, passed by reference so the callee can update it
+
+ // internal data
+ private:
+
struct BamAlignmentSupportData {
// data members
, HasCoreOnly(false)
{ }
};
- BamAlignmentSupportData SupportData;
- friend class Internal::BamReaderPrivate;
- friend class Internal::BamWriterPrivate;
-
- // Alignment flag query constants
- // Use the get/set methods above instead
- private:
- enum { PAIRED = 1
- , PROPER_PAIR = 2
- , UNMAPPED = 4
- , MATE_UNMAPPED = 8
- , REVERSE = 16
- , MATE_REVERSE = 32
- , READ_1 = 64
- , READ_2 = 128
- , SECONDARY = 256
- , QC_FAILED = 512
- , DUPLICATE = 1024
- };
+ BamAlignmentSupportData SupportData;
+ friend class Internal::BamReaderPrivate;
+ friend class Internal::BamWriterPrivate;
+ //! \endcond
};
-// convenience typedef(s)
typedef std::vector<BamAlignment> BamAlignmentVector;
} // namespace BamTools