1 // ***************************************************************************
\r
2 // BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 15 September 2010 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // Provides the basic constants, data structures, etc. for using BAM files
\r
9 // ***************************************************************************
\r
21 #include <exception>
\r
29 // Platform-specific type definitions
\r
30 #ifndef BAMTOOLS_TYPES
\r
31 #define BAMTOOLS_TYPES
\r
33 typedef char int8_t;
\r
34 typedef unsigned char uint8_t;
\r
35 typedef short int16_t;
\r
36 typedef unsigned short uint16_t;
\r
37 typedef int int32_t;
\r
38 typedef unsigned int uint32_t;
\r
39 typedef long long int64_t;
\r
40 typedef unsigned long long uint64_t;
\r
44 #endif // BAMTOOLS_TYPES
\r
46 namespace BamTools {
\r
49 const int BAM_CORE_SIZE = 32;
\r
50 const int BAM_CMATCH = 0;
\r
51 const int BAM_CINS = 1;
\r
52 const int BAM_CDEL = 2;
\r
53 const int BAM_CREF_SKIP = 3;
\r
54 const int BAM_CSOFT_CLIP = 4;
\r
55 const int BAM_CHARD_CLIP = 5;
\r
56 const int BAM_CPAD = 6;
\r
57 const int BAM_CIGAR_SHIFT = 4;
\r
58 const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);
\r
60 // BAM index constants
\r
61 const int MAX_BIN = 37450; // =(8^6-1)/7+1
\r
62 const int BAM_MIN_CHUNK_GAP = 32768;
\r
63 const int BAM_LIDX_SHIFT = 14;
\r
65 // Explicit variable sizes
\r
66 const int BT_SIZEOF_INT = 4;
\r
70 struct BamAlignment {
\r
72 // constructors & destructor
\r
75 BamAlignment(const BamAlignment& other);
\r
76 ~BamAlignment(void);
\r
78 // Queries against alignment flags
\r
80 bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate
\r
81 bool IsFailedQC(void) const; // Returns true if this read failed quality control
\r
82 bool IsFirstMate(void) const; // Returns true if alignment is first mate on read
\r
83 bool IsMapped(void) const; // Returns true if alignment is mapped
\r
84 bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped
\r
85 bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand
\r
86 bool IsPaired(void) const; // Returns true if alignment part of paired-end read
\r
87 bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment
\r
88 bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution
\r
89 bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand
\r
90 bool IsSecondMate(void) const; // Returns true if alignment is second mate on read
\r
92 // Manipulate alignment flags
\r
94 void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag
\r
95 void SetIsFailedQC(bool ok); // Sets "failed quality control" flag
\r
96 void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag
\r
97 void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is mapped" flag
\r
98 void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag
\r
99 void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag
\r
100 void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag
\r
101 void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag
\r
102 void SetIsSecondaryAlignment(bool ok); // Sets "position is primary alignment" flag
\r
103 void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag
\r
104 void SetIsUnmapped(bool ok); // Sets "alignment is mapped" flag
\r
106 // Tag data access methods
\r
108 // -------------------------------------------------------------------------------------
\r
109 // N.B. - The following tag-modifying methods may not be used on BamAlignments fetched
\r
110 // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in
\r
111 // error message (to keep output clean) but will ALWAYS return false. Only user-
\r
112 // generated BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid.
\r
114 // add tag data (create new TAG entry with TYPE and VALUE)
\r
115 // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
\r
116 // returns true if new data added, false if error or TAG already exists
\r
117 // N.B. - will NOT modify existing tag. Use EditTag() instead
\r
118 bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
\r
119 bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i
\r
120 bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i
\r
121 bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f
\r
123 // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present)
\r
124 // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
\r
125 // returns true if edit was successfaul, false if error
\r
126 bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
\r
127 bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i
\r
128 bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i
\r
129 bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f
\r
131 // specific tag data access methods - these only remain for legacy support
\r
132 bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (implemented as GetTag("NM", editDistance))
\r
133 bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (implemented as GetTag("RG", readGroup))
\r
135 // generic tag data access methods
\r
136 bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings
\r
137 bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data
\r
138 bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data
\r
139 bool GetTag(const std::string& tag, float& destination) const; // access floating point data
\r
142 // returns true if removal was successful, false if error
\r
143 // N.B. - returns false if TAG does not exist (no removal can occur)
\r
144 bool RemoveTag(const std::string& tag);
\r
146 // Additional data access methods
\r
148 // calculates alignment end position, based on starting position and CIGAR operations
\r
149 // @zeroBased - if true, returns 0-based coordinate; else returns 1-based
\r
150 int GetEndPosition(bool usePadded = false, bool zeroBased = true) const;
\r
152 // 'internal' utility methods
\r
154 static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed);
\r
155 static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);
\r
159 std::string Name; // Read name
\r
160 int32_t Length; // Query length
\r
161 std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)
\r
162 std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)
\r
163 std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
\r
164 std::string TagData; // Tag data (accessor methods will pull the requested information out)
\r
165 int32_t RefID; // ID number for reference sequence
\r
166 int32_t Position; // Position (0-based) where alignment starts
\r
167 uint16_t Bin; // Bin in BAM file where this alignment resides
\r
168 uint16_t MapQuality; // Mapping quality score
\r
169 uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate
\r
170 std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
\r
171 int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
\r
172 int32_t MatePosition; // Position (0-based) where alignment's mate starts
\r
173 int32_t InsertSize; // Mate-pair insert size
\r
177 struct BamAlignmentSupportData {
\r
180 std::string AllCharData;
\r
181 uint32_t BlockLength;
\r
182 uint32_t NumCigarOperations;
\r
183 uint32_t QueryNameLength;
\r
184 uint32_t QuerySequenceLength;
\r
188 BamAlignmentSupportData(void)
\r
190 , NumCigarOperations(0)
\r
191 , QueryNameLength(0)
\r
192 , QuerySequenceLength(0)
\r
193 , HasCoreOnly(false)
\r
197 // contains raw character data & lengths
\r
198 BamAlignmentSupportData SupportData;
\r
200 // allow these classes access to BamAlignment private members (SupportData)
\r
201 // but client code should not need to touch this data
\r
202 friend class BamReader;
\r
203 friend class BamWriter;
\r
205 // Alignment flag query constants
\r
206 // Use the get/set methods above instead
\r
211 , MATE_UNMAPPED = 8
\r
213 , MATE_REVERSE = 32
\r
218 , DUPLICATE = 1024
\r
222 // ----------------------------------------------------------------
\r
223 // Auxiliary data structs & typedefs
\r
228 char Type; // Operation type (MIDNSHP)
\r
229 uint32_t Length; // Operation length (number of bases)
\r
232 CigarOp(const char type = '\0',
\r
233 const uint32_t length = 0)
\r
242 std::string RefName; // Name of reference sequence
\r
243 int32_t RefLength; // Length of reference sequence
\r
244 bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence
\r
247 RefData(const int32_t& length = 0,
\r
249 : RefLength(length)
\r
250 , RefHasAlignments(ok)
\r
254 typedef std::vector<RefData> RefVector;
\r
255 typedef std::vector<BamAlignment> BamAlignmentVector;
\r
266 BamRegion(const int& leftID = -1,
\r
267 const int& leftPos = -1,
\r
268 const int& rightID = -1,
\r
269 const int& rightPos = -1)
\r
270 : LeftRefID(leftID)
\r
271 , LeftPosition(leftPos)
\r
272 , RightRefID(rightID)
\r
273 , RightPosition(rightPos)
\r
276 // member functions
\r
277 void clear(void) { LeftRefID = -1; LeftPosition = -1; RightRefID = -1; RightPosition = -1; }
\r
278 bool isLeftBoundSpecified(void) const { return ( LeftRefID != -1 && LeftPosition != -1 ); }
\r
279 bool isNull(void) const { return ( !isLeftBoundSpecified() && !isRightBoundSpecified() ); }
\r
280 bool isRightBoundSpecified(void) const { return ( RightRefID != -1 && RightPosition != -1 ); }
\r
283 // ----------------------------------------------------------------
\r
284 // Added: 3-35-2010 DWB
\r
285 // Fixed: Routines to provide endian-correctness
\r
286 // ----------------------------------------------------------------
\r
288 // returns true if system is big endian
\r
289 inline bool SystemIsBigEndian(void) {
\r
290 const uint16_t one = 0x0001;
\r
291 return ((*(char*) &one) == 0 );
\r
294 // swaps endianness of 16-bit value 'in place'
\r
295 inline void SwapEndian_16(int16_t& x) {
\r
296 x = ((x >> 8) | (x << 8));
\r
299 inline void SwapEndian_16(uint16_t& x) {
\r
300 x = ((x >> 8) | (x << 8));
\r
303 // swaps endianness of 32-bit value 'in-place'
\r
304 inline void SwapEndian_32(int32_t& x) {
\r
306 ((x << 8) & 0x00FF0000) |
\r
307 ((x >> 8) & 0x0000FF00) |
\r
312 inline void SwapEndian_32(uint32_t& x) {
\r
314 ((x << 8) & 0x00FF0000) |
\r
315 ((x >> 8) & 0x0000FF00) |
\r
320 // swaps endianness of 64-bit value 'in-place'
\r
321 inline void SwapEndian_64(int64_t& x) {
\r
323 ((x << 40) & 0x00FF000000000000ll) |
\r
324 ((x << 24) & 0x0000FF0000000000ll) |
\r
325 ((x << 8) & 0x000000FF00000000ll) |
\r
326 ((x >> 8) & 0x00000000FF000000ll) |
\r
327 ((x >> 24) & 0x0000000000FF0000ll) |
\r
328 ((x >> 40) & 0x000000000000FF00ll) |
\r
333 inline void SwapEndian_64(uint64_t& x) {
\r
335 ((x << 40) & 0x00FF000000000000ll) |
\r
336 ((x << 24) & 0x0000FF0000000000ll) |
\r
337 ((x << 8) & 0x000000FF00000000ll) |
\r
338 ((x >> 8) & 0x00000000FF000000ll) |
\r
339 ((x >> 24) & 0x0000000000FF0000ll) |
\r
340 ((x >> 40) & 0x000000000000FF00ll) |
\r
345 // swaps endianness of 'next 2 bytes' in a char buffer (in-place)
\r
346 inline void SwapEndian_16p(char* data) {
\r
347 uint16_t& value = (uint16_t&)*data;
\r
348 SwapEndian_16(value);
\r
351 // swaps endianness of 'next 4 bytes' in a char buffer (in-place)
\r
352 inline void SwapEndian_32p(char* data) {
\r
353 uint32_t& value = (uint32_t&)*data;
\r
354 SwapEndian_32(value);
\r
357 // swaps endianness of 'next 8 bytes' in a char buffer (in-place)
\r
358 inline void SwapEndian_64p(char* data) {
\r
359 uint64_t& value = (uint64_t&)*data;
\r
360 SwapEndian_64(value);
\r
363 inline bool FileExists(const std::string& filename) {
\r
364 std::ifstream f(filename.c_str(), std::ifstream::in);
\r
368 // ----------------------------------------------------------------
\r
369 // BamAlignment member methods
\r
371 // constructors & destructor
\r
372 inline BamAlignment::BamAlignment(void) { }
\r
374 inline BamAlignment::BamAlignment(const BamAlignment& other)
\r
376 , Length(other.Length)
\r
377 , QueryBases(other.QueryBases)
\r
378 , AlignedBases(other.AlignedBases)
\r
379 , Qualities(other.Qualities)
\r
380 , TagData(other.TagData)
\r
381 , RefID(other.RefID)
\r
382 , Position(other.Position)
\r
384 , MapQuality(other.MapQuality)
\r
385 , AlignmentFlag(other.AlignmentFlag)
\r
386 , CigarData(other.CigarData)
\r
387 , MateRefID(other.MateRefID)
\r
388 , MatePosition(other.MatePosition)
\r
389 , InsertSize(other.InsertSize)
\r
390 , SupportData(other.SupportData)
\r
393 inline BamAlignment::~BamAlignment(void) { }
\r
395 // Queries against alignment flags
\r
396 inline bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }
\r
397 inline bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }
\r
398 inline bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }
\r
399 inline bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }
\r
400 inline bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
\r
401 inline bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }
\r
402 inline bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }
\r
403 inline bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }
\r
404 inline bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }
\r
405 inline bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }
\r
406 inline bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }
\r
408 // Manipulate alignment flags
\r
409 inline void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; }
\r
410 inline void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; }
\r
411 inline void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; }
\r
412 inline void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; }
\r
413 inline void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; }
\r
414 inline void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; }
\r
415 inline void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; }
\r
416 inline void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; }
\r
417 inline void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; }
\r
418 inline void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; }
\r
419 inline void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; }
\r
421 // calculates alignment end position, based on starting position and CIGAR operations
\r
423 int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const {
\r
425 // initialize alignment end to starting position
\r
426 int alignEnd = Position;
\r
428 // iterate over cigar operations
\r
429 std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
\r
430 std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
\r
431 for ( ; cigarIter != cigarEnd; ++cigarIter) {
\r
432 const char cigarType = (*cigarIter).Type;
\r
433 if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
\r
434 alignEnd += (*cigarIter).Length;
\r
436 else if ( usePadded && cigarType == 'I' ) {
\r
437 alignEnd += (*cigarIter).Length;
\r
441 // adjust for zeroBased, if necessary
\r
443 return alignEnd - 1;
\r
449 bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) {
\r
451 if ( SupportData.HasCoreOnly ) return false;
\r
452 if ( tag.size() != 2 || type.size() != 1 ) return false;
\r
453 if ( type != "Z" && type != "H" ) return false;
\r
455 // localize the tag data
\r
456 char* pTagData = (char*)TagData.data();
\r
457 const unsigned int tagDataLength = TagData.size();
\r
458 unsigned int numBytesParsed = 0;
\r
460 // if tag already exists, return false
\r
461 // use EditTag explicitly instead
\r
462 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
\r
464 // otherwise, copy tag data to temp buffer
\r
465 std::string newTag = tag + type + value;
\r
466 const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
\r
467 char originalTagData[newTagDataLength];
\r
468 memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
\r
471 strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term
\r
473 // store temp buffer back in TagData
\r
474 const char* newTagData = (const char*)originalTagData;
\r
475 TagData.assign(newTagData, newTagDataLength);
\r
482 bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) {
\r
484 if ( SupportData.HasCoreOnly ) return false;
\r
485 if ( tag.size() != 2 || type.size() != 1 ) return false;
\r
486 if ( type == "f" || type == "Z" || type == "H" ) return false;
\r
488 // localize the tag data
\r
489 char* pTagData = (char*)TagData.data();
\r
490 const unsigned int tagDataLength = TagData.size();
\r
491 unsigned int numBytesParsed = 0;
\r
493 // if tag already exists, return false
\r
494 // use EditTag explicitly instead
\r
495 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
\r
497 // otherwise, convert value to string
\r
498 union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;
\r
501 // copy original tag data to temp buffer
\r
502 std::string newTag = tag + type;
\r
503 const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer
\r
504 char originalTagData[newTagDataLength];
\r
505 memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
\r
508 strcat(originalTagData + tagDataLength, newTag.data());
\r
509 memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int));
\r
511 // store temp buffer back in TagData
\r
512 const char* newTagData = (const char*)originalTagData;
\r
513 TagData.assign(newTagData, newTagDataLength);
\r
520 bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) {
\r
521 return AddTag(tag, type, (const uint32_t&)value);
\r
525 bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) {
\r
527 if ( SupportData.HasCoreOnly ) return false;
\r
528 if ( tag.size() != 2 || type.size() != 1 ) return false;
\r
529 if ( type == "Z" || type == "H" ) return false;
\r
531 // localize the tag data
\r
532 char* pTagData = (char*)TagData.data();
\r
533 const unsigned int tagDataLength = TagData.size();
\r
534 unsigned int numBytesParsed = 0;
\r
536 // if tag already exists, return false
\r
537 // use EditTag explicitly instead
\r
538 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
\r
540 // otherwise, convert value to string
\r
541 union { float value; char valueBuffer[sizeof(float)]; } un;
\r
544 // copy original tag data to temp buffer
\r
545 std::string newTag = tag + type;
\r
546 const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float
\r
547 char originalTagData[newTagDataLength];
\r
548 memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
\r
551 strcat(originalTagData + tagDataLength, newTag.data());
\r
552 memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float));
\r
554 // store temp buffer back in TagData
\r
555 const char* newTagData = (const char*)originalTagData;
\r
556 TagData.assign(newTagData, newTagDataLength);
\r
563 bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) {
\r
565 if ( SupportData.HasCoreOnly ) return false;
\r
566 if ( tag.size() != 2 || type.size() != 1 ) return false;
\r
567 if ( type != "Z" && type != "H" ) return false;
\r
569 // localize the tag data
\r
570 char* pOriginalTagData = (char*)TagData.data();
\r
571 char* pTagData = pOriginalTagData;
\r
572 const unsigned int originalTagDataLength = TagData.size();
\r
574 unsigned int newTagDataLength = 0;
\r
575 unsigned int numBytesParsed = 0;
\r
577 // if tag found, store data in readGroup, return success
\r
578 if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
\r
580 // make sure array is more than big enough
\r
581 char newTagData[originalTagDataLength + value.size()];
\r
583 // copy original tag data up til desired tag
\r
584 const unsigned int beginningTagDataLength = numBytesParsed;
\r
585 newTagDataLength += beginningTagDataLength;
\r
586 memcpy(newTagData, pOriginalTagData, numBytesParsed);
\r
588 // copy new VALUE in place of current tag data
\r
589 const unsigned int dataLength = strlen(value.c_str());
\r
590 memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 );
\r
592 // skip to next tag (if tag for removal is last, return true)
\r
593 const char* pTagStorageType = pTagData - 1;
\r
594 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;
\r
596 // copy everything from current tag (the next one after tag for removal) to end
\r
597 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
\r
598 const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1;
\r
599 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
\r
600 memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
\r
602 // ensure null-terminator
\r
603 newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
\r
605 // save new tag data
\r
606 TagData.assign(newTagData, endTagOffset + endTagDataLength);
\r
610 // tag not found, attempt AddTag
\r
611 else return AddTag(tag, type, value);
\r
615 bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) {
\r
617 if ( SupportData.HasCoreOnly ) return false;
\r
618 if ( tag.size() != 2 || type.size() != 1 ) return false;
\r
619 if ( type == "f" || type == "Z" || type == "H" ) return false;
\r
621 // localize the tag data
\r
622 char* pOriginalTagData = (char*)TagData.data();
\r
623 char* pTagData = pOriginalTagData;
\r
624 const unsigned int originalTagDataLength = TagData.size();
\r
626 unsigned int newTagDataLength = 0;
\r
627 unsigned int numBytesParsed = 0;
\r
629 // if tag found, store data in readGroup, return success
\r
630 if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
\r
632 // make sure array is more than big enough
\r
633 char newTagData[originalTagDataLength + sizeof(value)];
\r
635 // copy original tag data up til desired tag
\r
636 const unsigned int beginningTagDataLength = numBytesParsed;
\r
637 newTagDataLength += beginningTagDataLength;
\r
638 memcpy(newTagData, pOriginalTagData, numBytesParsed);
\r
640 // copy new VALUE in place of current tag data
\r
641 union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;
\r
643 memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int));
\r
645 // skip to next tag (if tag for removal is last, return true)
\r
646 const char* pTagStorageType = pTagData - 1;
\r
647 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;
\r
649 // copy everything from current tag (the next one after tag for removal) to end
\r
650 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
\r
651 const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int);
\r
652 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
\r
653 memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
\r
655 // ensure null-terminator
\r
656 newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
\r
658 // save new tag data
\r
659 TagData.assign(newTagData, endTagOffset + endTagDataLength);
\r
663 // tag not found, attempt AddTag
\r
664 else return AddTag(tag, type, value);
\r
668 bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) {
\r
669 return EditTag(tag, type, (const uint32_t&)value);
\r
673 bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) {
\r
675 if ( SupportData.HasCoreOnly ) return false;
\r
676 if ( tag.size() != 2 || type.size() != 1 ) return false;
\r
677 if ( type == "Z" || type == "H" ) return false;
\r
679 // localize the tag data
\r
680 char* pOriginalTagData = (char*)TagData.data();
\r
681 char* pTagData = pOriginalTagData;
\r
682 const unsigned int originalTagDataLength = TagData.size();
\r
684 unsigned int newTagDataLength = 0;
\r
685 unsigned int numBytesParsed = 0;
\r
687 // if tag found, store data in readGroup, return success
\r
688 if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
\r
690 // make sure array is more than big enough
\r
691 char newTagData[originalTagDataLength + sizeof(value)];
\r
693 // copy original tag data up til desired tag
\r
694 const unsigned int beginningTagDataLength = numBytesParsed;
\r
695 newTagDataLength += beginningTagDataLength;
\r
696 memcpy(newTagData, pOriginalTagData, numBytesParsed);
\r
698 // copy new VALUE in place of current tag data
\r
699 union { float value; char valueBuffer[sizeof(float)]; } un;
\r
701 memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float));
\r
703 // skip to next tag (if tag for removal is last, return true)
\r
704 const char* pTagStorageType = pTagData - 1;
\r
705 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;
\r
707 // copy everything from current tag (the next one after tag for removal) to end
\r
708 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
\r
709 const unsigned int endTagOffset = beginningTagDataLength + sizeof(float);
\r
710 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
\r
711 memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);
\r
713 // ensure null-terminator
\r
714 newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;
\r
716 // save new tag data
\r
717 TagData.assign(newTagData, endTagOffset + endTagDataLength);
\r
721 // tag not found, attempt AddTag
\r
722 else return AddTag(tag, type, value);
\r
725 // get "NM" tag data - originally contributed by Aaron Quinlan
\r
726 // stores data in 'editDistance', returns success/fail
\r
728 bool BamAlignment::GetEditDistance(uint32_t& editDistance) const {
\r
729 return GetTag("NM", (uint32_t&)editDistance);
\r
732 // get "RG" tag data
\r
733 // stores data in 'readGroup', returns success/fail
\r
735 bool BamAlignment::GetReadGroup(std::string& readGroup) const {
\r
736 return GetTag("RG", readGroup);
\r
740 bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const {
\r
742 // make sure tag data exists
\r
743 if ( SupportData.HasCoreOnly || TagData.empty() )
\r
746 // localize the tag data
\r
747 char* pTagData = (char*)TagData.data();
\r
748 const unsigned int tagDataLength = TagData.size();
\r
749 unsigned int numBytesParsed = 0;
\r
751 // if tag found, store data in readGroup, return success
\r
752 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
\r
753 const unsigned int dataLength = strlen(pTagData);
\r
754 destination.clear();
\r
755 destination.resize(dataLength);
\r
756 memcpy( (char*)destination.data(), pTagData, dataLength );
\r
760 // tag not found, return failure
\r
765 bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const {
\r
767 // make sure tag data exists
\r
768 if ( SupportData.HasCoreOnly || TagData.empty() )
\r
771 // localize the tag data
\r
772 char* pTagData = (char*)TagData.data();
\r
773 const unsigned int tagDataLength = TagData.size();
\r
774 unsigned int numBytesParsed = 0;
\r
776 // if tag found, determine data byte-length, store data in readGroup, return success
\r
777 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
\r
779 // determine data byte-length
\r
780 const char type = *(pTagData - 1);
\r
781 int destinationLength = 0;
\r
787 destinationLength = 1;
\r
793 destinationLength = 2;
\r
799 destinationLength = 4;
\r
802 // unsupported type for integer destination (float or var-length strings)
\r
806 fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type);
\r
809 // unknown tag type
\r
811 fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
\r
815 // store in destination
\r
817 memcpy(&destination, pTagData, destinationLength);
\r
821 // tag not found, return failure
\r
826 bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const {
\r
827 return GetTag(tag, (uint32_t&)destination);
\r
831 bool BamAlignment::GetTag(const std::string& tag, float& destination) const {
\r
833 // make sure tag data exists
\r
834 if ( SupportData.HasCoreOnly || TagData.empty() )
\r
837 // localize the tag data
\r
838 char* pTagData = (char*)TagData.data();
\r
839 const unsigned int tagDataLength = TagData.size();
\r
840 unsigned int numBytesParsed = 0;
\r
842 // if tag found, determine data byte-length, store data in readGroup, return success
\r
843 if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
\r
844 //pTagData += numBytesParsed;
\r
846 // determine data byte-length
\r
847 const char type = *(pTagData - 1);
\r
848 int destinationLength = 0;
\r
855 destinationLength = 1;
\r
861 destinationLength = 2;
\r
868 destinationLength = 4;
\r
871 // unsupported type (var-length strings)
\r
874 fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type);
\r
877 // unknown tag type
\r
879 fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
\r
883 // store in destination
\r
885 memcpy(&destination, pTagData, destinationLength);
\r
889 // tag not found, return failure
\r
894 bool BamAlignment::RemoveTag(const std::string& tag) {
\r
896 // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed
\r
897 // also, return false if no data present to remove
\r
898 if ( SupportData.HasCoreOnly || TagData.empty() ) return false;
\r
900 // localize the tag data
\r
901 char* pOriginalTagData = (char*)TagData.data();
\r
902 char* pTagData = pOriginalTagData;
\r
903 const unsigned int originalTagDataLength = TagData.size();
\r
904 unsigned int newTagDataLength = 0;
\r
905 unsigned int numBytesParsed = 0;
\r
907 // if tag found, store data in readGroup, return success
\r
908 if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {
\r
910 char newTagData[originalTagDataLength];
\r
912 // copy original tag data up til desired tag
\r
914 numBytesParsed -= 3;
\r
915 const unsigned int beginningTagDataLength = numBytesParsed;
\r
916 newTagDataLength += beginningTagDataLength;
\r
917 memcpy(newTagData, pOriginalTagData, numBytesParsed);
\r
919 // skip to next tag (if tag for removal is last, return true)
\r
920 const char* pTagStorageType = pTagData + 2;
\r
922 numBytesParsed += 3;
\r
923 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;
\r
925 // copy everything from current tag (the next one after tag for removal) to end
\r
926 const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);
\r
927 const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;
\r
928 memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength );
\r
930 // save new tag data
\r
931 TagData.assign(newTagData, beginningTagDataLength + endTagDataLength);
\r
935 // tag not found, no removal - return failure
\r
940 bool BamAlignment::FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) {
\r
942 while ( numBytesParsed < tagDataLength ) {
\r
944 const char* pTagType = pTagData;
\r
945 const char* pTagStorageType = pTagData + 2;
\r
947 numBytesParsed += 3;
\r
949 // check the current tag, return true on match
\r
950 if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 )
\r
953 // get the storage class and find the next tag
\r
954 if ( *pTagStorageType == '\0' ) return false;
\r
955 if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
\r
956 if ( *pTagData == '\0' ) return false;
\r
959 // checked all tags, none match
\r
964 bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
\r
966 switch(storageType) {
\r
977 numBytesParsed += 2;
\r
984 numBytesParsed += 4;
\r
994 // increment for null-terminator
\r
1001 fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType);
\r
1009 } // namespace BamTools
\r
1011 #endif // BAMAUX_H
\r