]> git.donarmstrong.com Git - bamtools.git/commitdiff
Added support for the tag data and provided a GetReadGroup accessor.
author mikaels <mikaels@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Sat, 11 Apr 2009 03:31:59 +0000 (03:31 +0000)
committer mikaels <mikaels@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Sat, 11 Apr 2009 03:31:59 +0000 (03:31 +0000)
git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@10 9efb377e-2e27-44b9-b91a-ec4abb80ed8b

BamAlignment.h

index e876f9be2fddcfe68e68e589facbe9196863d6b4..644f875b6a357ce754aacfa8a67b742a1127ff3f 100644 (file)
-// BamAlignment.h\r
-\r
-// Derek Barnett\r
-// Marth Lab, Boston College\r
-// Last modified: 20 March 2009\r
-\r
-#ifndef BAMALIGNMENT_H\r
-#define BAMALIGNMENT_H\r
-\r
-#ifdef WIN32\r
-typedef char                 int8_t;\r
-typedef unsigned char       uint8_t;\r
-typedef short               int16_t;\r
-typedef unsigned short     uint16_t;\r
-typedef int                 int32_t;\r
-typedef unsigned int       uint32_t;\r
-typedef long long           int64_t;\r
-typedef unsigned long long uint64_t;\r
-#else\r
-#include <stdint.h>\r
-#endif\r
-\r
-// C++ includes\r
-#include <string>\r
-using std::string;\r
-\r
-#include <vector>\r
-using std::vector;\r
-\r
-struct CigarOp {\r
-       uint32_t Length;\r
-       char     Type;\r
-};\r
-\r
-struct RefData {\r
-  string       RefName;\r
-  unsigned int RefLength;\r
-  bool         RefHasAlignments;\r
-\r
-  // constructor\r
-  RefData(void)\r
-  : RefLength(0)\r
-  , RefHasAlignments(false)\r
-  { }\r
-};\r
-\r
-typedef vector<RefData> RefVector;\r
-\r
-struct BamAlignment {\r
-\r
-    // queries against alignment flag - see below for further detail\r
-    public:\r
-        bool IsPaired(void) const            { return ( (AlignmentFlag & PAIRED)        != 0 ); }\r
-        bool IsProperPair(void) const        { return ( (AlignmentFlag & PROPER_PAIR)   != 0 ); }\r
-        bool IsMapped(void) const            { return ( (AlignmentFlag & UNMAPPED)      == 0 ); }\r
-        bool IsMateMapped(void) const        { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }\r
-        bool IsReverseStrand(void) const     { return ( (AlignmentFlag & REVERSE)       != 0 ); }\r
-        bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE)  != 0 ); }\r
-        bool IsFirstMate(void) const         { return ( (AlignmentFlag & READ_1)        != 0 ); }\r
-        bool IsSecondMate(void) const        { return ( (AlignmentFlag & READ_2)        != 0 ); }\r
-        bool IsPrimaryAlignment(void) const  { return ( (AlignmentFlag & SECONDARY)     == 0 ); }\r
-        bool IsFailedQC(void) const          { return ( (AlignmentFlag & QC_FAILED)     != 0 ); }\r
-        bool IsDuplicate(void) const         { return ( (AlignmentFlag & DUPLICATE)     != 0 ); }\r
-\r
-    // data members\r
-    public:\r
-        string       Name;           // read name\r
-        unsigned int Length;         // query length\r
-        string       QueryBases;     // original sequence ( produced from machine )\r
-        string       AlignedBases;   // aligned sequence ( with indels ) \r
-        string       Qualities;      // FASTQ qualities ( still in ASCII characters )\r
-               vector<string> Tags;\r
-        unsigned int RefID;          // ID for reference sequence\r
-        unsigned int Position;       // position on reference sequence where alignment starts\r
-        unsigned int Bin;            // bin in BAM file where this alignment resides\r
-        unsigned int MapQuality;     // mapping quality \r
-        unsigned int AlignmentFlag;  // see above for available queries\r
-               vector<CigarOp> CigarData;   // vector of CIGAR operations (length & type) )\r
-        unsigned int MateRefID;      // ID for reference sequence that mate was aligned to\r
-        unsigned int MatePosition;   // position that mate was aligned to\r
-        unsigned int InsertSize;     // mate pair insert size\r
-               \r
-\r
-    // alignment flag query constants\r
-    private:\r
-        enum { PAIRED        = 1,              // Alignment comes from paired-end data\r
-               PROPER_PAIR   = 2,              // Alignment passed paired-end resolution\r
-               UNMAPPED      = 4,              // Read is unmapped\r
-               MATE_UNMAPPED = 8,              // Mate is unmapped\r
-               REVERSE       = 16,             // Read is on reverse strand\r
-               MATE_REVERSE  = 32,             // Mate is on reverse strand\r
-               READ_1        = 64,             // This alignment is mate 1 of pair\r
-               READ_2        = 128,            // This alignment is mate 2 of pair\r
-               SECONDARY     = 256,            // This alignment is not the primary (best) alignment for read\r
-               QC_FAILED     = 512,            // Read did not pass prior quality control steps\r
-               DUPLICATE     = 1024            // Read is PCR duplicate\r
-        };\r
-};\r
-\r
-// commonly used vector in this library\r
-typedef vector< BamAlignment > BamAlignmentVector;\r
-\r
-#endif /* BAMALIGNMENT_H */\r
+// BamAlignment.h
+
+// Derek Barnett
+// Marth Lab, Boston College
+// Last modified: 20 March 2009
+
+#ifndef BAMALIGNMENT_H
+#define BAMALIGNMENT_H
+
+// Fixed-width integer names: WIN32 builds declare them by hand below,
+// every other platform takes them from <stdint.h>.
+// NOTE(review): int8_t is mapped to plain char here; whether plain char is
+// signed depends on the compiler and its settings - confirm this is intended.
+#ifdef WIN32
+typedef char                 int8_t;
+typedef unsigned char       uint8_t;
+typedef short               int16_t;
+typedef unsigned short     uint16_t;
+typedef int                 int32_t;
+typedef unsigned int       uint32_t;
+typedef long long           int64_t;
+typedef unsigned long long uint64_t;
+#else
+#include <stdint.h>
+#endif
+
+// C++ includes
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+using std::string;
+
+#include <vector>
+using std::vector;
+
+// One CIGAR operation: an operation type together with its length.
+struct CigarOp {
+       uint32_t Length;   // length of the operation
+       char     Type;     // operation type, stored as a single character
+};
+
+// Description of one reference sequence.
+struct RefData {
+       string       RefName;            // name of the reference sequence
+       unsigned int RefLength;          // length of the reference sequence
+       bool         RefHasAlignments;   // flag, false until set by the owner of this entry
+
+       // Creates an entry with an empty name, a length of zero and the
+       // "has alignments" flag cleared.
+       RefData()
+               : RefName()
+               , RefLength(0)
+               , RefHasAlignments(false)
+       {
+       }
+};
+
+// vector of reference sequence descriptions
+typedef vector<RefData> RefVector;
+
+// A single alignment record: the read itself, where it was placed on the
+// reference, its mate information and the raw tag data, plus helpers that
+// query the alignment flag and extract the read group from the tags.
+struct BamAlignment {
+
+       // queries against alignment flag - each one tests a single bit of
+       // AlignmentFlag (bit values are listed in the private enum below)
+public:
+       bool IsPaired(void) const            { return ( (AlignmentFlag & PAIRED)        != 0 ); }
+       bool IsProperPair(void) const        { return ( (AlignmentFlag & PROPER_PAIR)   != 0 ); }
+       bool IsMapped(void) const            { return ( (AlignmentFlag & UNMAPPED)      == 0 ); }
+       bool IsMateMapped(void) const        { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
+       bool IsReverseStrand(void) const     { return ( (AlignmentFlag & REVERSE)       != 0 ); }
+       bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE)  != 0 ); }
+       bool IsFirstMate(void) const         { return ( (AlignmentFlag & READ_1)        != 0 ); }
+       bool IsSecondMate(void) const        { return ( (AlignmentFlag & READ_2)        != 0 ); }
+       bool IsPrimaryAlignment(void) const  { return ( (AlignmentFlag & SECONDARY)     == 0 ); }
+       bool IsFailedQC(void) const          { return ( (AlignmentFlag & QC_FAILED)     != 0 ); }
+       bool IsDuplicate(void) const         { return ( (AlignmentFlag & DUPLICATE)     != 0 ); }
+
+       // Looks up the read group ("RG") tag in TagData.
+       //
+       // TagData is walked as a sequence of entries, each made of a 2-byte tag
+       // name, a 1-byte storage type and a value whose size depends on that
+       // storage type (see SkipToNextTag).
+       //
+       // readGroup: receives the value of the RG tag; left untouched when the
+       //            tag is not found.
+       // returns:   true if an RG tag was found, false otherwise (this
+       //            includes empty tag data and data that ends in the middle
+       //            of an entry).
+       // NOTE: an unknown storage type in front of the RG tag terminates the
+       //       process, see SkipToNextTag.
+       bool GetReadGroup(string& readGroup) const {
+
+               const unsigned int tagDataLen = TagData.size();
+
+               // SkipToNextTag takes a non-const pointer; the data is only read here
+               char* const pTagBegin = const_cast<char*>(TagData.data());
+               unsigned int numBytesParsed = 0;
+
+               // every entry starts with 3 header bytes (2-byte name + storage type);
+               // stop as soon as fewer than that are left
+               while(numBytesParsed + 3 <= tagDataLen) {
+
+                       const char* pTagType      = pTagBegin + numBytesParsed;
+                       const char tagStorageType = pTagType[2];
+                       numBytesParsed += 3;
+
+                       // check the current tag
+                       if(pTagType[0] == 'R' && pTagType[1] == 'G') {
+
+                               // the value runs up to its NUL terminator, but never
+                               // past the end of the tag data
+                               const char* pValue           = pTagBegin + numBytesParsed;
+                               const unsigned int remaining = tagDataLen - numBytesParsed;
+                               const char* pValueEnd        = static_cast<const char*>(std::memchr(pValue, '\0', remaining));
+                               const unsigned int readGroupLen = pValueEnd ? static_cast<unsigned int>(pValueEnd - pValue) : remaining;
+
+                               readGroup.assign(pValue, readGroupLen);
+                               return true;
+                       }
+
+                       // not the read group: step over the value of this tag
+                       char* pTagValue = pTagBegin + numBytesParsed;
+                       SkipToNextTag(tagStorageType, pTagValue, numBytesParsed);
+               }
+
+               // the read group tag was not present
+               return false;
+       }
+
+       // Steps over the value of one tag so that the next tag header follows.
+       //
+       // storageType:    storage type byte of the tag whose value is skipped
+       // pTagData:       on entry points at the first byte of the value; on
+       //                 return points at the first byte after the value
+       // numBytesParsed: advanced by the same number of bytes as pTagData
+       //
+       // Value sizes: 'A','c','C' = 1 byte; 's','S' = 2 bytes;
+       // 'i','I','f' = 4 bytes; 'Z','H' = NUL-terminated text, the terminator
+       // is skipped as well.
+       //
+       // This function has no knowledge of the buffer size: for 'Z' and 'H'
+       // it relies on a NUL terminator being present.
+       // An unknown storage type prints an error and terminates the process.
+       static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
+
+               unsigned int numValueBytes = 0;
+
+               switch(storageType) {
+                       case 'A':
+                       case 'c':
+                       case 'C':
+                               numValueBytes = 1;
+                               break;
+                       case 's':
+                       case 'S':
+                               numValueBytes = 2;
+                               break;
+                       case 'i':
+                       case 'I':
+                       case 'f':
+                               // 'f' is a 32-bit float, same width as the 32-bit integers
+                               numValueBytes = 4;
+                               break;
+                       case 'Z':
+                       case 'H':
+                               // + 1 consumes the NUL terminator, otherwise the next
+                               // tag header would start one byte too early
+                               numValueBytes = static_cast<unsigned int>(std::strlen(pTagData)) + 1;
+                               break;
+                       default:
+                               std::printf("ERROR: Unknown tag storage class encountered: [%c]\n", storageType);
+                               std::exit(1);
+               }
+
+               numBytesParsed += numValueBytes;
+               pTagData       += numValueBytes;
+       }
+
+       // data members
+public:
+       string       Name;           // read name
+       unsigned int Length;         // query length
+       string       QueryBases;     // original sequence ( produced from machine )
+       string       AlignedBases;   // aligned sequence ( with indels )
+       string       Qualities;      // FASTQ qualities ( still in ASCII characters )
+       string       TagData;        // contains the tag data (accessor methods will pull the requested information out)
+       unsigned int RefID;          // ID for reference sequence
+       unsigned int Position;       // position on reference sequence where alignment starts
+       unsigned int Bin;            // bin in BAM file where this alignment resides
+       unsigned int MapQuality;     // mapping quality
+       unsigned int AlignmentFlag;  // see above for available queries
+       vector<CigarOp> CigarData;   // vector of CIGAR operations ( length & type )
+       unsigned int MateRefID;      // ID for reference sequence that mate was aligned to
+       unsigned int MatePosition;   // position that mate was aligned to
+       unsigned int InsertSize;     // mate pair insert size
+
+       // alignment flag query constants
+private:
+       enum { PAIRED        = 1,               // Alignment comes from paired-end data
+               PROPER_PAIR   = 2,              // Alignment passed paired-end resolution
+               UNMAPPED      = 4,              // Read is unmapped
+               MATE_UNMAPPED = 8,              // Mate is unmapped
+               REVERSE       = 16,             // Read is on reverse strand
+               MATE_REVERSE  = 32,             // Mate is on reverse strand
+               READ_1        = 64,             // This alignment is mate 1 of pair
+               READ_2        = 128,            // This alignment is mate 2 of pair
+               SECONDARY     = 256,            // This alignment is not the primary (best) alignment for read
+               QC_FAILED     = 512,            // Read did not pass prior quality control steps
+               DUPLICATE     = 1024            // Read is PCR duplicate
+       };
+};
+
+// commonly used vector in this library
+typedef vector< BamAlignment > BamAlignmentVector;
+
+#endif /* BAMALIGNMENT_H */