]> git.donarmstrong.com Git - bamtools.git/blobdiff - BamReader.cpp
change merger to use GetNextAlignmentCore
[bamtools.git] / BamReader.cpp
index a2f975f9659032bed75b5cb280b12a97e97b94df..e5265e0ca0446318ee9e3955a4db2defbe3aefa4 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College\r
 // All rights reserved.\r
 // ---------------------------------------------------------------------------\r
-// Last modified: 14 April 2010 (DB)\r
+// Last modified: 8 June 2010 (DB)\r
 // ---------------------------------------------------------------------------\r
 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
 // Institute.\r
 using namespace BamTools;\r
 using namespace std;\r
 \r
-namespace BamTools {\r
-  struct BamAlignmentSupportData {\r
-      string   AllCharData;\r
-      uint32_t BlockLength;\r
-      uint32_t NumCigarOperations;\r
-      uint32_t QueryNameLength;\r
-      uint32_t QuerySequenceLength;\r
-  };\r
-} // namespace BamTools\r
-\r
 struct BamReader::BamReaderPrivate {\r
 \r
     // -------------------------------\r
@@ -48,7 +38,7 @@ struct BamReader::BamReaderPrivate {
     int64_t   AlignmentsBeginOffset;\r
     string    Filename;\r
     string    IndexFilename;\r
-    
+    \r
     // system data\r
     bool IsBigEndian;\r
 \r
@@ -71,7 +61,7 @@ struct BamReader::BamReaderPrivate {
     // "public" interface\r
     // -------------------------------\r
 \r
-    // flie operations\r
+    // file operations\r
     void Close(void);\r
     bool Jump(int refID, int position = 0);\r
     void Open(const string& filename, const string& indexFilename = "");\r
@@ -79,6 +69,7 @@ struct BamReader::BamReaderPrivate {
 \r
     // access alignment data\r
     bool GetNextAlignment(BamAlignment& bAlignment);\r
+    bool GetNextAlignmentCore(BamAlignment& bAlignment);\r
 \r
     // access auxiliary data\r
     int GetReferenceID(const string& refName) const;\r
@@ -95,7 +86,7 @@ struct BamReader::BamReaderPrivate {
     // calculate bins that overlap region ( left to reference end for now )\r
     int BinsFromRegion(int refID, int left, uint16_t[MAX_BIN]);\r
     // fills out character data for BamAlignment data\r
-    bool BuildCharData(BamAlignment& bAlignment, const BamAlignmentSupportData& supportData);\r
+    bool BuildCharData(BamAlignment& bAlignment);\r
     // calculate file offset for first alignment chunk overlapping 'left'\r
     int64_t GetOffset(int refID, int left);\r
     // checks to see if alignment overlaps current region\r
@@ -103,7 +94,7 @@ struct BamReader::BamReaderPrivate {
     // retrieves header text from BAM file\r
     void LoadHeaderData(void);\r
     // retrieves BAM alignment under file pointer\r
-    bool LoadNextAlignment(BamAlignment& bAlignment, BamAlignmentSupportData& supportData);\r
+    bool LoadNextAlignment(BamAlignment& bAlignment);\r
     // builds reference data structure from BAM file\r
     void LoadReferenceData(void);\r
 \r
@@ -148,12 +139,14 @@ bool BamReader::Rewind(void) { return d->Rewind(); }
 \r
 // access alignment data\r
 bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); }\r
+bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); }\r
 \r
 // access auxiliary data\r
 const string BamReader::GetHeaderText(void) const { return d->HeaderText; }\r
 int BamReader::GetReferenceCount(void) const { return d->References.size(); }\r
 const RefVector BamReader::GetReferenceData(void) const { return d->References; }\r
 int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }\r
+const std::string BamReader::GetFilename(void) const { return d->Filename; }\r
 \r
 // index operations\r
 bool BamReader::CreateIndex(void) { return d->CreateIndex(); }\r
@@ -203,40 +196,40 @@ int BamReader::BamReaderPrivate::BinsFromRegion(int refID, int left, uint16_t li
     return i;\r
 }\r
 \r
-bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment, const BamAlignmentSupportData& supportData) {\r
+bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {\r
   \r
     // calculate character lengths/offsets\r
-    const unsigned int dataLength     = supportData.BlockLength - BAM_CORE_SIZE;\r
-    const unsigned int seqDataOffset  = supportData.QueryNameLength + (supportData.NumCigarOperations * 4);\r
-    const unsigned int qualDataOffset = seqDataOffset + (supportData.QuerySequenceLength+1)/2;\r
-    const unsigned int tagDataOffset  = qualDataOffset + supportData.QuerySequenceLength;\r
+    const unsigned int dataLength     = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
+    const unsigned int seqDataOffset  = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);\r
+    const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;\r
+    const unsigned int tagDataOffset  = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;\r
     const unsigned int tagDataLength  = dataLength - tagDataOffset;\r
       \r
     // set up char buffers\r
-    const char* allCharData = supportData.AllCharData.data();\r
+    const char* allCharData = bAlignment.SupportData.AllCharData.data();\r
     const char* seqData     = ((const char*)allCharData) + seqDataOffset;\r
     const char* qualData    = ((const char*)allCharData) + qualDataOffset;\r
     char* tagData     = ((char*)allCharData) + tagDataOffset;\r
   \r
     // save query sequence\r
     bAlignment.QueryBases.clear();\r
-    bAlignment.QueryBases.reserve(supportData.QuerySequenceLength);\r
-    for (unsigned int i = 0; i < supportData.QuerySequenceLength; ++i) {\r
+    bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+    for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
         char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
         bAlignment.QueryBases.append(1, singleBase);\r
     }\r
   \r
     // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character\r
     bAlignment.Qualities.clear();\r
-    bAlignment.Qualities.reserve(supportData.QuerySequenceLength);\r
-    for (unsigned int i = 0; i < supportData.QuerySequenceLength; ++i) {\r
+    bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+    for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
         char singleQuality = (char)(qualData[i]+33);\r
         bAlignment.Qualities.append(1, singleQuality);\r
     }\r
     \r
     // parse CIGAR to build 'AlignedBases'\r
     bAlignment.AlignedBases.clear();\r
-    bAlignment.AlignedBases.reserve(supportData.QuerySequenceLength);\r
+    bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
     \r
     int k = 0;\r
     vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
@@ -330,6 +323,9 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment, const
     bAlignment.TagData.resize(tagDataLength);\r
     memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);\r
     \r
+    // set support data parsed flag\r
+    bAlignment.SupportData.IsParsed = true;\r
+    \r
     // return success\r
     return true;\r
 }\r
@@ -511,30 +507,56 @@ int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {
 // get next alignment (from specified region, if given)\r
 bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {\r
 \r
-    BamAlignmentSupportData supportData;\r
-  \r
     // if valid alignment available\r
-    if ( LoadNextAlignment(bAlignment, supportData) ) {\r
+    if ( LoadNextAlignment(bAlignment) ) {\r
 \r
         // if region not specified, return success\r
         if ( !IsRegionSpecified ) { \r
-          bool ok = BuildCharData(bAlignment, supportData);\r
+          bool ok = BuildCharData(bAlignment);\r
           return ok; \r
         }\r
 \r
         // load next alignment until region overlap is found\r
         while ( !IsOverlap(bAlignment) ) {\r
             // if no valid alignment available (likely EOF) return failure\r
-            if ( !LoadNextAlignment(bAlignment, supportData) ) { return false; }\r
+            if ( !LoadNextAlignment(bAlignment) ) return false;\r
         }\r
 \r
         // return success (alignment found that overlaps region)\r
-        bool ok = BuildCharData(bAlignment, supportData);\r
+        bool ok = BuildCharData(bAlignment);\r
         return ok;\r
     }\r
 \r
     // no valid alignment\r
-    else { return false; }\r
+    else \r
+        return false;\r
+}\r
+\r
+// retrieves next available alignment core data (returns success/fail)\r
+// ** DOES NOT parse any character data (bases, qualities, tag data)\r
+//    these can be accessed, if necessary, from the supportData \r
+// useful for operations requiring ONLY positional or other alignment-related information\r
+bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {\r
+\r
+    // if valid alignment available\r
+    if ( LoadNextAlignment(bAlignment) ) {\r
+\r
+        // if region not specified, return success\r
+        if ( !IsRegionSpecified ) return true;\r
+\r
+        // load next alignment until region overlap is found\r
+        while ( !IsOverlap(bAlignment) ) {\r
+            // if no valid alignment available (likely EOF) return failure\r
+            if ( !LoadNextAlignment(bAlignment) ) return false;\r
+        }\r
+\r
+        // return success (alignment found that overlaps region)\r
+        return true;\r
+    }\r
+\r
+    // no valid alignment\r
+    else\r
+        return false;\r
 }\r
 \r
 // calculate closest indexed file offset for region specified\r
@@ -614,7 +636,7 @@ void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets
                                                      const uint64_t&     lastOffset)\r
 {\r
     // get converted offsets\r
-    int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
+    int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;\r
     int endOffset   = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;\r
 \r
     // resize vector if necessary\r
@@ -639,7 +661,7 @@ bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {
     // read starts after left boundary\r
     if ( bAlignment.Position >= CurrentLeft) { return true; }\r
 \r
-    // return whether alignment end overlaps left boundary
+    // return whether alignment end overlaps left boundary\r
     return ( bAlignment.GetEndPosition() >= CurrentLeft );\r
 }\r
 \r
@@ -826,14 +848,14 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) {
 }\r
 \r
 // populates BamAlignment with alignment data under file pointer, returns success/fail\r
-bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment, BamAlignmentSupportData& supportData) {\r
+bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {\r
 \r
     // read in the 'block length' value, make sure it's not zero\r
     char buffer[4];\r
     mBGZF.Read(buffer, 4);\r
-    supportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
-    if ( IsBigEndian ) { SwapEndian_32(supportData.BlockLength); }\r
-    if ( supportData.BlockLength == 0 ) { return false; }\r
+    bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
+    if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }\r
+    if ( bAlignment.SupportData.BlockLength == 0 ) { return false; }\r
 \r
     // read in core alignment data, make sure the right size of data was read\r
     char x[BAM_CORE_SIZE];\r
@@ -852,20 +874,20 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment, Ba
     unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);\r
     bAlignment.Bin        = tempValue >> 16;\r
     bAlignment.MapQuality = tempValue >> 8 & 0xff;\r
-    supportData.QueryNameLength = tempValue & 0xff;\r
+    bAlignment.SupportData.QueryNameLength = tempValue & 0xff;\r
 \r
     tempValue = BgzfData::UnpackUnsignedInt(&x[12]);\r
     bAlignment.AlignmentFlag = tempValue >> 16;\r
-    supportData.NumCigarOperations = tempValue & 0xffff;\r
+    bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;\r
 \r
-    supportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
+    bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
     bAlignment.MateRefID    = BgzfData::UnpackSignedInt(&x[20]);\r
     bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
     bAlignment.InsertSize   = BgzfData::UnpackSignedInt(&x[28]);\r
     \r
     // store 'all char data' and cigar ops\r
-    const unsigned int dataLength      = supportData.BlockLength - BAM_CORE_SIZE;\r
-    const unsigned int cigarDataOffset = supportData.QueryNameLength;\r
+    const unsigned int dataLength      = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
+    const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;\r
     \r
     char*     allCharData = (char*)calloc(sizeof(char), dataLength);\r
     uint32_t* cigarData   = (uint32_t*)(allCharData + cigarDataOffset);\r
@@ -876,14 +898,14 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment, Ba
      \r
         // store alignment name and length\r
         bAlignment.Name.assign((const char*)(allCharData));\r
-        bAlignment.Length = supportData.QuerySequenceLength;\r
+        bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;\r
       \r
         // store remaining 'allCharData' in supportData structure\r
-        supportData.AllCharData.assign((const char*)allCharData, dataLength);\r
+        bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);\r
         \r
         // save CigarOps for BamAlignment\r
         bAlignment.CigarData.clear();\r
-        for (unsigned int i = 0; i < supportData.NumCigarOperations; ++i) {\r
+        for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {\r
 \r
             // swap if necessary\r
             if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r