]> git.donarmstrong.com Git - bamtools.git/blobdiff - BamMultiReader.cpp
integration of SetRegion into BamMultiReader
[bamtools.git] / BamMultiReader.cpp
index 2d8580d1d96158d0afe133e87d57770503b0a22d..733087e3f089f9be74cc6e5e2160763eb43dbec1 100644 (file)
@@ -80,7 +80,7 @@ bool BamMultiReader::HasOpenReaders() {
     return alignments.size() > 0;
 }
 
-// get next alignment among all files (from specified region, if given)
+// get next alignment among all files
 bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
 
     // bail out if we are at EOF in all files, means no more alignments to process
@@ -92,24 +92,59 @@ bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
     UpdateReferenceID();
 
     // our lowest alignment and reader will be at the front of our alignment index
-    BamAlignment* lowestAlignment = alignments.begin()->second.second;
-    BamReader* lowestReader = alignments.begin()->second.first;
+    BamAlignment* alignment = alignments.begin()->second.second;
+    BamReader* reader = alignments.begin()->second.first;
 
     // now that we have the lowest alignment in the set, save it by copy to our argument
-    nextAlignment = BamAlignment(*lowestAlignment);
+    nextAlignment = BamAlignment(*alignment);
 
     // remove this alignment index entry from our alignment index
     alignments.erase(alignments.begin());
 
     // and add another entry if we can get another alignment from the reader
-    if (lowestReader->GetNextAlignment(*lowestAlignment)) {
-        alignments.insert(make_pair(make_pair(lowestAlignment->RefID, lowestAlignment->Position), 
-                                    make_pair(lowestReader, lowestAlignment)));
+    if (reader->GetNextAlignment(*alignment)) {
+        alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+                                    make_pair(reader, alignment)));
     } else { // do nothing
         //cerr << "reached end of file " << lowestReader->GetFilename() << endl;
     }
 
     return true;
+
+}
+
+// get next alignment among all files without parsing character data from alignments
+bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) {
+
+    // bail out if we are at EOF in all files, means no more alignments to process
+    if (!HasOpenReaders())
+        return false;
+
+    // when all alignments have stepped into a new target sequence, update our
+    // current reference sequence id
+    UpdateReferenceID();
+
+    // our lowest alignment and reader will be at the front of our alignment index
+    BamAlignment* alignment = alignments.begin()->second.second;
+    BamReader* reader = alignments.begin()->second.first;
+
+    // now that we have the lowest alignment in the set, save it by copy to our argument
+    nextAlignment = BamAlignment(*alignment);
+    //memcpy(&nextAlignment, alignment, sizeof(BamAlignment));
+
+    // remove this alignment index entry from our alignment index
+    alignments.erase(alignments.begin());
+
+    // and add another entry if we can get another alignment from the reader
+    if (reader->GetNextAlignmentCore(*alignment)) {
+        alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), 
+                                    make_pair(reader, alignment)));
+    } else { // do nothing
+        //cerr << "reached end of file " << lowestReader->GetFilename() << endl;
+    }
+
+    return true;
+
 }
 
 // jumps to specified region(refID, leftBound) in BAM files, returns success/fail
@@ -128,25 +163,55 @@ bool BamMultiReader::Jump(int refID, int position) {
             exit(1);
         }
     }
-    if (result) {
-        // Update Alignments
-        alignments.clear();
-        for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
-            BamReader* br = it->first;
-            BamAlignment* ba = it->second;
-            if (br->GetNextAlignment(*ba)) {
-                alignments.insert(make_pair(make_pair(ba->RefID, ba->Position), 
-                                            make_pair(br, ba)));
-            } else {
-                // assume BamReader EOF
-            }
+    if (result) UpdateAlignments();
+    return result;
+}
+
+bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) {
+
+    BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition);
+
+    return SetRegion(region);
+
+}
+
+bool BamMultiReader::SetRegion(const BamRegion& region) {
+
+    Region = region;
+
+    bool result = true;
+    for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
+        BamReader* reader = it->first;
+        result &= reader->SetRegion(region);
+        if (!result) {
+            cerr << "ERROR: could not set region " << reader->GetFilename() << " to " 
+                << region.LeftRefID << ":" << region.LeftPosition << ".." << region.RightRefID << ":" << region.RightPosition
+                << endl;
+            exit(1);
         }
     }
+    if (result) UpdateAlignments();
     return result;
+
+}
+
+void BamMultiReader::UpdateAlignments(void) {
+    // Update Alignments
+    alignments.clear();
+    for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
+        BamReader* br = it->first;
+        BamAlignment* ba = it->second;
+        if (br->GetNextAlignment(*ba)) {
+            alignments.insert(make_pair(make_pair(ba->RefID, ba->Position), 
+                                        make_pair(br, ba)));
+        } else {
+            // assume BamReader end of region / EOF
+        }
+    }
 }
 
 // opens BAM files
-void BamMultiReader::Open(const vector<string> filenames, bool openIndexes) {
+void BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool coreMode) {
     // for filename in filenames
     fileNames = filenames; // save filenames in our multireader
     for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
@@ -158,7 +223,11 @@ void BamMultiReader::Open(const vector<string> filenames, bool openIndexes) {
             reader->Open(filename); // for merging, jumping is disallowed
         }
         BamAlignment* alignment = new BamAlignment;
-        reader->GetNextAlignment(*alignment);
+        if (coreMode) {
+            reader->GetNextAlignmentCore(*alignment);
+        } else {
+            reader->GetNextAlignment(*alignment);
+        }
         readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup
         alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
                                     make_pair(reader, alignment)));
@@ -204,12 +273,14 @@ bool BamMultiReader::CreateIndexes(void) {
 const string BamMultiReader::GetHeaderText(void) const {
 
     string mergedHeader = "";
+    map<string, bool> readGroups;
 
     // foreach extraction entry (each BAM file)
-    bool isFirstTime = true;
-    for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) {
+    for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) {
 
-        BamReader* reader = it->first;
+        map<string, bool> currentFileReadGroups;
+
+        BamReader* reader = rs->first;
 
         stringstream header(reader->GetHeaderText());
         vector<string> lines;
@@ -224,23 +295,41 @@ const string BamMultiReader::GetHeaderText(void) const {
             if ( headerLine.empty() ) { continue; }
 
             // if first file, save HD & SQ entries
-            if ( isFirstTime ) {
+            if ( rs == readers.begin() ) {
                 if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
                     mergedHeader.append(headerLine.c_str());
                     mergedHeader.append(1, '\n');
                 }
             }
 
-            // (for all files) append RG entries
+            // (for all files) append RG entries if they are unique
             if ( headerLine.find("@RG") == 0 ) {
-                mergedHeader.append(headerLine.c_str() );
-                mergedHeader.append(1, '\n');
+                stringstream headerLineSs(headerLine);
+                string part, readGroupPart, readGroup;
+                while(std::getline(headerLineSs, part, '\t')) {
+                    stringstream partSs(part);
+                    string subtag;
+                    std::getline(partSs, subtag, ':');
+                    if (subtag == "ID") {
+                        std::getline(partSs, readGroup, ':');
+                        break;
+                    }
+                }
+                if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries
+                    mergedHeader.append(headerLine.c_str() );
+                    mergedHeader.append(1, '\n');
+                    readGroups[readGroup] = true;
+                    currentFileReadGroups[readGroup] = true;
+                } else {
+                    // warn iff we are reading one file and discover duplicated @RG tags in the header
+                    // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
+                    if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) {
+                        cerr << "WARNING: duplicate @RG tag " << readGroup 
+                            << " entry in header of " << reader->GetFilename() << endl;
+                    }
+                }
             }
-
         }
-
-        // set iteration flag
-        isFirstTime = false;
     }
 
     // return merged header text