git.donarmstrong.com Git - bamtools.git/blobdiff - BamMultiReader.cpp
further cleanup of duplicate @RG tag warning reporting
[bamtools.git] / BamMultiReader.cpp
index 2d8580d1d96158d0afe133e87d57770503b0a22d..51372c3feef1d3bf8a73d4c93ade22b227411a3e 100644 (file)
@@ -204,12 +204,14 @@ bool BamMultiReader::CreateIndexes(void) {
 const string BamMultiReader::GetHeaderText(void) const {
 
     string mergedHeader = "";
+    map<string, bool> readGroups;
 
     // foreach extraction entry (each BAM file)
-    bool isFirstTime = true;
-    for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) {
+    for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) {
 
-        BamReader* reader = it->first;
+        map<string, bool> currentFileReadGroups;
+
+        BamReader* reader = rs->first;
 
         stringstream header(reader->GetHeaderText());
         vector<string> lines;
@@ -224,23 +226,41 @@ const string BamMultiReader::GetHeaderText(void) const {
             if ( headerLine.empty() ) { continue; }
 
             // if first file, save HD & SQ entries
-            if ( isFirstTime ) {
+            if ( rs == readers.begin() ) {
                 if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
                     mergedHeader.append(headerLine.c_str());
                     mergedHeader.append(1, '\n');
                 }
             }
 
-            // (for all files) append RG entries
+            // (for all files) append RG entries if they are unique
             if ( headerLine.find("@RG") == 0 ) {
-                mergedHeader.append(headerLine.c_str() );
-                mergedHeader.append(1, '\n');
+                stringstream headerLineSs(headerLine);
+                string part, readGroupPart, readGroup;
+                while(std::getline(headerLineSs, part, '\t')) {
+                    stringstream partSs(part);
+                    string subtag;
+                    std::getline(partSs, subtag, ':');
+                    if (subtag == "ID") {
+                        std::getline(partSs, readGroup, ':');
+                        break;
+                    }
+                }
+                if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries
+                    mergedHeader.append(headerLine.c_str() );
+                    mergedHeader.append(1, '\n');
+                    readGroups[readGroup] = true;
+                    currentFileReadGroups[readGroup] = true;
+                } else {
+                    // warn iff we are reading one file and discover duplicated @RG tags in the header
+                    // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
+                    if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) {
+                        cerr << "WARNING: duplicate @RG tag " << readGroup 
+                            << " entry in header of " << reader->GetFilename() << endl;
+                    }
+                }
             }
-
         }
-
-        // set iteration flag
-        isFirstTime = false;
     }
 
     // return merged header text