]> git.donarmstrong.com Git - bamtools.git/commitdiff
Added unique-alignment checks for ResolveTool
authorderek <derekwbarnett@gmail.com>
Tue, 28 Jun 2011 16:31:25 +0000 (12:31 -0400)
committerderek <derekwbarnett@gmail.com>
Tue, 28 Jun 2011 16:31:25 +0000 (12:31 -0400)
 * Unique-ness determined by comparing MapQuality to 0
 * Only pairs with both mates unique are used for the 'makeStats' median
fragment size calculation.

src/toolkit/bamtools_resolve.cpp
src/toolkit/bamtools_resolve.h

index 97eefcf18c8c0f1354436eaa75a975d267dce0c4..a873d5f4b7870ed14a840d846a45058d7ca0d5f3 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 14 June 2011
+// Last modified: 28 June 2011
 // ---------------------------------------------------------------------------
 // Resolves paired-end reads (marking the IsProperPair flag as needed).
 // ***************************************************************************
@@ -19,6 +19,7 @@ using namespace BamTools;
 #include <algorithm>
 #include <cassert>
 #include <cctype>
+#include <cstdio>
 #include <cstdlib>
 #include <fstream>
 #include <iostream>
@@ -71,6 +72,13 @@ static const string OPTION_FORCEMARKREADGROUPS  = "ForceMarkReadGroups";
 static const string RG_FIELD_DESCRIPTION =
     "#<name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?>";
 
+// --------------------------------------------------------------------------
+// unique readname file constants
+// --------------------------------------------------------------------------
+
+static const string READNAME_FILE_SUFFIX = ".uniq_names.txt";
+static const string DEFAULT_READNAME_FILE = "bt_resolve_TEMP" + READNAME_FILE_SUFFIX;
+
 // --------------------------------------------------------------------------
 // ModelType implementation
 
@@ -146,6 +154,7 @@ struct ReadGroupResolver {
     bool IsAmbiguous;
     bool HasData;
     vector<ModelType> Models;
+    map<string, bool> ReadNames;
 
     // ctor
     ReadGroupResolver(void);
@@ -292,6 +301,7 @@ struct ResolveTool::ResolveSettings {
     string InputBamFilename;
     string OutputBamFilename;
     string StatsFilename;
+    string ReadNamesFilename; //  ** N.B. - Only used internally, not set from cmdline **
 
     // 'resolve options'
     double ConfidenceInterval;
@@ -312,12 +322,121 @@ struct ResolveTool::ResolveSettings {
         , InputBamFilename(Options::StandardIn())
         , OutputBamFilename(Options::StandardOut())
         , StatsFilename("")
+        , ReadNamesFilename(DEFAULT_READNAME_FILE)
         , ConfidenceInterval(DEFAULT_CONFIDENCE_INTERVAL)
         , UnusedModelThreshold(DEFAULT_UNUSEDMODEL_THRESHOLD)
         , HasForceMarkReadGroups(false)
     { }
 };
 
+// --------------------------------------------------------------------------
+// ReadNamesFileReader implementation
+
+struct ResolveTool::ReadNamesFileReader {
+
+    // ctor & dtor
+    ReadNamesFileReader(void) { }
+    ~ReadNamesFileReader(void) { Close(); }
+
+    // main reader interface
+    public:
+        void Close(void);
+        bool Open(const string& filename);
+        bool Read(map<string, ReadGroupResolver>& readGroups);
+
+    // data members
+    private:
+        ifstream m_stream;
+};
+
+void ResolveTool::ReadNamesFileReader::Close(void) {
+    if ( m_stream.is_open() )
+        m_stream.close();
+}
+
+bool ResolveTool::ReadNamesFileReader::Open(const string& filename) {
+
+    // make sure stream is fresh
+    Close();
+
+    // attempt to open filename, return status
+    m_stream.open(filename.c_str(), ifstream::in);
+    return m_stream.good();
+}
+
+bool ResolveTool::ReadNamesFileReader::Read(map<string, ReadGroupResolver>& readGroups) {
+
+    // up-front sanity check
+    if ( !m_stream.is_open() ) return false;
+
+    // parse read names file
+    string line;
+    vector<string> fields;
+    map<string, ReadGroupResolver>::iterator rgIter;
+    map<string, ReadGroupResolver>::iterator rgEnd = readGroups.end();
+    while ( getline(m_stream, line) ) {
+
+        // skip if empty line
+        if ( line.empty() ) continue;
+
+        // split line on '\t'
+        fields = Utilities::Split(line, TAB_CHAR);
+        if ( fields.size() != 2 ) continue;
+
+        // look up resolver for read group
+        rgIter = readGroups.find( fields[0] );
+        if ( rgIter == rgEnd ) return false;
+        ReadGroupResolver& resolver = (*rgIter).second;
+
+        // store read name with resolver
+        resolver.ReadNames.insert( make_pair<string,bool>(fields[1], true) ) ;
+    }
+
+    // if here, return success
+    return true;
+}
+
+// --------------------------------------------------------------------------
+// ReadNamesFileWriter implementation
+
+struct ResolveTool::ReadNamesFileWriter {
+
+    // ctor & dtor
+    ReadNamesFileWriter(void) { }
+    ~ReadNamesFileWriter(void) { Close(); }
+
+    // main reader interface
+    public:
+        void Close(void);
+        bool Open(const string& filename);
+        void Write(const string& readGroupName, const string& readName);
+
+    // data members
+    private:
+        ofstream m_stream;
+};
+
+void ResolveTool::ReadNamesFileWriter::Close(void) {
+    if ( m_stream.is_open() )
+        m_stream.close();
+}
+
+bool ResolveTool::ReadNamesFileWriter::Open(const string& filename) {
+
+    // make sure stream is fresh
+    Close();
+
+    // attempt to open filename, return status
+    m_stream.open(filename.c_str(), ofstream::out);
+    return m_stream.good();
+}
+
+void ResolveTool::ReadNamesFileWriter::Write(const string& readGroupName,
+                                             const string& readName)
+{
+    m_stream << readGroupName << TAB_CHAR << readName << endl;
+}
+
 // --------------------------------------------------------------------------
 // StatsFileReader implementation
 
@@ -331,7 +450,7 @@ struct ResolveTool::StatsFileReader {
     // main reader interface
     public:
         void Close(void);
-        bool Open(const std::string& filename);
+        bool Open(const string& filename);
         bool Read(ResolveTool::ResolveSettings* settings,
                   map<string, ReadGroupResolver>& readGroups);
 
@@ -538,7 +657,7 @@ struct ResolveTool::StatsFileWriter {
     // main reader interface
     public:
         void Close(void);
-        bool Open(const std::string& filename);
+        bool Open(const string& filename);
         bool Write(ResolveTool::ResolveSettings* settings,
                    const map<string, ReadGroupResolver>& readGroups);
 
@@ -779,22 +898,32 @@ bool ResolveTool::ResolveToolPrivate::MakeStats(void) {
     ReadGroupResolver::SetUnusedModelThreshold(m_settings->UnusedModelThreshold);
 
     // open our BAM reader
-    BamReader reader;
-    if ( !reader.Open(m_settings->InputBamFilename) ) {
+    BamReader bamReader;
+    if ( !bamReader.Open(m_settings->InputBamFilename) ) {
         cerr << "bamtools resolve ERROR: could not open input BAM file: "
              << m_settings->InputBamFilename << endl;
         return false;
     }
 
     // retrieve header & parse for read groups
-    const SamHeader& header = reader.GetHeader();
+    const SamHeader& header = bamReader.GetHeader();
     ParseHeader(header);
 
+    // open ReadNamesFileWriter
+    ResolveTool::ReadNamesFileWriter readNamesWriter;
+    if ( !readNamesWriter.Open(m_settings->ReadNamesFilename) ) {
+        cerr << "bamtools resolve ERROR: could not open (temp) output read names file: "
+             << m_settings->ReadNamesFilename << endl;
+        bamReader.Close();
+        return false;
+    }
+
     // read through BAM file
     BamAlignment al;
     string readGroup("");
     map<string, ReadGroupResolver>::iterator rgIter;
-    while ( reader.GetNextAlignmentCore(al) ) {
+    map<string, bool>::iterator readNameIter;
+    while ( bamReader.GetNextAlignmentCore(al) ) {
 
         // skip if alignment is not paired, mapped, nor mate is mapped
         if ( !al.IsPaired() || !al.IsMapped() || !al.IsMateMapped() )
@@ -803,17 +932,6 @@ bool ResolveTool::ResolveToolPrivate::MakeStats(void) {
         // skip if alignment & mate not on same reference sequence
         if ( al.RefID != al.MateRefID ) continue;
 
-        // skip if map quality is 0
-        if ( al.MapQuality == 0 ) continue;
-
-        // skip if insert size is less than (we'll count its mate)
-        // or equal to zero (single-end or missing data)
-        if ( al.InsertSize <= 0 ) continue;
-
-        // determine model type, skip if model unknown
-        const uint16_t currentModelType = CalculateModelType(al);
-        assert( currentModelType != ModelType::DUMMY_ID );
-
         // flesh out the char data, so we can retrieve its read group ID
         al.BuildCharData();
 
@@ -826,15 +944,45 @@ bool ResolveTool::ResolveToolPrivate::MakeStats(void) {
         if ( rgIter == m_readGroups.end() )  {
             cerr << "bamtools resolve ERROR - unable to calculate stats, unknown read group encountered: "
                  << readGroup << endl;
-            reader.Close();
+            bamReader.Close();
             return false;
         }
         ReadGroupResolver& resolver = (*rgIter).second;
 
-        // update stats for current read group, current model type
-        resolver.Models[currentModelType].push_back(al.InsertSize);
+        // determine unique-ness of current alignment
+        const bool isCurrentMateUnique = ( al.MapQuality != 0 );
+
+        // look up read name
+        readNameIter = resolver.ReadNames.find(al.Name);
+
+        // if read name found (current alignment's mate already parsed)
+        if ( readNameIter != resolver.ReadNames.end() ) {
+
+            // if both unique mates are unique, store read name & insert size for later
+            const bool isStoredMateUnique  = (*readNameIter).second;
+            if ( isCurrentMateUnique && isStoredMateUnique ) {
+
+                // save read name in temp file as candidates for later pair marking
+                readNamesWriter.Write(readGroup, al.Name);
+
+                // determine model type & store fragment length for stats calculation
+                const uint16_t currentModelType = CalculateModelType(al);
+                assert( currentModelType != ModelType::DUMMY_ID );
+                resolver.Models[currentModelType].push_back( abs(al.InsertSize) );
+            }
+
+            // unique or not, remove read name from map
+            resolver.ReadNames.erase(readNameIter);
+        }
+
+        // if read name not found, store new entry
+        else resolver.ReadNames.insert( make_pair<string, bool>(al.Name, isCurrentMateUnique) );
     }
 
+    // close files
+    readNamesWriter.Close();
+    bamReader.Close();
+
     // iterate back through read groups
     map<string, ReadGroupResolver>::iterator rgEnd  = m_readGroups.end();
     for ( rgIter = m_readGroups.begin(); rgIter != rgEnd; ++rgIter ) {
@@ -843,10 +991,13 @@ bool ResolveTool::ResolveToolPrivate::MakeStats(void) {
 
         // calculate acceptable orientation & insert sizes for this read group
         resolver.DetermineTopModels(name);
+
+        // clear out left over read names
+        // (these have mates that did not pass filters or were already removed as non-unique)
+        resolver.ReadNames.clear();
     }
 
-    // close reader & return success
-    reader.Close();
+    // if we get here, return success
     return true;
 }
 
@@ -923,12 +1074,41 @@ void ResolveTool::ResolveToolPrivate::ResolveAlignment(BamAlignment& al) {
     // quit check if pairs are not within "reasonable" distance (can differ for each RG)
     if ( !resolver.IsValidInsertSize(al) ) return;
 
+    // quit check if alignment is not a "candidate proper pair"
+    map<string, bool>::const_iterator readNameIter;
+    readNameIter = resolver.ReadNames.find(al.Name);
+    if ( readNameIter == resolver.ReadNames.end() )
+        return;
+
     // if we get here, alignment is OK - set 'proper pair' flag
     al.SetIsProperPair(true);
 }
 
 bool ResolveTool::ResolveToolPrivate::ResolvePairs(void) {
 
+    // open file containing read names of candidate proper pairs
+    ResolveTool::ReadNamesFileReader readNamesReader;
+    if ( !readNamesReader.Open(m_settings->ReadNamesFilename) ) {
+        cerr << "bamtools resolve ERROR: could not open (temp) inputput read names file: "
+             << m_settings->ReadNamesFilename << endl;
+        return false;
+    }
+
+    // parse read names (matching with corresponding read groups)
+    if ( !readNamesReader.Read(m_readGroups) ) {
+        cerr << "bamtools resolve ERROR: could not read candidate read names from file: "
+             << m_settings->ReadNamesFilename << endl;
+        readNamesReader.Close();
+        return false;
+    }
+
+    // close read name file reader & delete temp file
+    readNamesReader.Close();
+    if ( remove(m_settings->ReadNamesFilename.c_str()) != 0 ) {
+        cerr << "bamtools resolve WARNING: could not delete temp file: "
+             << m_settings->ReadNamesFilename << endl;
+    }
+
     // open our BAM reader
     BamReader reader;
     if ( !reader.Open(m_settings->InputBamFilename) ) {
@@ -987,6 +1167,12 @@ bool ResolveTool::ResolveToolPrivate::Run(void) {
     // initialize read group map with default (empty name) read group
     m_readGroups.insert( make_pair<string, ReadGroupResolver>("", ReadGroupResolver()) );
 
+    // init readname filename
+    // uses (adjusted) stats filename if provided (req'd for makeStats, markPairs modes; optional for twoPass)
+    // else keep default filename
+    if ( m_settings->HasStatsFile )
+        m_settings->ReadNamesFilename = m_settings->StatsFilename + READNAME_FILE_SUFFIX;
+
     // -makeStats mode
     if ( m_settings->IsMakeStats ) {
 
index 109a89f53b9f6bd1bfb9779c4dc68326d309a06c..681a557de4a801dd07d7f91dea4d1af9e68a55c0 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 11 June 2011
+// Last modified: 23 June 2011
 // ---------------------------------------------------------------------------
 // Resolves paired-end reads (marking the IsProperPair flag as needed).
 // ***************************************************************************
@@ -32,6 +32,8 @@ class ResolveTool : public AbstractTool {
         struct ResolveToolPrivate;
         ResolveToolPrivate* m_impl;
 
+        struct ReadNamesFileReader;
+        struct ReadNamesFileWriter;
         struct StatsFileReader;
         struct StatsFileWriter;
 };