add changes to presentation

[mash_minhash_presentation.git] / hpcbio_mash_minhash_jun_2016.Rnw
diff --git a/hpcbio_mash_minhash_jun_2016.Rnw b/hpcbio_mash_minhash_jun_2016.Rnw

index 9b132641dcce6c9e4cd85424204ef9da27d1d4c8..95c074027ebefe7037266bb56f20c037d986341f 100644 (file)
--- a/hpcbio_mash_minhash_jun_2016.Rnw
+++ b/hpcbio_mash_minhash_jun_2016.Rnw
@@ -32,7 +32,7 @@
  \usepackage{mathtools,cancel}
  
  \renewcommand{\CancelColor}{\color{red}} %change cancel color to red
-\newenvironment{digression}[1]{\begin{textblock*}{64mm}(0.6\textwidth,0.2\textheight)%
+\newenvironment{digression}[1]{\begin{textblock*}{64mm}(0.2\textwidth,0.2\textheight)%
      \begin{block}{#1}}{%
  \end{block}\end{textblock*}}
  
@@ -97,6 +97,9 @@ library("xtable")
  \end{center}
  }
  
+\begin{frame}{The Problem}
+\end{frame}
+
  \begin{frame}{MinHash Algorithm}
    \begin{columns}
      \column{0.3\textwidth}
@@ -122,7 +125,7 @@ library("xtable")
        }
      \item Hash k-mers (32/64bit)
      \item Estimate Jaccard Index $J(A,B)$
-      \only<3>{
+      \only<4>{
          \begin{digression}{Estimating Jaccard Index}
            $J(A,B) = \frac{|A \cap B|}{|A \cup B|}$ 
            \begin{itemize} 
@@ -134,33 +137,151 @@ library("xtable")
            \end{itemize}
          \end{digression}
        }
-    \item<3-> How can we randomly sample?
-    \item<3-> Properties of the hash!
+    \item<4-> How can we randomly sample?
+    \item<4-> Properties of the hash!
      \end{itemize}
    \end{columns}
  \end{frame}
  
-\begin{frame}{Hash functions}
-  
+\begin{frame}{Hash functions and their properties}
+  \begin{itemize}
+  \item A function which can map arbitrary sized data to output of
+    fixed size
+  \item They're used everywhere: caches, duplicate filtering,
+    cryptography, finding similar substrings, finding similar records,
+    etc.
+  \item Good hash functions are deterministic, \alert{uniform}, and
+    rarely produce collisions.
+  \item Cryptographic hash functions are non-invertible (SHA-512,
+    etc.)
+  \item Other hash functions may have continuity (IE, AAAB and AAAC
+    will hash to nearby values)
+  \item for example, the MD5 of “DON” is \Sexpr{digest::digest("DON","md5")} while “DO” is 
+    \Sexpr{digest::digest("DO","md5")}
+  \end{itemize}
  \end{frame}
  
-\begin{frame}{MurmerHash3 -- properties}
+\begin{frame}{MurmurHash3 -- properties}
+  \begin{itemize}
+  \item Simple
+  \item Good distribution
+  \item Good collision resistance
+  \item Good avalanche properties (single bit in the input changes the output significantly).
+    \only<2>{\begin{itemize}
+    \item 0 -> {\texttt \Sexpr{digest::digest(0,"sha1")}}
+    \item 1 -> {\texttt \Sexpr{digest::digest(1,"sha1")}}
+    \item 2 -> {\texttt \Sexpr{digest::digest(2,"sha1")}}
+    \end{itemize}}
+  \item<3-> Fast
+  \item<3-> Key questions:
+    \begin{enumerate}
+    \item Does it map DNA and protein sequences uniformly into output?
+    \item Does the non-random input distribution of DNA/protein
+      sequences expose pathologic behavior of the hash function?
+    \end{enumerate}
+  \item<3-> Partially answered by the smhasher test suite which
+    exposed issues with MurmurHash2 (which is ideally the
+    \href{http://www.phy.duke.edu/\%7Ergb/General/dieharder.php}{DieHarder}
+    of hash testing.)
+  \end{itemize}
  \end{frame}
  
-\begin{frame}
+\begin{frame}{Why is this quick}
+  \begin{itemize}
+  \item Hashing is relatively fast
+  \item Only read the data once
+  \item Can be trivially parallized
+  \item Only need to keep the $k$ lowest hash values for each dataset
+  \item Much faster than string alignment
+  \end{itemize}
+  \includegraphics[width=\textwidth,height=0.4\textheight,keepaspectratio]{mash_minhash_paper/table_2.png}
+\end{frame}
+
+
+\begin{frame}{How does it stack up against ANI?}
+  \begin{columns}
+    \column{0.5\textwidth}
    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_2.png}
+    \column{0.5\textwidth}
+  \begin{itemize}
+  \item Sketch size $s$
+  \item k-mer size $k$
+  \item Mash D is the mash distance
+  \item ANI is the average nucleotide identity between genomes
+  \item Increasing $s$ improves match; $k$ seems to max near 27.
+  \item ANI only considers core genome, so at some point ANI and Mash
+    will diverge
+  \end{itemize}
+\end{columns}
+\end{frame}
+
+\begin{frame}{Mash error bounds with $k=21$}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/table_1.png}
+  
  \end{frame}
  
  \begin{frame}
-  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_3.png}
+  \begin{columns}
+    \column{0.65\textwidth}
+    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_3.png}
+    \column{0.35\textwidth}
+    \begin{itemize}
+    \item \textit{De novo} clustering of all RefSeq genomes
+    \item Each node is a genome
+    \item Nodes are connected if Mash distance $\le 0.05$ and
+      $p \le 10^{-10}$
+    \end{itemize}
+  \end{columns}
  \end{frame}
  
  \begin{frame}
    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_4.png}
+
+  Primate species trees calculated by mash (right) in comparison to UCSC genome browser trees (left)
  \end{frame}
  
  \begin{frame}
+  \begin{columns}
+    \column{0.65\textwidth}
    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_5.png}
+    \column{0.35\textwidth}
+    \only<1>{
+      \begin{itemize}
+      \item Comparison between mash and COMMET using Global ocean
+        survey clustering
+      \item Mostly reproduces the clusters seen in the original study
+      \end{itemize}
+    }
+    \only<2>{
+      \begin{itemize}
+      \item Comparison of sequences and assemblies from 888 sequencing
+        runs and 879 assemblies from MHP and MetaHit projects
+      \item Mostly reproduces the clusters seen in the original study
+      \item COMMET would have required 140,000 hours to run this
+        analysis; mash did it in 25 hours or so.
+      \end{itemize}
+      
+    }
+  \end{columns}
+
+  
+\end{frame}
+
+\begin{frame}{Conclusions}
+\begin{itemize}
+  
+\item Compare genomes on a massive scale which would be impossible for
+  typical techniques
+\item Good for rapid triaging of sequencing data to identify outliers
+\item Metagenomics and identification of samples
+\item Caveats
+\begin{itemize}
+\item Based on k-mer sketch; possibility of batch effects
+\item Phylogeny reconstructions are approximate, and do not track
+  mutation rates, etc.
+\item What about deconvolution of mixed samples? Probably need different techniques.
+\end{itemize}
+\end{itemize}
  \end{frame}