\usepackage{mathtools,cancel}
\renewcommand{\CancelColor}{\color{red}} %change cancel color to red
-\newenvironment{digression}[1]{\begin{textblock*}{64mm}(0.6\textwidth,0.2\textheight)%
+\newenvironment{digression}[1]{\begin{textblock*}{64mm}(0.2\textwidth,0.2\textheight)%
\begin{block}{#1}}{%
\end{block}\end{textblock*}}
\end{center}
}
+\begin{frame}{The Problem}
+\end{frame}
+
\begin{frame}{MinHash Algorithm}
\begin{columns}
\column{0.3\textwidth}
}
\item Hash k-mers (32/64bit)
\item Estimate Jaccard Index $J(A,B)$
- \only<3>{
+ \only<4>{
\begin{digression}{Estimating Jaccard Index}
$J(A,B) = \frac{|A \cap B|}{|A \cup B|}$
\begin{itemize}
\end{itemize}
\end{digression}
}
- \item<3-> How can we randomly sample?
- \item<3-> Properties of the hash!
+ \item<4-> How can we randomly sample?
+ \item<4-> Properties of the hash!
\end{itemize}
\end{columns}
\end{frame}
-\begin{frame}{Hash functions}
-
+\begin{frame}{Hash functions and their properties}
+ \begin{itemize}
+ \item A function which can map arbitrary sized data to output of
+ fixed size
+ \item They're used everywhere: caches, duplicate filtering,
+ cryptography, finding similar substrings, finding similar records,
+ etc.
+ \item Good hash functions are deterministic, \alert{uniform}, and
+ rarely produce collisions.
+ \item Cryptographic hash functions are non-invertible (SHA-512,
+ etc.)
+ \item Other hash functions may have continuity (IE, AAAB and AAAC
+ will hash to nearby values)
+ \item for example, the MD5 of “DON” is \Sexpr{digest::digest("DON","md5")} while “DO” is
+ \Sexpr{digest::digest("DO","md5")}
+ \end{itemize}
\end{frame}
-\begin{frame}{MurmerHash3 -- properties}
+\begin{frame}{MurmurHash3 -- properties}
+ \begin{itemize}
+ \item Simple
+ \item Good distribution
+ \item Good collision resistance
+ \item Good avalanche properties (single bit in the input changes the output significantly).
+ \only<2>{\begin{itemize}
+ \item 0 -> {\texttt \Sexpr{digest::digest(0,"sha1")}}
+ \item 1 -> {\texttt \Sexpr{digest::digest(1,"sha1")}}
+ \item 2 -> {\texttt \Sexpr{digest::digest(2,"sha1")}}
+ \end{itemize}}
+ \item<3-> Fast
+ \item<3-> Key questions:
+ \begin{enumerate}
+ \item Does it map DNA and protein sequences uniformly into output?
+ \item Does the non-random input distribution of DNA/protein
+ sequences expose pathologic behavior of the hash function?
+ \end{enumerate}
+ \item<3-> Partially answered by the smhasher test suite which
+ exposed issues with MurmurHash2 (which is ideally the
+ \href{http://www.phy.duke.edu/\%7Ergb/General/dieharder.php}{DieHarder}
+ of hash testing.)
+ \end{itemize}
\end{frame}
-\begin{frame}
+\begin{frame}{Why is this quick}
+ \begin{itemize}
+ \item Hashing is relatively fast
+ \item Only read the data once
+ \item Can be trivially parallized
+ \item Only need to keep the $k$ lowest hash values for each dataset
+ \item Much faster than string alignment
+ \end{itemize}
+ \includegraphics[width=\textwidth,height=0.4\textheight,keepaspectratio]{mash_minhash_paper/table_2.png}
+\end{frame}
+
+
+\begin{frame}{How does it stack up against ANI?}
+ \begin{columns}
+ \column{0.5\textwidth}
\includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_2.png}
+ \column{0.5\textwidth}
+ \begin{itemize}
+ \item Sketch size $s$
+ \item k-mer size $k$
+ \item Mash D is the mash distance
+ \item ANI is the average nucleotide identity between genomes
+ \item Increasing $s$ improves match; $k$ seems to max near 27.
+ \item ANI only considers core genome, so at some point ANI and Mash
+ will diverge
+ \end{itemize}
+\end{columns}
+\end{frame}
+
+\begin{frame}{Mash error bounds with $k=21$}
+ \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/table_1.png}
+
\end{frame}
\begin{frame}
- \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_3.png}
+ \begin{columns}
+ \column{0.65\textwidth}
+ \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_3.png}
+ \column{0.35\textwidth}
+ \begin{itemize}
+ \item \textit{De novo} clustering of all RefSeq genomes
+ \item Each node is a genome
+ \item Nodes are connected if Mash distance $\le 0.05$ and
+ $p \le 10^{-10}$
+ \end{itemize}
+ \end{columns}
\end{frame}
\begin{frame}
\includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_4.png}
+
+ Primate species trees calculated by mash (right) in comparison to UCSC genome browser trees (left)
\end{frame}
\begin{frame}
+ \begin{columns}
+ \column{0.65\textwidth}
\includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_5.png}
+ \column{0.35\textwidth}
+ \only<1>{
+ \begin{itemize}
+ \item Comparison between mash and COMMET using Global ocean
+ survey clustering
+ \item Mostly reproduces the clusters seen in the original study
+ \end{itemize}
+ }
+ \only<2>{
+ \begin{itemize}
+ \item Comparison of sequences and assemblies from 888 sequencing
+ runs and 879 assemblies from MHP and MetaHit projects
+ \item Mostly reproduces the clusters seen in the original study
+ \item COMMET would have required 140,000 hours to run this
+ analysis; mash did it in 25 hours or so.
+ \end{itemize}
+
+ }
+ \end{columns}
+
+
+\end{frame}
+
+\begin{frame}{Conclusions}
+\begin{itemize}
+
+\item Compare genomes on a massive scale which would be impossible for
+ typical techniques
+\item Good for rapid triaging of sequencing data to identify outliers
+\item Metagenomics and identification of samples
+\item Caveats
+\begin{itemize}
+\item Based on k-mer sketch; possibility of batch effects
+\item Phylogeny reconstructions are approximate, and do not track
+ mutation rates, etc.
+\item What about deconvolution of mixed samples? Probably need different techniques.
+\end{itemize}
+\end{itemize}
\end{frame}