From 8cd2d72381aff7fa4e96d4588cd44b6058724347 Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Thu, 23 Jun 2016 11:47:42 -0700 Subject: [PATCH] add changes to presentation --- hpcbio_mash_minhash_jun_2016.Rnw | 139 +++++++++++++++++++++++++++++-- mash_minhash_paper/Makefile | 6 ++ 2 files changed, 136 insertions(+), 9 deletions(-) diff --git a/hpcbio_mash_minhash_jun_2016.Rnw b/hpcbio_mash_minhash_jun_2016.Rnw index 9b13264..95c0740 100644 --- a/hpcbio_mash_minhash_jun_2016.Rnw +++ b/hpcbio_mash_minhash_jun_2016.Rnw @@ -32,7 +32,7 @@ \usepackage{mathtools,cancel} \renewcommand{\CancelColor}{\color{red}} %change cancel color to red -\newenvironment{digression}[1]{\begin{textblock*}{64mm}(0.6\textwidth,0.2\textheight)% +\newenvironment{digression}[1]{\begin{textblock*}{64mm}(0.2\textwidth,0.2\textheight)% \begin{block}{#1}}{% \end{block}\end{textblock*}} @@ -97,6 +97,9 @@ library("xtable") \end{center} } +\begin{frame}{The Problem} +\end{frame} + \begin{frame}{MinHash Algorithm} \begin{columns} \column{0.3\textwidth} @@ -122,7 +125,7 @@ library("xtable") } \item Hash k-mers (32/64bit) \item Estimate Jaccard Index $J(A,B)$ - \only<3>{ + \only<4>{ \begin{digression}{Estimating Jaccard Index} $J(A,B) = \frac{|A \cap B|}{|A \cup B|}$ \begin{itemize} @@ -134,33 +137,151 @@ library("xtable") \end{itemize} \end{digression} } - \item<3-> How can we randomly sample? - \item<3-> Properties of the hash! + \item<4-> How can we randomly sample? + \item<4-> Properties of the hash! \end{itemize} \end{columns} \end{frame} -\begin{frame}{Hash functions} - +\begin{frame}{Hash functions and their properties} + \begin{itemize} + \item A function which can map arbitrary sized data to output of + fixed size + \item They're used everywhere: caches, duplicate filtering, + cryptography, finding similar substrings, finding similar records, + etc. + \item Good hash functions are deterministic, \alert{uniform}, and + rarely produce collisions. + \item Cryptographic hash functions are non-invertible (SHA-512, + etc.) + \item Other hash functions may have continuity (IE, AAAB and AAAC + will hash to nearby values) + \item for example, the MD5 of “DON” is \Sexpr{digest::digest("DON","md5")} while “DO” is + \Sexpr{digest::digest("DO","md5")} + \end{itemize} \end{frame} -\begin{frame}{MurmerHash3 -- properties} +\begin{frame}{MurmurHash3 -- properties} + \begin{itemize} + \item Simple + \item Good distribution + \item Good collision resistance + \item Good avalanche properties (single bit in the input changes the output significantly). + \only<2>{\begin{itemize} + \item 0 -> {\texttt \Sexpr{digest::digest(0,"sha1")}} + \item 1 -> {\texttt \Sexpr{digest::digest(1,"sha1")}} + \item 2 -> {\texttt \Sexpr{digest::digest(2,"sha1")}} + \end{itemize}} + \item<3-> Fast + \item<3-> Key questions: + \begin{enumerate} + \item Does it map DNA and protein sequences uniformly into output? + \item Does the non-random input distribution of DNA/protein + sequences expose pathologic behavior of the hash function? + \end{enumerate} + \item<3-> Partially answered by the smhasher test suite which + exposed issues with MurmurHash2 (which is ideally the + \href{http://www.phy.duke.edu/\%7Ergb/General/dieharder.php}{DieHarder} + of hash testing.) + \end{itemize} \end{frame} -\begin{frame} +\begin{frame}{Why is this quick} + \begin{itemize} + \item Hashing is relatively fast + \item Only read the data once + \item Can be trivially parallized + \item Only need to keep the $k$ lowest hash values for each dataset + \item Much faster than string alignment + \end{itemize} + \includegraphics[width=\textwidth,height=0.4\textheight,keepaspectratio]{mash_minhash_paper/table_2.png} +\end{frame} + + +\begin{frame}{How does it stack up against ANI?} + \begin{columns} + \column{0.5\textwidth} \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_2.png} + \column{0.5\textwidth} + \begin{itemize} + \item Sketch size $s$ + \item k-mer size $k$ + \item Mash D is the mash distance + \item ANI is the average nucleotide identity between genomes + \item Increasing $s$ improves match; $k$ seems to max near 27. + \item ANI only considers core genome, so at some point ANI and Mash + will diverge + \end{itemize} +\end{columns} +\end{frame} + +\begin{frame}{Mash error bounds with $k=21$} + \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/table_1.png} + \end{frame} \begin{frame} - \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_3.png} + \begin{columns} + \column{0.65\textwidth} + \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_3.png} + \column{0.35\textwidth} + \begin{itemize} + \item \textit{De novo} clustering of all RefSeq genomes + \item Each node is a genome + \item Nodes are connected if Mash distance $\le 0.05$ and + $p \le 10^{-10}$ + \end{itemize} + \end{columns} \end{frame} \begin{frame} \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_4.png} + + Primate species trees calculated by mash (right) in comparison to UCSC genome browser trees (left) \end{frame} \begin{frame} + \begin{columns} + \column{0.65\textwidth} \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_5.png} + \column{0.35\textwidth} + \only<1>{ + \begin{itemize} + \item Comparison between mash and COMMET using Global ocean + survey clustering + \item Mostly reproduces the clusters seen in the original study + \end{itemize} + } + \only<2>{ + \begin{itemize} + \item Comparison of sequences and assemblies from 888 sequencing + runs and 879 assemblies from MHP and MetaHit projects + \item Mostly reproduces the clusters seen in the original study + \item COMMET would have required 140,000 hours to run this + analysis; mash did it in 25 hours or so. + \end{itemize} + + } + \end{columns} + + +\end{frame} + +\begin{frame}{Conclusions} +\begin{itemize} + +\item Compare genomes on a massive scale which would be impossible for + typical techniques +\item Good for rapid triaging of sequencing data to identify outliers +\item Metagenomics and identification of samples +\item Caveats +\begin{itemize} +\item Based on k-mer sketch; possibility of batch effects +\item Phylogeny reconstructions are approximate, and do not track + mutation rates, etc. +\item What about deconvolution of mixed samples? Probably need different techniques. +\end{itemize} +\end{itemize} \end{frame} diff --git a/mash_minhash_paper/Makefile b/mash_minhash_paper/Makefile index 3fd0220..b3f233e 100644 --- a/mash_minhash_paper/Makefile +++ b/mash_minhash_paper/Makefile @@ -15,6 +15,12 @@ paper.pdf: paper_frontpage.png: paper.pdf convert -density 300 -depth 8 -quality 85 -crop '2100x1600+200+100' '$<[0]' $@ +table_1.png: paper.pdf + convert -density 300 -depth 8 -quality 85 -crop '2100x830+200+350' '$<[3]' $@ + +table_2.png: paper.pdf + convert -density 300 -depth 8 -quality 85 -crop '2100x550+200+2550' '$<[3]' $@ + fig_%.gif: wget -O $@ "https://static-content.springer.com/image/art%3A10.1186%2Fs13059-016-0997-x/MediaObjects/13059_2016_997_Fig$(*)_HTML.gif" -- 2.39.2