\documentclass[ignorenonframetext]{beamer} \usepackage{fontspec} \setmainfont{FreeSerif} \setsansfont{FreeSans} \setmonofont{FreeMono} \usepackage{url} \usepackage{fancyhdr} \usepackage{graphicx} \usepackage[bf]{caption} \usepackage{rotating} \usepackage{wrapfig} \usepackage{fancybox} \usepackage{booktabs} % \usepackage{multirow} \usepackage{acronym} \usepackage{qrcode} \usepackage[backend=biber,natbib=true,hyperref=true,style=nature]{biblatex} \addbibresource{references.bib} % \usepackage[nomargin,inline,draft]{fixme} % \newcommand{\DLA}[1]{\textcolor{red}{\fxnote{DLA: #1}}} % \usepackage[hyperfigures,bookmarks,colorlinks,citecolor=black,filecolor=black,linkcolor=black,urlcolor=black]{hyperref} \usepackage{texshade} \usepackage{tikz} \usepackage{nameref} \usepackage{zref-xr,zref-user} \renewcommand*{\bibfont}{\tiny} % The textpos package is necessary to position textblocks at arbitary % places on the page. Use showboxes option to show outlines of textboxes. % \usepackage[absolute]{textpos} \usepackage[absolute,overlay]{textpos} \usepackage{mathtools,cancel} \renewcommand{\CancelColor}{\color{red}} %change cancel color to red \newenvironment{digression}[1]{\begin{textblock*}{64mm}(0.2\textwidth,0.2\textheight)% \begin{block}{#1}}{% \end{block}\end{textblock*}} \usepackage{multirow} \usepackage{array} \mode{ \usetheme{CambridgeUS} \usecolortheme{crane} % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html \definecolor{ilboldblue}{HTML}{002058} \definecolor{ilboldorange}{HTML}{E87722} \definecolor{ilblue}{HTML}{606EB2} \definecolor{ilorange}{HTML}{D45D00} \logo{\begin{tikzpicture}% Pale figure {\node[opacity=0.6]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}% };}% \end{tikzpicture}} } \title[MASH]{Mash: fast genome and metagenome distance estimation} \author[Don Armstrong]{Don L. Armstrong} \institute[IGB]{Institute for Genomic Biology, Computing Genomes for Reproductive Health, University of Illinois, Urbana-Champaign} \begin{document} <>= opts_chunk$set(dev="cairo_pdf",out.width="\\textwidth",out.height="0.8\\textheight",out.extra="keepaspectratio") #opts_chunk$set(cache=TRUE, autodep=TRUE) options(device = function(file, width = 8, height = 7, ...) { cairo_pdf(tempfile(), width = width, height = height, ...) }) options(digits=2) library("data.table") library("ggplot2") library("reshape2") library("grid") library("xtable") @ \IfFileExists{figures/relevant_xkcd.png}{\frame[plain]{\centering \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{figures/relevant_xkcd.png} \url{https://xkcd.com/1691/}}} \frame[plain]{\titlepage \begin{center} Code and slides are here: \qrcode[padding]{http://dla2.us/p/mashminhash2016} \url{http://dla2.us/p/mashminhash2016} \end{center} } \frame[plain]{ \begin{center} \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/paper_frontpage} \end{center} } \begin{frame}{The Problem} \end{frame} \begin{frame}{MinHash Algorithm} \begin{columns} \column{0.3\textwidth} \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_1.png} \column{0.7\textwidth} \begin{itemize} \item Decompose dataset into k-mers \only<2>{ \begin{digression}{What about strandedness} \begin{itemize} \item Take lexically lowest sequence \begin{itemize} \item Given 7-mers 5'-ACTGCAC-3' and its reverse complement, 5'-GTGCAGT-3' \item A $\lt$ G \item Use ACTGCAC \end{itemize} \item Because $S(A \cup B)$ is a random sample of $A \cup B$ the fraction of elements in $S(A \cup B)$ which are shared by $S(A)$ and $S(B)$ is an unbiased estimate of $J(A,B)$ \item $J(A,B) \approx \frac{|S(A \cup B) \cap S(A) \cap (B)}{|S(A \cup B)|}$ \end{itemize} \end{digression} } \item Hash k-mers (32/64bit) \item Estimate Jaccard Index $J(A,B)$ \only<4>{ \begin{digression}{Estimating Jaccard Index} $J(A,B) = \frac{|A \cap B|}{|A \cup B|}$ \begin{itemize} \item Sample randomly from $A$ and $B$ \item Because $S(A \cup B)$ is a random sample of $A \cup B$ the fraction of elements in $S(A \cup B)$ which are shared by $S(A)$ and $S(B)$ is an unbiased estimate of $J(A,B)$ \item $J(A,B) \approx \frac{|S(A \cup B) \cap S(A) \cap (B)}{|S(A \cup B)|}$ \end{itemize} \end{digression} } \item<4-> How can we randomly sample? \item<4-> Properties of the hash! \end{itemize} \end{columns} \end{frame} \begin{frame}{Hash functions and their properties} \begin{itemize} \item A function which can map arbitrary sized data to output of fixed size \item They're used everywhere: caches, duplicate filtering, cryptography, finding similar substrings, finding similar records, etc. \item Good hash functions are deterministic, \alert{uniform}, and rarely produce collisions. \item Cryptographic hash functions are non-invertible (SHA-512, etc.) \item Other hash functions may have continuity (IE, AAAB and AAAC will hash to nearby values) \item for example, the MD5 of “DON” is \Sexpr{digest::digest("DON","md5")} while “DO” is \Sexpr{digest::digest("DO","md5")} \end{itemize} \end{frame} \begin{frame}{MurmurHash3 -- properties} \begin{itemize} \item Simple \item Good distribution \item Good collision resistance \item Good avalanche properties (single bit in the input changes the output significantly). \only<2>{\begin{itemize} \item 0 -> {\texttt \Sexpr{digest::digest(0,"sha1")}} \item 1 -> {\texttt \Sexpr{digest::digest(1,"sha1")}} \item 2 -> {\texttt \Sexpr{digest::digest(2,"sha1")}} \end{itemize}} \item<3-> Fast \item<3-> Key questions: \begin{enumerate} \item Does it map DNA and protein sequences uniformly into output? \item Does the non-random input distribution of DNA/protein sequences expose pathologic behavior of the hash function? \end{enumerate} \item<3-> Partially answered by the smhasher test suite which exposed issues with MurmurHash2 (which is ideally the \href{http://www.phy.duke.edu/\%7Ergb/General/dieharder.php}{DieHarder} of hash testing.) \end{itemize} \end{frame} \begin{frame}{Why is this quick} \begin{itemize} \item Hashing is relatively fast \item Only read the data once \item Can be trivially parallized \item Only need to keep the $k$ lowest hash values for each dataset \item Much faster than string alignment \end{itemize} \includegraphics[width=\textwidth,height=0.4\textheight,keepaspectratio]{mash_minhash_paper/table_2.png} \end{frame} \begin{frame}{How does it stack up against ANI?} \begin{columns} \column{0.5\textwidth} \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_2.png} \column{0.5\textwidth} \begin{itemize} \item Sketch size $s$ \item k-mer size $k$ \item Mash D is the mash distance \item ANI is the average nucleotide identity between genomes \item Increasing $s$ improves match; $k$ seems to max near 27. \item ANI only considers core genome, so at some point ANI and Mash will diverge \end{itemize} \end{columns} \end{frame} \begin{frame}{Mash error bounds with $k=21$} \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/table_1.png} \end{frame} \begin{frame} \begin{columns} \column{0.65\textwidth} \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_3.png} \column{0.35\textwidth} \begin{itemize} \item \textit{De novo} clustering of all RefSeq genomes \item Each node is a genome \item Nodes are connected if Mash distance $\le 0.05$ and $p \le 10^{-10}$ \end{itemize} \end{columns} \end{frame} \begin{frame} \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_4.png} Primate species trees calculated by mash (right) in comparison to UCSC genome browser trees (left) \end{frame} \begin{frame} \begin{columns} \column{0.65\textwidth} \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_5.png} \column{0.35\textwidth} \only<1>{ \begin{itemize} \item Comparison between mash and COMMET using Global ocean survey clustering \item Mostly reproduces the clusters seen in the original study \end{itemize} } \only<2>{ \begin{itemize} \item Comparison of sequences and assemblies from 888 sequencing runs and 879 assemblies from MHP and MetaHit projects \item Mostly reproduces the clusters seen in the original study \item COMMET would have required 140,000 hours to run this analysis; mash did it in 25 hours or so. \end{itemize} } \end{columns} \end{frame} \begin{frame}{Conclusions} \begin{itemize} \item Compare genomes on a massive scale which would be impossible for typical techniques \item Good for rapid triaging of sequencing data to identify outliers \item Metagenomics and identification of samples \item Caveats \begin{itemize} \item Based on k-mer sketch; possibility of batch effects \item Phylogeny reconstructions are approximate, and do not track mutation rates, etc. \item What about deconvolution of mixed samples? Probably need different techniques. \end{itemize} \end{itemize} \end{frame} \section*{References} \begin{frame}[plain]{References} \begin{center} \mbox{}\vspace{-\baselineskip} \printbibliography[heading=none] \end{center} \end{frame} \end{document}