% Beamer slides (knitr .Rnw source): "Mash: fast genome and metagenome
% distance estimation".
%
% Layout of this file:
%   1. Preamble: fonts (fontspec), bibliography (biblatex/biber), helper
%      packages, and the `digression` environment used for pop-up side notes.
%   2. Presentation-only theme, colours and logo.
%   3. An R setup chunk configuring knitr graphics and loading R packages.
%   4. The frames, followed by the bibliography frame.
%
% The class option `ignorenonframetext` is in effect, so \frame,
% \begin{frame} and \end{frame} are kept at the start of their own lines.
\documentclass[ignorenonframetext]{beamer}

% --- Fonts (fontspec: compile with XeLaTeX or LuaLaTeX) ---------------------
\usepackage{fontspec}
\setmainfont{FreeSerif}
\setsansfont{FreeSans}
\setmonofont{FreeMono}

% --- General packages --------------------------------------------------------
\usepackage{url}
\usepackage{fancyhdr}
\usepackage{graphicx}
\usepackage[bf]{caption}
\usepackage{rotating}
\usepackage{wrapfig}
\usepackage{fancybox}
\usepackage{booktabs}
% \usepackage{multirow}
\usepackage{acronym}
\usepackage{qrcode}

% --- Bibliography ------------------------------------------------------------
\usepackage[backend=biber,natbib=true,hyperref=true,style=nature]{biblatex}
\addbibresource{references.bib}

% \usepackage[nomargin,inline,draft]{fixme}
% \newcommand{\DLA}[1]{\textcolor{red}{\fxnote{DLA: #1}}}
% \usepackage[hyperfigures,bookmarks,colorlinks,citecolor=black,filecolor=black,linkcolor=black,urlcolor=black]{hyperref}
\usepackage{texshade}
\usepackage{tikz}
\usepackage{nameref}
\usepackage{zref-xr,zref-user}
% Print the reference list in a tiny font.
\renewcommand*{\bibfont}{\tiny}

% The textpos package is necessary to position textblocks at arbitrary
% places on the page. Use showboxes option to show outlines of textboxes.
% \usepackage[absolute]{textpos}
\usepackage[absolute,overlay]{textpos}
\usepackage{mathtools,cancel}
\renewcommand{\CancelColor}{\color{red}} %change cancel color to red

% digression{<title>}: a titled beamer block placed at a fixed position on the
% page (64mm wide, at 0.6\textwidth / 0.2\textheight) via textpos.
\newenvironment{digression}[1]{\begin{textblock*}{64mm}(0.6\textwidth,0.2\textheight)%
    \begin{block}{#1}}{%
    \end{block}\end{textblock*}}
\usepackage{multirow}
\usepackage{array}

% --- Presentation-only appearance -------------------------------------------
% \mode needs a mode specification; without <presentation> it cannot parse
% its argument.
\mode<presentation>{
  \usetheme{CambridgeUS}
  \usecolortheme{crane}
  % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html
  \definecolor{ilboldblue}{HTML}{002058}
  \definecolor{ilboldorange}{HTML}{E87722}
  \definecolor{ilblue}{HTML}{606EB2}
  \definecolor{ilorange}{HTML}{D45D00}
  % The logo is only drawn when the image file is present.
  \logo{\begin{tikzpicture}% Pale figure
      {\node[opacity=0.6]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}%
        };}%
    \end{tikzpicture}}
}

\title[MASH]{Mash: fast genome and metagenome distance estimation}
\author[Don Armstrong]{Don L. Armstrong}
\institute[IGB]{Institute for Genomic Biology, Computing Genomes
  for Reproductive Health, University of Illinois, Urbana-Champaign}

\begin{document}

% R setup chunk: knitr figure defaults, default graphics device, R packages.
% NOTE(review): the chunk header was garbled to `<>=` in this copy, which
% knitr does not recognise as a chunk opener. The label and options below are
% a conservative reconstruction -- confirm against the original source.
<<setup, echo=FALSE, message=FALSE, warning=FALSE>>=
opts_chunk$set(dev="cairo_pdf",out.width="\\textwidth",out.height="0.8\\textheight",out.extra="keepaspectratio")
#opts_chunk$set(cache=TRUE, autodep=TRUE)
options(device = function(file, width = 8, height = 7, ...) {
  cairo_pdf(tempfile(), width = width, height = height, ...)
})
options(digits=2)
library("data.table")
library("ggplot2")
library("reshape2")
library("grid")
library("xtable")
@

% Optional opening slide, shown only if the image is available.
% \IfFileExists takes three arguments; the empty false branch keeps it from
% consuming the \frame that follows.
% NOTE(review): with ignorenonframetext beamer may treat this wrapper as
% non-frame text -- confirm the slide actually appears.
\IfFileExists{figures/relevant_xkcd.png}{\frame[plain]{\centering
    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{figures/relevant_xkcd.png}
    \url{https://xkcd.com/1691/}}}{}

% Title slide with a QR code pointing at the code and slides.
\frame[plain]{\titlepage
  \begin{center}
    Code and slides are here:
    \qrcode[padding]{http://dla2.us/p/mashminhash2016}
    \url{http://dla2.us/p/mashminhash2016}
  \end{center}
}

% Front page of the paper being presented.
\frame[plain]{
  \begin{center}
    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/paper_frontpage}
  \end{center}
}

\begin{frame}{MinHash Algorithm}
  \begin{columns}
    \column{0.3\textwidth}
    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_1.png}
    \column{0.7\textwidth}
    \begin{itemize}
    \item Decompose dataset into k-mers
      % Overlay 2: side note on strand handling.
      \only<2>{
        \begin{digression}{What about strandedness}
          \begin{itemize}
          \item Take lexically lowest sequence
            \begin{itemize}
            \item Given the 7-mer 5'-ACTGCAC-3' and its reverse complement,
              5'-GTGCAGT-3'
            \item A $<$ G
            \item Use ACTGCAC
            \end{itemize}
          % NOTE(review): the next two items repeat the "Estimating Jaccard
          % Index" digression; confirm they belong on this overlay.
          \item Because $S(A \cup B)$ is a random sample of $A \cup B$ the
            fraction of elements in $S(A \cup B)$ which are shared by
            $S(A)$ and $S(B)$ is an unbiased estimate of $J(A,B)$
          \item $J(A,B) \approx \frac{|S(A \cup B) \cap S(A) \cap S(B)|}{|S(A \cup B)|}$
          \end{itemize}
        \end{digression}
      }
    \item Hash k-mers (32/64bit)
    \item Estimate Jaccard Index $J(A,B)$
      % Overlay 3: definition of the Jaccard index and its sampled estimate.
      \only<3>{
        \begin{digression}{Estimating Jaccard Index}
          $J(A,B) = \frac{|A \cap B|}{|A \cup B|}$
          \begin{itemize}
          \item Sample randomly from $A$ and $B$
          \item Because $S(A \cup B)$ is a random sample of $A \cup B$ the
            fraction of elements in $S(A \cup B)$ which are shared by
            $S(A)$ and $S(B)$ is an unbiased estimate of $J(A,B)$
          \item $J(A,B) \approx \frac{|S(A \cup B) \cap S(A) \cap S(B)|}{|S(A \cup B)|}$
          \end{itemize}
        \end{digression}
      }
    \item<3-> How can we randomly sample?
    \item<3-> Properties of the hash!
    \end{itemize}
  \end{columns}
\end{frame}

% Placeholder frames (no content yet).
\begin{frame}{Hash functions}
\end{frame}

\begin{frame}{MurmurHash3 -- properties}
\end{frame}

% Figures 2-5 from the paper, one per frame.
\begin{frame}
  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_2.png}
\end{frame}

\begin{frame}
  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_3.png}
\end{frame}

\begin{frame}
  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_4.png}
\end{frame}

\begin{frame}
  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_5.png}
\end{frame}

\section*{References}

\begin{frame}[plain]{References}
  \begin{center}
    \mbox{}\vspace{-\baselineskip}
    \printbibliography[heading=none]
  \end{center}
\end{frame}

\end{document}