hpcbio_mash_minhash_jun_2016.Rnw

   1 \documentclass[ignorenonframetext]{beamer}
   2 \usepackage{fontspec}
   3 \setmainfont{FreeSerif}
   4 \setsansfont{FreeSans}
   5 \setmonofont{FreeMono}
   6 \usepackage{url}
   7 \usepackage{fancyhdr}
   8 \usepackage{graphicx}
   9 \usepackage[bf]{caption}
  10 \usepackage{rotating}
  11 \usepackage{wrapfig}
  12 \usepackage{fancybox}
  13 \usepackage{booktabs}
  14 % \usepackage{multirow}
  15 \usepackage{acronym}
  16 \usepackage{qrcode}
  17 \usepackage[backend=biber,natbib=true,hyperref=true,style=nature]{biblatex}
  18 \addbibresource{references.bib}
  19 % \usepackage[nomargin,inline,draft]{fixme}
  20 % \newcommand{\DLA}[1]{\textcolor{red}{\fxnote{DLA: #1}}}
  21 % \usepackage[hyperfigures,bookmarks,colorlinks,citecolor=black,filecolor=black,linkcolor=black,urlcolor=black]{hyperref}
  22 \usepackage{texshade}
  23 \usepackage{tikz}
  24 \usepackage{nameref}
  25 \usepackage{zref-xr,zref-user}
  26 \renewcommand*{\bibfont}{\tiny}
  27
  28 % The textpos package is necessary to position textblocks at arbitary
  29 % places on the page.  Use showboxes option to show outlines of textboxes.
  30 % \usepackage[absolute]{textpos}
  31 \usepackage[absolute,overlay]{textpos}
  32 \usepackage{mathtools,cancel}
  33
  34 \renewcommand{\CancelColor}{\color{red}} %change cancel color to red
  35 \newenvironment{digression}[1]{\begin{textblock*}{64mm}(0.2\textwidth,0.2\textheight)%
  36     \begin{block}{#1}}{%
  37 \end{block}\end{textblock*}}
  38
  39
  40 \usepackage{multirow}
  41 \usepackage{array}
  42
  43 \mode<presentation>{
  44   \usetheme{CambridgeUS}
  45   \usecolortheme{crane}
  46   % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html
  47   \definecolor{ilboldblue}{HTML}{002058}
  48   \definecolor{ilboldorange}{HTML}{E87722}
  49   \definecolor{ilblue}{HTML}{606EB2}
  50   \definecolor{ilorange}{HTML}{D45D00}
  51   \logo{\begin{tikzpicture}% Pale figure
  52       {\node[opacity=0.6]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}%
  53         };}%
  54     \end{tikzpicture}}
  55 }
  56
  57 \title[MASH]{Mash: fast genome and metagenome distance estimation}
  58 \author[Don Armstrong]{Don L. Armstrong}
  59 \institute[IGB]{Institute for Genomic Biology, Computing Genomes
  60   for Reproductive Health, University of Illinois, Urbana-Champaign}
  61
  62 \begin{document}
  63
  64 <<load.libraries,echo=FALSE,results="hide",warning=FALSE,message=FALSE,error=FALSE,cache=FALSE>>=
  65 opts_chunk$set(dev="cairo_pdf",out.width="\\textwidth",out.height="0.8\\textheight",out.extra="keepaspectratio")
  66 #opts_chunk$set(cache=TRUE, autodep=TRUE)
  67 options(device = function(file, width = 8, height = 7, ...) {
  68   cairo_pdf(tempfile(), width = width, height = height, ...)
  69 })
  70 options(digits=2)
  71 library("data.table")
  72 library("ggplot2")
  73 library("reshape2")
  74 library("grid")
  75 library("xtable")
  76
  77 @
  78
  79 \IfFileExists{figures/relevant_xkcd.png}{\frame[plain]{\centering \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{figures/relevant_xkcd.png}
  80
  81     \url{https://xkcd.com/1691/}}}
  82
  83 \frame[plain]{\titlepage
  84   \begin{center}
  85     Code and slides are here:
  86
  87     \qrcode[padding]{http://dla2.us/p/mashminhash2016}
  88
  89     \url{http://dla2.us/p/mashminhash2016}
  90    \end{center}
  91  }
  92
  93
  94 \frame[plain]{
  95   \begin{center}
  96   \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/paper_frontpage}
  97 \end{center}
  98 }
  99
 100 \begin{frame}{The Problem}
 101 \end{frame}
 102
 103 \begin{frame}{MinHash Algorithm}
 104   \begin{columns}
 105     \column{0.3\textwidth}
 106     \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_1.png}
 107     \column{0.7\textwidth}
 108     \begin{itemize}
 109     \item Decompose dataset into k-mers
 110       \only<2>{
 111         \begin{digression}{What about strandedness}
 112           \begin{itemize}
 113           \item Take lexically lowest sequence
 114             \begin{itemize}
 115             \item Given 7-mers 5'-ACTGCAC-3' and its reverse complement, 5'-GTGCAGT-3'
 116             \item A $\lt$ G
 117             \item Use ACTGCAC
 118             \end{itemize}
 119           \item Because $S(A \cup B)$ is a random sample of $A \cup B$
 120             the fraction of elements in $S(A \cup B)$ which are shared
 121             by $S(A)$ and $S(B)$ is an unbiased estimate of $J(A,B)$
 122           \item $J(A,B) \approx \frac{|S(A \cup B) \cap S(A) \cap (B)}{|S(A \cup B)|}$
 123           \end{itemize}
 124         \end{digression}
 125       }
 126     \item Hash k-mers (32/64bit)
 127     \item Estimate Jaccard Index $J(A,B)$
 128       \only<4>{
 129         \begin{digression}{Estimating Jaccard Index}
 130           $J(A,B) = \frac{|A \cap B|}{|A \cup B|}$
 131           \begin{itemize}
 132           \item Sample randomly from $A$ and $B$
 133           \item Because $S(A \cup B)$ is a random sample of $A \cup B$
 134             the fraction of elements in $S(A \cup B)$ which are shared
 135             by $S(A)$ and $S(B)$ is an unbiased estimate of $J(A,B)$
 136           \item $J(A,B) \approx \frac{|S(A \cup B) \cap S(A) \cap (B)}{|S(A \cup B)|}$
 137           \end{itemize}
 138         \end{digression}
 139       }
 140     \item<4-> How can we randomly sample?
 141     \item<4-> Properties of the hash!
 142     \end{itemize}
 143   \end{columns}
 144 \end{frame}
 145
 146 \begin{frame}{Hash functions and their properties}
 147   \begin{itemize}
 148   \item A function which can map arbitrary sized data to output of
 149     fixed size
 150   \item They're used everywhere: caches, duplicate filtering,
 151     cryptography, finding similar substrings, finding similar records,
 152     etc.
 153   \item Good hash functions are deterministic, \alert{uniform}, and
 154     rarely produce collisions.
 155   \item Cryptographic hash functions are non-invertible (SHA-512,
 156     etc.)
 157   \item Other hash functions may have continuity (IE, AAAB and AAAC
 158     will hash to nearby values)
 159   \item for example, the MD5 of “DON” is \Sexpr{digest::digest("DON","md5")} while “DO” is
 160     \Sexpr{digest::digest("DO","md5")}
 161   \end{itemize}
 162 \end{frame}
 163
 164 \begin{frame}{MurmurHash3 -- properties}
 165   \begin{itemize}
 166   \item Simple
 167   \item Good distribution
 168   \item Good collision resistance
 169   \item Good avalanche properties (single bit in the input changes the output significantly).
 170     \only<2>{\begin{itemize}
 171     \item 0 -> {\texttt \Sexpr{digest::digest(0,"sha1")}}
 172     \item 1 -> {\texttt \Sexpr{digest::digest(1,"sha1")}}
 173     \item 2 -> {\texttt \Sexpr{digest::digest(2,"sha1")}}
 174     \end{itemize}}
 175   \item<3-> Fast
 176   \item<3-> Key questions:
 177     \begin{enumerate}
 178     \item Does it map DNA and protein sequences uniformly into output?
 179     \item Does the non-random input distribution of DNA/protein
 180       sequences expose pathologic behavior of the hash function?
 181     \end{enumerate}
 182   \item<3-> Partially answered by the smhasher test suite which
 183     exposed issues with MurmurHash2 (which is ideally the
 184     \href{http://www.phy.duke.edu/\%7Ergb/General/dieharder.php}{DieHarder}
 185     of hash testing.)
 186   \end{itemize}
 187 \end{frame}
 188
 189 \begin{frame}{Why is this quick}
 190   \begin{itemize}
 191   \item Hashing is relatively fast
 192   \item Only read the data once
 193   \item Can be trivially parallized
 194   \item Only need to keep the $k$ lowest hash values for each dataset
 195   \item Much faster than string alignment
 196   \end{itemize}
 197   \includegraphics[width=\textwidth,height=0.4\textheight,keepaspectratio]{mash_minhash_paper/table_2.png}
 198 \end{frame}
 199
 200
 201 \begin{frame}{How does it stack up against ANI?}
 202   \begin{columns}
 203     \column{0.5\textwidth}
 204   \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_2.png}
 205     \column{0.5\textwidth}
 206   \begin{itemize}
 207   \item Sketch size $s$
 208   \item k-mer size $k$
 209   \item Mash D is the mash distance
 210   \item ANI is the average nucleotide identity between genomes
 211   \item Increasing $s$ improves match; $k$ seems to max near 27.
 212   \item ANI only considers core genome, so at some point ANI and Mash
 213     will diverge
 214   \end{itemize}
 215 \end{columns}
 216 \end{frame}
 217
 218 \begin{frame}{Mash error bounds with $k=21$}
 219   \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/table_1.png}
 220
 221 \end{frame}
 222
 223 \begin{frame}
 224   \begin{columns}
 225     \column{0.65\textwidth}
 226     \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_3.png}
 227     \column{0.35\textwidth}
 228     \begin{itemize}
 229     \item \textit{De novo} clustering of all RefSeq genomes
 230     \item Each node is a genome
 231     \item Nodes are connected if Mash distance $\le 0.05$ and
 232       $p \le 10^{-10}$
 233     \end{itemize}
 234   \end{columns}
 235 \end{frame}
 236
 237 \begin{frame}
 238   \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_4.png}
 239
 240   Primate species trees calculated by mash (right) in comparison to UCSC genome browser trees (left)
 241 \end{frame}
 242
 243 \begin{frame}
 244   \begin{columns}
 245     \column{0.65\textwidth}
 246   \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_5.png}
 247     \column{0.35\textwidth}
 248     \only<1>{
 249       \begin{itemize}
 250       \item Comparison between mash and COMMET using Global ocean
 251         survey clustering
 252       \item Mostly reproduces the clusters seen in the original study
 253       \end{itemize}
 254     }
 255     \only<2>{
 256       \begin{itemize}
 257       \item Comparison of sequences and assemblies from 888 sequencing
 258         runs and 879 assemblies from MHP and MetaHit projects
 259       \item Mostly reproduces the clusters seen in the original study
 260       \item COMMET would have required 140,000 hours to run this
 261         analysis; mash did it in 25 hours or so.
 262       \end{itemize}
 263
 264     }
 265   \end{columns}
 266
 267
 268 \end{frame}
 269
 270 \begin{frame}{Conclusions}
 271 \begin{itemize}
 272
 273 \item Compare genomes on a massive scale which would be impossible for
 274   typical techniques
 275 \item Good for rapid triaging of sequencing data to identify outliers
 276 \item Metagenomics and identification of samples
 277 \item Caveats
 278 \begin{itemize}
 279 \item Based on k-mer sketch; possibility of batch effects
 280 \item Phylogeny reconstructions are approximate, and do not track
 281   mutation rates, etc.
 282 \item What about deconvolution of mixed samples? Probably need different techniques.
 283 \end{itemize}
 284 \end{itemize}
 285 \end{frame}
 286
 287
 288 \section*{References}
 289
 290 \begin{frame}[plain]{References}
 291   \begin{center}
 292     \mbox{}\vspace{-\baselineskip}
 293     \printbibliography[heading=none]
 294   \end{center}
 295 \end{frame}
 296
 297 \end{document}