From: Don Armstrong
Date: Wed, 22 Jun 2016 17:15:52 +0000 (-0700)
Subject: add initial work on minhash paper slides for hpcbio
X-Git-Url: https://git.donarmstrong.com/?p=mash_minhash_presentation.git;a=commitdiff_plain;h=7bfd2a048281fa8b147b7bdfd59503798aadbf36

add initial work on minhash paper slides for hpcbio
---

7bfd2a048281fa8b147b7bdfd59503798aadbf36
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6647d1b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,23 @@
+auto
+figure
+*.aux
+*.bbl
+*.bcf
+*.blg
+*.fdb_latexmk
+*.fls
+*.log
+*.out
+hpcbio_mash_minhash_jun_2016.pdf
+hpcbio_mash_minhash_jun_2016.tex
+*.run.xml
+*.rip
+cache
+*.lof
+*.lot
+*.toc
+*.fff
+*.ttt
+references.bib
+*.nav
+*.snm
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..10d1627
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,44 @@
+#!/usr/bin/make -f
+
+# Remove a half-written target when its recipe fails, so that a failed
+# build is never mistaken for an up-to-date file.
+.DELETE_ON_ERROR:
+
+# `all` and the two asset directories name commands, not files whose
+# timestamps should be compared.
+.PHONY: all figures mash_minhash_paper
+
+all: hpcbio_mash_minhash_jun_2016.pdf
+
+# R interpreter used to knit the slides; override with `make R=...`.
+R ?= R
+
+# Export an SVG to PDF and crop it.  pdfcrop writes <name>-crop.pdf next to
+# its input; move exactly that file over the target rather than globbing
+# for every *-crop.pdf in the directory.
+%.pdf: %.svg
+	inkscape -A $@ $<
+	pdfcrop $@
+	mv $(basename $@)-crop.pdf $@
+
+# Rasterise an SVG at 300 dpi.
+%.png: %.svg
+	inkscape -e $@ -d 300 $<
+
+# Knit the Rnw source into LaTeX.
+%.tex: %.Rnw
+	$(R) --encoding=utf-8 -e "library('knitr'); knit('$<')"
+
+# Build the images that are kept out of git by running the Makefile in each
+# asset directory.
+figures mash_minhash_paper:
+	$(MAKE) -C $@
+
+# The asset directories are order-only prerequisites: their sub-makes run
+# before the slides are typeset, but do not by themselves force a rebuild.
+hpcbio_mash_minhash_jun_2016.pdf: hpcbio_mash_minhash_jun_2016.tex | mash_minhash_paper figures
+
+# Typeset with XeLaTeX via latexmk.
+%.pdf: %.tex $(wildcard *.bib) $(wildcard *.tex)
+	latexmk -pdf -pdflatex='xelatex -interaction=nonstopmode %O %S' -bibtex -use-make $<
+
diff --git a/figures/.gitignore b/figures/.gitignore
new file mode 100644
index 0000000..e8c4a45
--- /dev/null
+++ b/figures/.gitignore
@@ -0,0 +1,7 @@
+igb_wordmark.pdf
+igb_wordmarks.zip
+relevant_xkcd.png
+uofi_mark.pdf
+uofi_mark.zip
+reduced_alphabet.png
+spalax.jpg
\ No newline at end of file
diff --git a/figures/Makefile b/figures/Makefile
new file mode 100644
index 0000000..4c3ae09
--- /dev/null
+++ b/figures/Makefile
@@ -0,0 +1,42 @@
+#!/usr/bin/make -f
+
+# wget -O creates its output file even when the download fails; delete such
+# partial targets so that the next run retries instead of using them.
+.DELETE_ON_ERROR:
+
+.PHONY: all
+
+all: uofi_mark.pdf igb_wordmark.pdf spalax.jpg reduced_alphabet.png
+
+igb_wordmarks.zip:
+	wget -O $@ "http://www.igb.illinois.edu/sites/default/files/upload/Wordmarks-Eps.zip"
+
+# Extract the one EPS needed (-o: overwrite a leftover copy without
+# prompting), convert it to PDF, and remove the extracted file.
+igb_wordmark.pdf: igb_wordmarks.zip
+	unzip -o $< Wordmark-black-tag-outlines.eps
+	epstopdf --outfile=$@ Wordmark-black-tag-outlines.eps
+	rm -f Wordmark-black-tag-outlines.eps
+
+uofi_mark.zip:
+	wget -O $@ "http://identitystandards.illinois.edu/assets/logos/uclogo_1867_horz_bold.zip"
+
+# Same steps as igb_wordmark.pdf, for the EPS inside uofi_mark.zip.
+uofi_mark.pdf: uofi_mark.zip
+	unzip -o $< uclogo_1867_horz_bold/pc/uclogo_1867_horz_bold.eps
+	epstopdf --outfile=$@ uclogo_1867_horz_bold/pc/uclogo_1867_horz_bold.eps
+	rm -rf uclogo_1867_horz_bold
+
+# Not part of `all`; the slides include this image only when it exists.
+relevant_xkcd.png:
+	wget -O $@ "http://imgs.xkcd.com/comics/optimization.png"
+
+spalax.jpg:
+	wget -O $@ "https://upload.wikimedia.org/wikipedia/commons/9/90/Spalax_microphthalmus_Syzran_Russia.jpg"
+
+# The source image is a JPEG: download it to a scratch file named after the
+# target, convert it to PNG, and remove the scratch file.
+reduced_alphabet.png:
+	wget -O $@.tmp.jpg "http://peds.oxfordjournals.org/content/13/3/149/F1.large.jpg"
+	convert $@.tmp.jpg $@
+	rm -f $@.tmp.jpg
diff --git a/hpcbio_mash_minhash_jun_2016.Rnw b/hpcbio_mash_minhash_jun_2016.Rnw
new file mode 100644
index 0000000..9b13264
--- /dev/null
+++ b/hpcbio_mash_minhash_jun_2016.Rnw
@@ -0,0 +1,176 @@
+\documentclass[ignorenonframetext]{beamer}
+\usepackage{fontspec}
+\setmainfont{FreeSerif}
+\setsansfont{FreeSans}
+\setmonofont{FreeMono}
+\usepackage{url}
+\usepackage{fancyhdr}
+\usepackage{graphicx}
+\usepackage[bf]{caption}
+\usepackage{rotating}
+\usepackage{wrapfig}
+\usepackage{fancybox}
+\usepackage{booktabs}
+% \usepackage{multirow}
+\usepackage{acronym}
+\usepackage{qrcode}
+\usepackage[backend=biber,natbib=true,hyperref=true,style=nature]{biblatex}
+\addbibresource{references.bib}
+% \usepackage[nomargin,inline,draft]{fixme}
+% \newcommand{\DLA}[1]{\textcolor{red}{\fxnote{DLA: #1}}}
+% \usepackage[hyperfigures,bookmarks,colorlinks,citecolor=black,filecolor=black,linkcolor=black,urlcolor=black]{hyperref}
+\usepackage{texshade}
+\usepackage{tikz}
+\usepackage{nameref}
+\usepackage{zref-xr,zref-user}
+\renewcommand*{\bibfont}{\tiny}
+
+% The textpos package is necessary to position textblocks at arbitrary
+% places on the page. Use showboxes option to show outlines of textboxes.
+% \usepackage[absolute]{textpos}
+\usepackage[absolute,overlay]{textpos}
+\usepackage{mathtools,cancel}
+
+\renewcommand{\CancelColor}{\color{red}} %change cancel color to red
+\newenvironment{digression}[1]{\begin{textblock*}{64mm}(0.6\textwidth,0.2\textheight)%
+    \begin{block}{#1}}{%
+\end{block}\end{textblock*}}
+
+
+\usepackage{multirow}
+\usepackage{array}
+
+\mode<presentation>{
+  \usetheme{CambridgeUS}
+  \usecolortheme{crane}
+  % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html
+  \definecolor{ilboldblue}{HTML}{002058}
+  \definecolor{ilboldorange}{HTML}{E87722}
+  \definecolor{ilblue}{HTML}{606EB2}
+  \definecolor{ilorange}{HTML}{D45D00}
+  \logo{\begin{tikzpicture}% Pale figure
+    {\node[opacity=0.6]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}%
+    };}%
+  \end{tikzpicture}}
+}
+
+\title[MASH]{Mash: fast genome and metagenome distance estimation}
+\author[Don Armstrong]{Don L. Armstrong}
+\institute[IGB]{Institute for Genomic Biology, Computing Genomes
+  for Reproductive Health, University of Illinois, Urbana-Champaign}
+
+\begin{document}
+
+<<setup, include=FALSE>>=
+opts_chunk$set(dev="cairo_pdf",out.width="\\textwidth",out.height="0.8\\textheight",out.extra="keepaspectratio")
+#opts_chunk$set(cache=TRUE, autodep=TRUE)
+options(device = function(file, width = 8, height = 7, ...) {
+  cairo_pdf(tempfile(), width = width, height = height, ...)
+})
+options(digits=2)
+library("data.table")
+library("ggplot2")
+library("reshape2")
+library("grid")
+library("xtable")
+
+@
+
+\IfFileExists{figures/relevant_xkcd.png}{\frame[plain]{\centering \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{figures/relevant_xkcd.png}
+
+  \url{https://xkcd.com/1691/}}}
+
+\frame[plain]{\titlepage
+  \begin{center}
+    Code and slides are here:
+
+    \qrcode[padding]{http://dla2.us/p/mashminhash2016}
+
+    \url{http://dla2.us/p/mashminhash2016}
+  \end{center}
+ }
+
+
+\frame[plain]{
+  \begin{center}
+    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/paper_frontpage}
+\end{center}
+}
+
+\begin{frame}{MinHash Algorithm}
+  \begin{columns}
+    \column{0.3\textwidth}
+    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_1.png}
+    \column{0.7\textwidth}
+    \begin{itemize}
+    \item Decompose dataset into k-mers
+      \only<2>{
+        \begin{digression}{What about strandedness}
+          \begin{itemize}
+          \item Take lexically lowest sequence
+            \begin{itemize}
+            \item Given 7-mers 5'-ACTGCAC-3' and its reverse complement, 5'-GTGCAGT-3'
+            \item A $<$ G
+            \item Use ACTGCAC
+            \end{itemize}
+          \item Because $S(A \cup B)$ is a random sample of $A \cup B$
+            the fraction of elements in $S(A \cup B)$ which are shared
+            by $S(A)$ and $S(B)$ is an unbiased estimate of $J(A,B)$
+          \item $J(A,B) \approx \frac{|S(A \cup B) \cap S(A) \cap S(B)|}{|S(A \cup B)|}$
+          \end{itemize}
+        \end{digression}
+      }
+    \item Hash k-mers (32/64bit)
+    \item Estimate Jaccard Index $J(A,B)$
+      \only<3>{
+        \begin{digression}{Estimating Jaccard Index}
+          $J(A,B) = \frac{|A \cap B|}{|A \cup B|}$
+          \begin{itemize}
+          \item Sample randomly from $A$ and $B$
+          \item Because $S(A \cup B)$ is a random sample of $A \cup B$
+            the fraction of elements in $S(A \cup B)$ which are shared
+            by $S(A)$ and $S(B)$ is an unbiased estimate of $J(A,B)$
+          \item $J(A,B) \approx \frac{|S(A \cup B) \cap S(A) \cap S(B)|}{|S(A \cup B)|}$
+          \end{itemize}
+        \end{digression}
+      }
+    \item<3-> How can we randomly sample?
+    \item<3-> Properties of the hash!
+    \end{itemize}
+  \end{columns}
+\end{frame}
+
+\begin{frame}{Hash functions}
+
+\end{frame}
+
+\begin{frame}{MurmurHash3 -- properties}
+\end{frame}
+
+\begin{frame}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_2.png}
+\end{frame}
+
+\begin{frame}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_3.png}
+\end{frame}
+
+\begin{frame}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_4.png}
+\end{frame}
+
+\begin{frame}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_5.png}
+\end{frame}
+
+
+\section*{References}
+
+\begin{frame}[plain]{References}
+  \begin{center}
+    \mbox{}\vspace{-\baselineskip}
+    \printbibliography[heading=none]
+  \end{center}
+\end{frame}
+
+\end{document}
diff --git a/mash_minhash_paper/.gitignore b/mash_minhash_paper/.gitignore
new file mode 100644
index 0000000..05b64ec
--- /dev/null
+++ b/mash_minhash_paper/.gitignore
@@ -0,0 +1,8 @@
+*.jpg
+*.png
+*.gif
+paper.pdf
+supplemental_info.pdf
+paper_images
+supplemental_images
+supp_data.xlsx
\ No newline at end of file
diff --git a/mash_minhash_paper/Makefile b/mash_minhash_paper/Makefile
new file mode 100644
index 0000000..3fd0220
--- /dev/null
+++ b/mash_minhash_paper/Makefile
@@ -0,0 +1,38 @@
+#!/usr/bin/make -f
+
+# wget -O creates its output file even when the download fails; delete such
+# partial targets so that the next run retries instead of using them.
+.DELETE_ON_ERROR:
+
+.PHONY: all
+
+all: fig_1.png fig_2.png fig_3.png fig_4.png fig_5.png paper_frontpage.png
+
+# Stamp files: pdfimages writes its images under the given name prefix, not
+# to the target, so an empty file records that the extraction succeeded.
+paper_images: paper.pdf
+	pdfimages -j -png $< paper && touch $@
+
+supplemental_images: supplemental_info.pdf
+	pdfimages -j -png $< supplemental && touch $@
+
+# NOTE(review): this URL embeds a signed, time-limited token (exp=1466533957);
+# it probably needs replacing with a stable link to the article PDF.
+paper.pdf:
+	wget -O $@ "http://download.springer.com/static/pdf/329/art%253A10.1186%252Fs13059-016-0997-x.pdf?originUrl=http%3A%2F%2Fgenomebiology.biomedcentral.com%2Farticle%2F10.1186%2Fs13059-016-0997-x&token2=exp=1466533957~acl=%2Fstatic%2Fpdf%2F329%2Fart%25253A10.1186%25252Fs13059-016-0997-x.pdf*~hmac=9273fd8d3eace11243411b29e4939168805da42e97cb7668f1fedd782d4389e8"
+
+# Render page 1 of the paper ([0]) at 300 dpi and crop a 2100x1600 region.
+paper_frontpage.png: paper.pdf
+	convert -density 300 -depth 8 -quality 85 -crop '2100x1600+200+100' '$<[0]' $@
+
+# $(*) is the figure number matched by % (fig_3.gif fetches ...Fig3_HTML.gif).
+fig_%.gif:
+	wget -O $@ "https://static-content.springer.com/image/art%3A10.1186%2Fs13059-016-0997-x/MediaObjects/13059_2016_997_Fig$(*)_HTML.gif"
+
+# Trim the border of the downloaded GIF and reset the canvas to the result.
+fig_%.png: fig_%.gif
+	convert -trim +repage $< $@
+
+supplemental_info.pdf:
+	wget -O $@ "https://static-content.springer.com/esm/art%3A10.1186%2Fs13059-016-0997-x/MediaObjects/13059_2016_997_MOESM1_ESM.pdf"
+