add initial work on minhash paper slides for hpcbio

author Don Armstrong <don@donarmstrong.com>

Wed, 22 Jun 2016 17:15:52 +0000 (10:15 -0700)

committer Don Armstrong <don@donarmstrong.com>

Wed, 22 Jun 2016 17:15:52 +0000 (10:15 -0700)
author Don Armstrong <don@donarmstrong.com>
Wed, 22 Jun 2016 17:15:52 +0000 (10:15 -0700)
committer Don Armstrong <don@donarmstrong.com>
Wed, 22 Jun 2016 17:15:52 +0000 (10:15 -0700)
diff --git a/.gitignore b/.gitignore

new file mode 100644 (file)

index 0000000..6647d1b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,23 @@
+auto
+figure
+*.aux
+*.bbl
+*.bcf
+*.blg
+*.fdb_latexmk
+*.fls
+*.log
+*.out
+hpcbio_mash_minhash_jun_2016.pdf
+hpcbio_mash_minhash_jun_2016.tex
+*.run.xml
+*.rip
+cache
+*.lof
+*.lot
+*.toc
+*.fff
+*.ttt
+references.bib
+*.nav
+*.snm
diff --git a/Makefile b/Makefile

new file mode 100644 (file)

index 0000000..10d1627
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,37 @@
+#!/usr/bin/make -f
+
+# Build the slide deck: knit the .Rnw source to .tex with knitr, then
+# typeset the .tex with latexmk driving XeLaTeX.
+
+.PHONY: all
+all: hpcbio_mash_minhash_jun_2016.pdf
+
+# R interpreter used for knitting; override with `make R=/path/to/R`.
+R ?= R
+
+# Export an SVG to PDF and crop it.  pdfcrop is given an explicit output
+# name derived from the stem, so stray *-crop.pdf files in the same
+# directory can never be picked up by the mv.
+%.pdf: %.svg
+       inkscape -A $@ $<
+       pdfcrop $@ $*-crop.pdf
+       mv $*-crop.pdf $@
+
+# Export an SVG to PNG at 300 dpi.
+%.png: %.svg
+       inkscape -e $@ -d 300 $<
+
+# Knit an Rnw document to LaTeX.
+%.tex: %.Rnw
+       $(R) --encoding=utf-8 -e "library('knitr'); knit('$<')"
+
+# NOTE(review): mash_minhash_paper and figures are directories, so make only
+# checks that they exist and compares their timestamps; nothing in this file
+# runs the Makefiles inside them -- presumably they are built by hand first.
+hpcbio_mash_minhash_jun_2016.pdf: hpcbio_mash_minhash_jun_2016.tex mash_minhash_paper figures
+
+# Typeset with XeLaTeX via latexmk.  Every .bib/.tex file present when this
+# Makefile is parsed counts as a prerequisite.
+%.pdf: %.tex $(wildcard *.bib) $(wildcard *.tex)
+       latexmk -pdf -pdflatex='xelatex -interaction=nonstopmode %O %S' -bibtex -use-make $<
+
diff --git a/figures/.gitignore b/figures/.gitignore

new file mode 100644 (file)

index 0000000..e8c4a45
--- /dev/null
+++ b/figures/.gitignore
@@ -0,0 +1,7 @@
+igb_wordmark.pdf
+igb_wordmarks.zip
+relevant_xkcd.png
+uofi_mark.pdf
+uofi_mark.zip
+reduced_alphabet.png
+spalax.jpg
+\ No newline at end of file
diff --git a/figures/Makefile b/figures/Makefile

new file mode 100644 (file)

index 0000000..4c3ae09
--- /dev/null
+++ b/figures/Makefile
@@ -0,0 +1,45 @@
+#!/usr/bin/make -f
+
+# Fetch the logos and illustrations used by the slides; the EPS logos are
+# converted to PDF and one JPEG is converted to PNG.
+
+# Remove a target whose recipe failed, so an interrupted or failed download
+# is not mistaken for an up-to-date file on the next run.
+.DELETE_ON_ERROR:
+
+.PHONY: all
+all: uofi_mark.pdf igb_wordmark.pdf spalax.jpg reduced_alphabet.png
+
+igb_wordmarks.zip:
+       wget -O $@ "http://www.igb.illinois.edu/sites/default/files/upload/Wordmarks-Eps.zip"
+
+# Extract only the EPS that is needed, convert it, then drop the EPS.
+# unzip -o overwrites a leftover EPS instead of prompting.
+igb_wordmark.pdf: igb_wordmarks.zip
+       unzip -o $< Wordmark-black-tag-outlines.eps
+       epstopdf --outfile=$@ Wordmark-black-tag-outlines.eps
+       rm -f Wordmark-black-tag-outlines.eps
+
+uofi_mark.zip:
+       wget -O $@ "http://identitystandards.illinois.edu/assets/logos/uclogo_1867_horz_bold.zip"
+
+# Same extract/convert/clean-up sequence; the EPS sits in a subdirectory of
+# the archive, so the whole extracted tree is removed afterwards.
+uofi_mark.pdf: uofi_mark.zip
+       unzip -o $< uclogo_1867_horz_bold/pc/uclogo_1867_horz_bold.eps
+       epstopdf --outfile=$@ uclogo_1867_horz_bold/pc/uclogo_1867_horz_bold.eps
+       rm -rf uclogo_1867_horz_bold
+
+# Not part of `all`; build explicitly with `make relevant_xkcd.png`.
+relevant_xkcd.png:
+       wget -O $@ "http://imgs.xkcd.com/comics/optimization.png"
+
+spalax.jpg:
+       wget -O $@ "https://upload.wikimedia.org/wikipedia/commons/9/90/Spalax_microphthalmus_Syzran_Russia.jpg"
+
+# Download the JPEG under a target-specific temporary name, convert it to
+# PNG, and remove the temporary afterwards.
+reduced_alphabet.png:
+       wget -O $@.tmp.jpg "http://peds.oxfordjournals.org/content/13/3/149/F1.large.jpg"
+       convert $@.tmp.jpg $@
+       rm -f $@.tmp.jpg
diff --git a/hpcbio_mash_minhash_jun_2016.Rnw b/hpcbio_mash_minhash_jun_2016.Rnw

new file mode 100644 (file)

index 0000000..9b13264
--- /dev/null
+++ b/hpcbio_mash_minhash_jun_2016.Rnw
@@ -0,0 +1,176 @@
+\documentclass[ignorenonframetext]{beamer}
+\usepackage{fontspec}
+\setmainfont{FreeSerif}
+\setsansfont{FreeSans}
+\setmonofont{FreeMono}
+\usepackage{url}
+\usepackage{fancyhdr}
+\usepackage{graphicx}
+\usepackage[bf]{caption}
+\usepackage{rotating}
+\usepackage{wrapfig}
+\usepackage{fancybox}
+\usepackage{booktabs}
+% \usepackage{multirow}
+\usepackage{acronym}
+\usepackage{qrcode}
+\usepackage[backend=biber,natbib=true,hyperref=true,style=nature]{biblatex}
+\addbibresource{references.bib}
+% \usepackage[nomargin,inline,draft]{fixme}
+% \newcommand{\DLA}[1]{\textcolor{red}{\fxnote{DLA: #1}}}
+% \usepackage[hyperfigures,bookmarks,colorlinks,citecolor=black,filecolor=black,linkcolor=black,urlcolor=black]{hyperref}
+\usepackage{texshade}
+\usepackage{tikz}
+\usepackage{nameref}
+\usepackage{zref-xr,zref-user}
+\renewcommand*{\bibfont}{\tiny}
+
+% The textpos package is necessary to position textblocks at arbitrary
+% places on the page.  Use showboxes option to show outlines of textboxes.
+% \usepackage[absolute]{textpos}
+\usepackage[absolute,overlay]{textpos}
+\usepackage{mathtools,cancel}
+
+\renewcommand{\CancelColor}{\color{red}} %change cancel color to red
+\newenvironment{digression}[1]{\begin{textblock*}{64mm}(0.6\textwidth,0.2\textheight)%
+    \begin{block}{#1}}{%
+\end{block}\end{textblock*}}
+
+
+\usepackage{multirow}
+\usepackage{array}
+
+\mode<presentation>{ 
+  \usetheme{CambridgeUS}
+  \usecolortheme{crane}
+  % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html
+  \definecolor{ilboldblue}{HTML}{002058}
+  \definecolor{ilboldorange}{HTML}{E87722}
+  \definecolor{ilblue}{HTML}{606EB2}
+  \definecolor{ilorange}{HTML}{D45D00}
+  \logo{\begin{tikzpicture}% Pale figure
+      {\node[opacity=0.6]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}%
+        };}%
+    \end{tikzpicture}}
+}
+
+\title[MASH]{Mash: fast genome and metagenome distance estimation}
+\author[Don Armstrong]{Don L. Armstrong}
+\institute[IGB]{Institute for Genomic Biology, Computing Genomes 
+  for Reproductive Health, University of Illinois, Urbana-Champaign}
+
+\begin{document}
+
+<<load.libraries,echo=FALSE,results="hide",warning=FALSE,message=FALSE,error=FALSE,cache=FALSE>>=
+opts_chunk$set(dev="cairo_pdf",out.width="\\textwidth",out.height="0.8\\textheight",out.extra="keepaspectratio")
+#opts_chunk$set(cache=TRUE, autodep=TRUE)
+options(device = function(file, width = 8, height = 7, ...) {
+  cairo_pdf(tempfile(), width = width, height = height, ...)
+})
+options(digits=2)
+library("data.table")
+library("ggplot2")
+library("reshape2")
+library("grid")
+library("xtable")
+
+@ 
+
+\IfFileExists{figures/relevant_xkcd.png}{\frame[plain]{\centering \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{figures/relevant_xkcd.png}
+
+    \url{https://xkcd.com/1691/}}}
+
+\frame[plain]{\titlepage
+  \begin{center}
+    Code and slides are here: 
+    
+    \qrcode[padding]{http://dla2.us/p/mashminhash2016}
+    
+    \url{http://dla2.us/p/mashminhash2016}
+   \end{center}
+ }
+
+
+\frame[plain]{
+  \begin{center}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/paper_frontpage}
+\end{center}
+}
+
+\begin{frame}{MinHash Algorithm}
+  \begin{columns}
+    \column{0.3\textwidth}
+    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_1.png}
+    \column{0.7\textwidth}
+    \begin{itemize}
+    \item Decompose dataset into k-mers
+      \only<2>{
+        \begin{digression}{What about strandedness}
+          \begin{itemize} 
+          \item Take lexically lowest sequence
+            \begin{itemize}
+            \item Given 7-mers 5'-ACTGCAC-3' and its reverse complement, 5'-GTGCAGT-3'
+            \item A $<$ G
+            \item Use ACTGCAC
+            \end{itemize}
+          \item Because $S(A \cup B)$ is a random sample of $A \cup B$
+            the fraction of elements in $S(A \cup B)$ which are shared
+            by $S(A)$ and $S(B)$ is an unbiased estimate of $J(A,B)$
+          \item $J(A,B) \approx \frac{|S(A \cup B) \cap S(A) \cap S(B)|}{|S(A \cup B)|}$
+          \end{itemize}
+        \end{digression}
+      }
+    \item Hash k-mers (32/64bit)
+    \item Estimate Jaccard Index $J(A,B)$
+      \only<3>{
+        \begin{digression}{Estimating Jaccard Index}
+          $J(A,B) = \frac{|A \cap B|}{|A \cup B|}$ 
+          \begin{itemize} 
+          \item Sample randomly from $A$ and $B$
+          \item Because $S(A \cup B)$ is a random sample of $A \cup B$
+            the fraction of elements in $S(A \cup B)$ which are shared
+            by $S(A)$ and $S(B)$ is an unbiased estimate of $J(A,B)$
+          \item $J(A,B) \approx \frac{|S(A \cup B) \cap S(A) \cap S(B)|}{|S(A \cup B)|}$
+          \end{itemize}
+        \end{digression}
+      }
+    \item<3-> How can we randomly sample?
+    \item<3-> Properties of the hash!
+    \end{itemize}
+  \end{columns}
+\end{frame}
+
+\begin{frame}{Hash functions}
+  
+\end{frame}
+
+\begin{frame}{MurmurHash3 -- properties}
+\end{frame}
+
+\begin{frame}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_2.png}
+\end{frame}
+
+\begin{frame}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_3.png}
+\end{frame}
+
+\begin{frame}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_4.png}
+\end{frame}
+
+\begin{frame}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{mash_minhash_paper/fig_5.png}
+\end{frame}
+
+
+\section*{References}
+
+\begin{frame}[plain]{References}
+  \begin{center}
+    \mbox{}\vspace{-\baselineskip}
+    \printbibliography[heading=none]
+  \end{center}
+\end{frame}
+
+\end{document}
diff --git a/mash_minhash_paper/.gitignore b/mash_minhash_paper/.gitignore

new file mode 100644 (file)

index 0000000..05b64ec
--- /dev/null
+++ b/mash_minhash_paper/.gitignore
@@ -0,0 +1,8 @@
+*.jpg
+*.png
+*.gif
+paper.pdf
+supplemental_info.pdf
+paper_images
+supplemental_images
+supp_data.xlsx
+\ No newline at end of file
diff --git a/mash_minhash_paper/Makefile b/mash_minhash_paper/Makefile

new file mode 100644 (file)

index 0000000..3fd0220
--- /dev/null
+++ b/mash_minhash_paper/Makefile
@@ -0,0 +1,42 @@
+#!/usr/bin/make -f
+
+# Fetch the paper (DOI 10.1186/s13059-016-0997-x, per the URLs below), its
+# figures and its supplement, and derive the images used by the slides.
+
+# Remove a target whose recipe failed, so a failed download is not mistaken
+# for an up-to-date file on the next run.
+.DELETE_ON_ERROR:
+
+.PHONY: all
+all: fig_1.png fig_2.png fig_3.png fig_4.png fig_5.png paper_frontpage.png
+
+# paper_images and supplemental_images are stamp files: pdfimages writes
+# image files prefixed paper/supplemental, and the stamp records that it ran.
+paper_images: paper.pdf
+       pdfimages -j -png $< paper
+       touch $@
+
+supplemental_images: supplemental_info.pdf
+       pdfimages -j -png $< supplemental
+       touch $@
+
+# NOTE(review): the URL carries a token2=exp=... parameter, which looks like
+# an expiring signature -- confirm it still works or replace it.
+paper.pdf:
+       wget -O $@ "http://download.springer.com/static/pdf/329/art%253A10.1186%252Fs13059-016-0997-x.pdf?originUrl=http%3A%2F%2Fgenomebiology.biomedcentral.com%2Farticle%2F10.1186%2Fs13059-016-0997-x&token2=exp=1466533957~acl=%2Fstatic%2Fpdf%2F329%2Fart%25253A10.1186%25252Fs13059-016-0997-x.pdf*~hmac=9273fd8d3eace11243411b29e4939168805da42e97cb7668f1fedd782d4389e8"
+
+# Render the first page of the paper at 300 dpi and crop a 2100x1600 region.
+paper_frontpage.png: paper.pdf
+       convert -density 300 -depth 8 -quality 85 -crop '2100x1600+200+100' '$<[0]' $@
+
+# The stem is the figure number, substituted into the download URL.
+fig_%.gif:
+       wget -O $@ "https://static-content.springer.com/image/art%3A10.1186%2Fs13059-016-0997-x/MediaObjects/13059_2016_997_Fig$(*)_HTML.gif"
+
+# Trim uniform borders from the downloaded GIF, reset the canvas, save as PNG.
+fig_%.png: fig_%.gif
+       convert -trim +repage $< $@
+
+supplemental_info.pdf:
+       wget -O $@ "https://static-content.springer.com/esm/art%3A10.1186%2Fs13059-016-0997-x/MediaObjects/13059_2016_997_MOESM1_ESM.pdf"
+
author	Don Armstrong <don@donarmstrong.com>
	Wed, 22 Jun 2016 17:15:52 +0000 (10:15 -0700)
committer	Don Armstrong <don@donarmstrong.com>
	Wed, 22 Jun 2016 17:15:52 +0000 (10:15 -0700)
.gitignore	[new file with mode: 0644]	patch \| blob
Makefile	[new file with mode: 0644]	patch \| blob
figures/.gitignore	[new file with mode: 0644]	patch \| blob
figures/Makefile	[new file with mode: 0644]	patch \| blob
hpcbio_mash_minhash_jun_2016.Rnw	[new file with mode: 0644]	patch \| blob
mash_minhash_paper/.gitignore	[new file with mode: 0644]	patch \| blob
mash_minhash_paper/Makefile	[new file with mode: 0644]	patch \| blob