diamond_presentation_2015.Rnw

   1 \documentclass[ignorenonframetext]{beamer}
   2 \usepackage{fontspec}
   3 \setmainfont{FreeSerif}
   4 \setsansfont{FreeSans}
   5 \setmonofont{FreeMono}
   6 \usepackage{url}
   7 \usepackage{fancyhdr}
   8 \usepackage{graphicx}
   9 \usepackage[bf]{caption}
  10 \usepackage{rotating}
  11 \usepackage{wrapfig}
  12 \usepackage{fancybox}
  13 \usepackage{booktabs}
  14 % \usepackage{multirow}
  15 \usepackage{acronym}
  16 \usepackage{qrcode}
  17 \usepackage[backend=biber,natbib=true,hyperref=true,style=nature]{biblatex}
  18 \addbibresource{references.bib}
  19 % \usepackage[nomargin,inline,draft]{fixme}
  20 % \newcommand{\DLA}[1]{\textcolor{red}{\fxnote{DLA: #1}}}
  21 % \usepackage[hyperfigures,bookmarks,colorlinks,citecolor=black,filecolor=black,linkcolor=black,urlcolor=black]{hyperref}
  22 \usepackage{texshade}
  23 \usepackage{tikz}
  24 \usepackage{nameref}
  25 \usepackage{zref-xr,zref-user}
  26 \renewcommand*{\bibfont}{\tiny}
  27
  28 % The textpos package is necessary to position textblocks at arbitrary
  29 % places on the page.  Use showboxes option to show outlines of textboxes.
  30 % \usepackage[absolute]{textpos}
  31 \usepackage[absolute,overlay]{textpos}
  32 \usepackage{mathtools,cancel}
  33
  34 \renewcommand{\CancelColor}{\color{red}} %change cancel color to red
  35
  36 \usepackage{multirow}
  37 \usepackage{array}
  38
  39 \mode<presentation>{
  40   \usetheme{CambridgeUS}
  41   \usecolortheme{crane}
  42   % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html
  43   \definecolor{ilboldblue}{HTML}{002058}
  44   \definecolor{ilboldorange}{HTML}{E87722}
  45   \definecolor{ilblue}{HTML}{606EB2}
  46   \definecolor{ilorange}{HTML}{D45D00}
  47   \logo{\begin{tikzpicture}% Pale figure
  48       {\node[opacity=0.6]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}%
  49         };}%
  50     \end{tikzpicture}}
  51 }
  52
  53 \title[DIAMOND]{DIAMOND: Fast protein alignment}
  54 \author[Don Armstrong]{Don L. Armstrong}
  55 \institute[IGB]{Institute for Genomic Biology, Computing Genomes
  56   for Reproductive Health, University of Illinois, Urbana-Champaign}
  57
  58 \begin{document}
  59
  60 <<load.libraries,echo=FALSE,results="hide",warning=FALSE,message=FALSE,error=FALSE,cache=FALSE>>=
  61 opts_chunk$set(dev="cairo_pdf",out.width="\\textwidth",out.height="0.8\\textheight",out.extra="keepaspectratio") # defaults for every chunk: cairo_pdf figures, included at text width and at most 0.8 of the text height with the aspect ratio kept
  62 #opts_chunk$set(cache=TRUE, autodep=TRUE)
  63 options(device = function(file, width = 8, height = 7, ...) { # replace R's default graphics device (8x7 unless the caller overrides)
  64   cairo_pdf(tempfile(), width = width, height = height, ...) # the requested `file` is ignored; output goes to a temporary file -- presumably to keep stray plots out of the working directory, TODO confirm
  65 })
  66 options(digits=2) # print numbers with two significant digits by default
  67 library("data.table") # NOTE(review): none of the packages attached below is used by any other chunk visible in this file -- confirm before removing
  68 library("ggplot2")
  69 library("reshape2")
  70 library("grid")
  71 library("xtable")
  72
  73 @
  74
  75 \IfFileExists{figures/relevant_xkcd.png}{\frame[plain]{\centering \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{figures/relevant_xkcd.png}
  76
  77     \url{https://xkcd.com/1513/}}}{}
  78
  79 \frame[plain]{\titlepage
  80   \begin{center}
  81     Code and slides are here:
  82
  83     \qrcode[padding]{http://dla2.us/p/dmnd2015}
  84
  85     \url{http://dla2.us/p/dmnd2015}
  86    \end{center}
  87  }
  88
  89
  90
  91 \section{The Problem}
  92 \begin{frame}{The Problem}
  93   \begin{itemize}
  94   \item mRNA sequences
  95   \item No clear reference genome/database
  96   \item How to annotate them?
  97   \end{itemize}
  98   \only<2>{
  99     \begin{textblock*}{64mm}(32mm,0.15\textheight)
 100       \begin{exampleblock}{No Reference Genome}
 101         \begin{center}
 102           \includegraphics[height=0.3\textheight,keepaspectratio]{figures/spalax.jpg}
 103         \end{center}
 104         \begin{itemize}
 105         \item Blind mole rat (\textit{Spalax})
 106         \item No reference genome
 107         \item Close to mouse/rat, but not close enough to use
 108           mouse/rat directly
 109         \item Placenta sequences; how to annotate them?
 110         \end{itemize}
 111       \end{exampleblock}
 112     \end{textblock*}
 113   }
 114     \only<3>{
 115     \begin{textblock*}{64mm}(32mm,0.25\textheight)
 116       \begin{exampleblock}<3>{Environmental Samples}
 117         \begin{itemize}
 118         \item Permafrost
 119         \item Many un-cultured, un-sequenced bacteria
 120         \item How to annotate what genes they are expressing?
 121         \end{itemize}
 122       \end{exampleblock}
 123     \end{textblock*}
 124     }
 125 \end{frame}
 126
 127 \begin{frame}{Basic Solution}
 128   \begin{itemize}
 129   \item Annotate mRNA against proteins in the nr or in a suitable
 130     reference proteome
 131   \item Six possible translations of nucleotide into amino acid
 132   \item Take all possible sub-sequences
 133   \item Hash into reference, extend match
 134   \item Pick best match
 135   \end{itemize}
 136 \end{frame}
 137
 138 \section{The Previous Contenders}
 139 \begin{frame}{Previous Contenders}
 140   \begin{itemize}
 141   \item Blastx
 142   \item Rapsearch
 143   \item mBlast
 144   \item many, many more
 145   \end{itemize}
 146 \end{frame}
 147
 148
 149 \section{Diamond Advances}
 150 \begin{frame}{Diamond Methodology Advances}
 151   \begin{itemize}
 152   \item Seed and Extend
 153   \item Reduced Alphabet
 154   \item Spaced Seeds with Specific Seed Shape
 155   \item Double Indexing
 156   \end{itemize}
 157 \end{frame}
 158
 159 \begin{frame}{Seed and Extend}
 160   \begin{itemize}
 161   \item Calculate an index
 162   \item Look up matching indices in the database
 163   \item Local string alignment using Smith-Waterman
 164   \item Looks like blast, right?
 165   \end{itemize}
 166 \end{frame}
 167
 168 \begin{frame}{Reduced Alphabet}
 169
 170   \begin{itemize}
 171   \item \colorbox{red}{LVI} \colorbox{red!10!white}{M} \colorbox{yellow}{C} \colorbox{black!50!white}{G} \colorbox{cyan}{H} \colorbox{red!50!blue}{\textcolor{white}{STA}}
 172     \colorbox{green}{P} \colorbox{red!50!blue!50!white}{F} \colorbox{red!50!blue!20!white}{Y} \colorbox{black}{\textcolor{white}{W}} \colorbox{blue}{\textcolor{white}{KREDNQ}}
 173 %-    [KREDQN] [C] [G] [H] [ILV] [M] [F] [Y] [W] [P] [STA].
 174   \item Smaller index sizes --- less memory usage
 175   \item Greater sensitivity --- seed more likely to match
 176   \item More likelihood of useless extensions --- only the seed matched
 177   \end{itemize}
 178   {\centering
 179     \includegraphics[height=0.5\textheight,width=0.8\textwidth,keepaspectratio]{figures/reduced_alphabet.png}
 180
 181     \cite{Murphy.ea2000:Simplifiedaminoacidalphabets}
 182   }
 183
 184 \end{frame}
 185
 186 \begin{frame}{Spaced Seeds with Specific Seed Shape}
 187   \begin{itemize}
 188   \item Spaced seeds are longer seeds in which only a subset of the
 189     positions are used
 190   \item For example, if
 191     \begin{itemize}
 192     \item the sequence was ABCDEFGHI
 193     \item the seed shape was
 194       11100010
 195     \item then you would query into the index with ABCG
 196     \end{itemize}
 197   \item Originally presented in PatternHunter\cite{Ma.ea2002:PatternHunterfasterandmore}
 198   \item Why is this better than consecutive seeds?
 199 \end{itemize}
 200 \end{frame}
 201
 202 \begin{frame}{Consecutive Seeds vs Spaced Seeds}
 203   \begin{itemize}
 204   \item Target Sequence: ABCDEFGHIJK
 205   \item Sequenced Sequence: ABC\textcolor{red}{Z}EF\textcolor{red}{Y}HI\textcolor{red}{X}K
 206   \item Seed Shape: 111001 (4) and Consecutive: 1111 (4)
 207   \end{itemize}
 208 % ABCZEFGHIYK
 209 % ABCDEFGHIJK
 210 % 11000010---
 211 % -11000010--
 212 % --11000010-
 213 % ---11000010
 214
 215   \begin{columns}
 216     \column{0.6\textwidth}
 217 \begin{block}{Pathological example}
 218   \begin{tabular}{c c c}
 219     Shift & Spaced & Consecutive \\
 220     0     & ABCF=ABCF & ABCD≠ABCZ \\
 221     1     & BCDG≠BCZY & BCDE≠BCZE \\
 222     2     & CDEH≠CZEH & CDEF≠CZEF \\
 223     3     & DEFI≠ZEFI & DEFG≠ZEFY \\
 224     4     & EFGJ≠EFYX & EFGH≠EFYH \\
 225     5     & FGHK≠FYHK & FGHI≠FYHI \\
 226     6     &           & GHIJ≠YHIX \\
 227     7     &           & HIJK≠HIXK \\
 228   \end{tabular}
 229 \end{block}
 230   \column{0.4\textwidth}
 231   \begin{itemize}
 232   \item Spaced seed matches once
 233   \item Consecutive seed never matches
 234   \item Consecutive seed does more comparisons and may match
 235     repeatedly
 236   \end{itemize}
 237 \end{columns}
 238 \end{frame}
 239
 240 \begin{frame}{Optimal Spaced Seed}
 241   \begin{itemize}
 242   \item Fewest overlaps with shifted seed
 243   \item Longer seeds are better
 244   \item Equivalent weight
 245   \item Use dynamic programming to calculate optimal seed for given
 246     length
 247   \end{itemize}
 248   \begin{columns}
 249     \column{0.6\textwidth}
 250   \begin{block}{DIAMOND Seeds (Fast)}
 251     \begin{itemize}
 252     \item 111101011101111 (12)
 253     \item 111011001100101111 (12)
 254     \item 1111001001010001001111 (12)
 255     \item 111100101000010010010111 (12)
 256     \end{itemize}
 257   \end{block}
 258 \end{columns}
 259 \end{frame}
 260
 261 \begin{frame}{Double Indexing}
 262   \begin{itemize}
 263   \item Blastx indexes the database
 264   \item Blastx runs the queries in input order
 265   \item DIAMOND indexes both the database and the queries
 266   \item DIAMOND runs queries in index order
 267   \item Why is this faster?
 268   \end{itemize}
 269 \end{frame}
 270
 271 \begin{frame}{Double Indexing: Why it's faster}
 272   \begin{itemize}
 273   \item Cache architecture
 274     \begin{itemize}
 275     \item On-CPU cache --- L1, L2
 276     \item Shared CPU cache --- L3
 277     \item Much faster than main memory
 278     \end{itemize}
 279   \item Each cache miss must go to main memory (via the northbridge),
 280     which has significantly more latency than the caches and takes
 281     hundreds of cycles
 282   \item Dictionary Example: Is it faster to look up
 283     \begin{itemize}
 284     \item “apple”, “xylophone”, “appliance”, “xylem”
 285     \item or “apple”, “appliance”, “xylem”, “xylophone”?
 286     \end{itemize}
 287   \end{itemize}
 288 \end{frame}
 289
 290 \section{Comparison}
 291
 292 \subsection{Speed}
 293
 294 \begin{frame}{Speed of DIAMOND}
 295   \begin{center}
 296     \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_a.jpg}
 297   \end{center}
 298 \end{frame}
 299
 300 \subsection{Accuracy}
 301
 302 \begin{frame}{Accuracy of DIAMOND: Any success}
 303   \begin{center}
 304     \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_b.jpg}
 305   \end{center}
 306 \end{frame}
 307
 308 \begin{frame}{Accuracy of DIAMOND: Matches blastx}
 309   \begin{center}
 310     \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_c.jpg}
 311   \end{center}
 312 \end{frame}
 313
 314 \section{Usage}
 315
 316 \begin{frame}{DIAMOND Usage}
 317   \begin{itemize}
 318   \item Make the diamond database:
 319     \texttt{diamond makedb --in foo.fasta --db foo.dmnd;}
 320   \item Run the diamond query:
 321     \texttt{diamond blastx --db foo.dmnd --threads 24 --query bar.fasta --daa bar\_diamond.txt}
 322   \end{itemize}
 323 \end{frame}
 324
 325
 326 \subsection{Output}
 327 \begin{frame}{DIAMOND Output}
 328   \begin{itemize}
 329   \item Standard BLASTx output
 330   \item Equivalent evalues and bit scores
 331   \item An example from \textit{Spalax} (the top two proteins are
 332     isoforms):
 333   \end{itemize}
 334   \tiny
 335   \begin{tabular}{c c c c c c c c c c c c}
 336     query & match & \% ident & length & \# mm & gap & qst & qstp & sstart & sstop & evalue & score \\
 337 c18\_g1\_i1 & ...065786 & 94.5 & 361 & 20 & 0 & 2 & 1084 & 992 & 1352 & 5.7e-203 & 704.9 \\
 338 c18\_g1\_i1 & ...081540 & 94.5 & 361 & 20 & 0 & 2 & 1084 & 940 & 1300 & 5.7e-203 & 704.9 \\
 339 c18\_g1\_i1 & ...142322 & 48.8 & 361 & 178 & 3 & 5 & 1078 & 944 & 1300 & 5.9e-99 & 359.4 \\
 340 c18\_g1\_i1 & ...039711 & 48.8 & 361 & 178 & 3 & 5 & 1078 & 936 & 1292 & 5.9e-99 & 359.4 \\
 341 c18\_g1\_i1 & ...141518 & 43.0 & 230 & 124 & 3 & 5 & 685 & 936 & 1161 & 1.7e-50 & 198.4 \\
 342 \end{tabular}
 343 \end{frame}
 344
 345
 346
 347 \section*{References}
 348
 349 \begin{frame}[plain]{References}
 350   \begin{center}
 351     \mbox{}\vspace{-\baselineskip}
 352     \printbibliography[heading=none]
 353   \end{center}
 354 \end{frame}
 355
 356 \end{document}