1 \documentclass[ignorenonframetext]{beamer}
3 \setmainfont{FreeSerif}
9 \usepackage[bf]{caption}
14 % \usepackage{multirow}
17 \usepackage[backend=biber,natbib=true,hyperref=true,style=nature]{biblatex}
18 \addbibresource{references.bib}
19 % \usepackage[nomargin,inline,draft]{fixme}
20 % \newcommand{\DLA}[1]{\textcolor{red}{\fxnote{DLA: #1}}}
21 % \usepackage[hyperfigures,bookmarks,colorlinks,citecolor=black,filecolor=black,linkcolor=black,urlcolor=black]{hyperref}
25 \usepackage{zref-xr,zref-user}
26 \renewcommand*{\bibfont}{\tiny}
28 % The textpos package is necessary to position textblocks at arbitrary
29 % places on the page. Use showboxes option to show outlines of textboxes.
30 % \usepackage[absolute]{textpos}
31 \usepackage[absolute,overlay]{textpos}
32 \usepackage{mathtools,cancel}
34 \renewcommand{\CancelColor}{\color{red}} %change cancel color to red
40 \usetheme{CambridgeUS}
42 % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html
43 \definecolor{ilboldblue}{HTML}{002058}
44 \definecolor{ilboldorange}{HTML}{E87722}
45 \definecolor{ilblue}{HTML}{606EB2}
46 \definecolor{ilorange}{HTML}{D45D00}
47 \logo{\begin{tikzpicture}% Pale figure
48 {\node[opacity=0.6]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}%
53 \title[DIAMOND]{DIAMOND: Fast protein alignment}
54 \author[Don Armstrong]{Don L. Armstrong}
55 \institute[IGB]{Institute for Genomic Biology, Computing Genomes
56 for Reproductive Health, University of Illinois, Urbana-Champaign}
60 <<load.libraries,echo=FALSE,results="hide",warning=FALSE,message=FALSE,error=FALSE,cache=FALSE>>=
61 opts_chunk$set(dev="cairo_pdf",out.width="\\textwidth",out.height="0.8\\textheight",out.extra="keepaspectratio")
62 #opts_chunk$set(cache=TRUE, autodep=TRUE)
63 options(device = function(file, width = 8, height = 7, ...) {
64 cairo_pdf(tempfile(), width = width, height = height, ...)
75 \IfFileExists{figures/relevant_xkcd.png}{\frame[plain]{\centering \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{figures/relevant_xkcd.png}
77 \url{https://xkcd.com/1513/}}}
79 \frame[plain]{\titlepage
81 Code and slides are here:
83 \qrcode[padding]{http://dla2.us/p/dmnd2015}
85 \url{http://dla2.us/p/dmnd2015}
92 \begin{frame}{The Problem}
95 \item No clear reference genome/database
96 \item How to annotate them?
99 \begin{textblock*}{64mm}(32mm,0.15\textheight)
100 \begin{exampleblock}{No Reference Genome}
102 \includegraphics[height=0.3\textheight,keepaspectratio]{figures/spalax.jpg}
105 \item Blind mole rat (\textit{Spalax})
106 \item No reference genome
107 \item Close to mouse/rat, but not close enough to use
109 \item Placenta sequences; how to annotate them?
115 \begin{textblock*}{64mm}(32mm,0.25\textheight)
116 \begin{exampleblock}<3>{Environmental Samples}
119 \item Many un-cultured, un-sequenced bacteria
120 \item How to annotate what genes they are expressing?
127 \begin{frame}{Basic Solution}
129 \item Annotate mRNA against proteins in the nr or in a suitable
131 \item Six possible translations of nucleotide into amino acid
132 \item Take all possible sub-sequences
133 \item Hash into reference, extend match
134 \item Pick best match
138 \section{The Previous Contenders}
139 \begin{frame}{Previous Contenders}
144 \item many, many more
149 \section{Diamond Advances}
150 \begin{frame}{Diamond Methodology Advances}
152 \item Seed and Extend
153 \item Reduced Alphabet
154 \item Spaced Seeds with Specific Seed Shape
155 \item Double Indexing
159 \begin{frame}{Seed and Extend}
161 \item Calculate an index
162 \item Look up matching indices in the database
163 \item Local string alignment using Smith-Waterman
164 \item Looks like blast, right?
168 \begin{frame}{Reduced Alphabet}
171 \item \colorbox{red}{LVI} \colorbox{red!10!white}{M} \colorbox{yellow}{C} \colorbox{black!50!white}{G} \colorbox{red!50!blue}{\textcolor{white}{STA}}
172 \colorbox{green}{P} \colorbox{red!50!blue!50!white}{F} \colorbox{red!50!blue!20!white}{Y} \colorbox{black}{\textcolor{white}{W}} \colorbox{blue}{\textcolor{white}{KREDNQ}}
173 %- [KREDQN] [C] [G] [H] [ILV] [M] [F] [Y] [W] [P] [STA].
174 \item Smaller index sizes --- less memory usage
175 \item Greater sensitivity --- seed more likely to match
176 \item More likelihood of useless extensions --- only the seed matched
179 \includegraphics[height=0.5\textheight,width=0.8\textwidth,keepaspectratio]{figures/reduced_alphabet.png}
181 \cite{Murphy.ea2000:Simplifiedaminoacidalphabets}
186 \begin{frame}{Spaced Seeds with Specific Seed Shape}
188 \item Spaced seeds are longer seeds in which only a subset of the
190 \item For example, if
192 \item the sequence was ABCDEFGHI
193 \item the seed shape was
195 \item then you would query into the index with ABCG
197 \item Originally presented in PatternHunter\cite{Ma.ea2002:PatternHunterfasterandmore}
198 \item Why is this better than consecutive seeds?
202 \begin{frame}{Consecutive Seeds vs Spaced Seeds}
204 \item Target Sequence: ABCDEFGHIJK
205 \item Sequenced Sequence: ABC\textcolor{red}{Z}EF\textcolor{red}{Y}HI\textcolor{red}{X}K
206 \item Seed Shape: 111001 (4) and Consecutive: 1111 (4)
216 \column{0.6\textwidth}
217 \begin{block}{Pathological example}
218 \begin{tabular}{c c c}
219 Shift & Spaced & Consecutive \\
220 0 & ABCF=ABCF & ABCD≠ABCZ \\
221 1 & BCDG≠BCZY & BCDE≠BCZE \\
222 2 & CDEH≠CZEH & CDEF≠CZEF \\
223 3 & DEFI≠ZEFI & DEFG≠ZEFY \\
224 4 & EFGJ≠EFYX & EFGH≠EFYH \\
225 5 & FGHK≠FYHK & FGHI≠FYHI \\
230 \column{0.4\textwidth}
232 \item Spaced seed matches once
233 \item Consecutive seed never matches
234 \item Consecutive seed does more comparisons and may match
240 \begin{frame}{Optimal Spaced Seed}
242 \item Fewest overlaps with shifted seed
243 \item Longer seeds are better
244 \item Equivalent weight
245 \item Use dynamic programming to calculate optimal seed for given
249 \column{0.6\textwidth}
250 \begin{block}{DIAMOND Seeds (Fast)}
252 \item 111101011101111 (12)
253 \item 111011001100101111 (12)
254 \item 1111001001010001001111 (12)
255 \item 111100101000010010010111 (12)
261 \begin{frame}{Double Indexing}
263 \item Blastx indexes the database
264 \item Blastx runs the queries in input order
265 \item DIAMOND indexes both the database and the queries
266 \item DIAMOND runs queries in index order
267 \item Why is this faster?
271 \begin{frame}{Double Indexing: Why it's faster}
273 \item Cache architecture
275 \item On CPU Cache -- L1,L2
276 \item Shared CPU Cache L3
277 \item Much faster than main memory
279 \item Each cache miss must hit main memory (must hit northbridge,
280 which has significantly more latency than the CPU cache, and takes
282 \item Dictionary Example: Is it faster to look up
284 \item “apple”, “xylophone”, “appliance”, “xylem”
285 \item or “apple”, “appliance”, “xylem”, “xylophone”?
294 \begin{frame}{Speed of DIAMOND}
296 \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_a.jpg}
300 \subsection{Accuracy}
302 \begin{frame}{Accuracy of DIAMOND: Any success}
304 \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_b.jpg}
308 \begin{frame}{Accuracy of DIAMOND: Matches blastx}
310 \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_c.jpg}
316 \begin{frame}{DIAMOND Usage}
318 \item Make the diamond database:
319 \texttt{diamond makedb --in foo.fasta --db foo.dmnd;}
320 \item Run the diamond query:
321 \texttt{diamond blastx --db foo.dmnd --threads 24 --query bar.fasta --daa bar\_diamond.txt}
327 \begin{frame}{DIAMOND Output}
329 \item Standard BLASTx output
330 \item Equivalent evalues and bit scores
331 \item An example from \textit{Spalax} (the top two proteins are
335 \begin{tabular}{c c c c c c c c c c c c}
336 query & match & \% ident & length & \# mm & gap & qst & qstp & sstart & sstop & evalue & score \\
337 c18\_g1\_i1 & ...065786 & 94.5 & 361 & 20 & 0 & 2 & 1084 & 992 & 1352 & 5.7e-203 & 704.9 \\
338 c18\_g1\_i1 & ...081540 & 94.5 & 361 & 20 & 0 & 2 & 1084 & 940 & 1300 & 5.7e-203 & 704.9 \\
339 c18\_g1\_i1 & ...142322 & 48.8 & 361 & 178 & 3 & 5 & 1078 & 944 & 1300 & 5.9e-99 & 359.4 \\
340 c18\_g1\_i1 & ...039711 & 48.8 & 361 & 178 & 3 & 5 & 1078 & 936 & 1292 & 5.9e-99 & 359.4 \\
341 c18\_g1\_i1 & ...141518 & 43.0 & 230 & 124 & 3 & 5 & 685 & 936 & 1161 & 1.7e-50 & 198.4 \\
347 \section*{References}
349 \begin{frame}[plain]{References}
351 \mbox{}\vspace{-\baselineskip}
352 \printbibliography[heading=none]