% Beamer slides: "DIAMOND: Fast protein alignment".
% The source contains a knitr-style R chunk (<<...>>= ... @), so it is
% presumably meant to be knitted to .tex first -- confirm the build steps.
% fontspec is loaded, which requires XeLaTeX or LuaLaTeX.
\documentclass[ignorenonframetext]{beamer}

% --- Fonts ---------------------------------------------------------------
\usepackage{fontspec}
\setmainfont{FreeSerif}
\setsansfont{FreeSans}
\setmonofont{FreeMono}

% --- General packages ----------------------------------------------------
\usepackage{url}
\usepackage{fancyhdr}
\usepackage{graphicx}
\usepackage[bf]{caption}
\usepackage{rotating}
\usepackage{wrapfig}
\usepackage{fancybox}
\usepackage{booktabs}
% \usepackage{multirow}
\usepackage{acronym}
\usepackage{qrcode}

% --- Bibliography --------------------------------------------------------
\usepackage[backend=biber,natbib=true,hyperref=true,style=nature]{biblatex}
\addbibresource{references.bib}

% \usepackage[nomargin,inline,draft]{fixme}
% \newcommand{\DLA}[1]{\textcolor{red}{\fxnote{DLA: #1}}}
% \usepackage[hyperfigures,bookmarks,colorlinks,citecolor=black,filecolor=black,linkcolor=black,urlcolor=black]{hyperref}
\usepackage{texshade}
\usepackage{tikz}
\usepackage{nameref}
\usepackage{zref-xr,zref-user}
% Bibliography entries are set in \tiny.
\renewcommand*{\bibfont}{\tiny}

% The textpos package is necessary to position textblocks at arbitrary
% places on the page. Use showboxes option to show outlines of textboxes.
% \usepackage[absolute]{textpos}
\usepackage[absolute,overlay]{textpos}
\usepackage{mathtools,cancel}
\renewcommand{\CancelColor}{\color{red}} %change cancel color to red
\usepackage{multirow}
\usepackage{array}

% --- Theme, colours and logo ---------------------------------------------
% NOTE(review): the mode specification was missing in this copy ("\mode{");
% it is restored here as <presentation> -- confirm against the original.
\mode<presentation>{
  \usetheme{CambridgeUS}
  \usecolortheme{crane}
  % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html
  \definecolor{ilboldblue}{HTML}{002058}
  \definecolor{ilboldorange}{HTML}{E87722}
  \definecolor{ilblue}{HTML}{606EB2}
  \definecolor{ilorange}{HTML}{D45D00}
  % The logo is only included when the image file is present.
  \logo{\begin{tikzpicture}% Pale figure
      {\node[opacity=0.6]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}%
        };}%
    \end{tikzpicture}}
}

% --- Title information ---------------------------------------------------
\title[DIAMOND]{DIAMOND: Fast protein alignment}
\author[Don Armstrong]{Don L. Armstrong}
\institute[IGB]{Institute for Genomic Biology, Computing Genomes for
  Reproductive Health, University of Illinois, Urbana-Champaign}

\begin{document}

% R setup chunk: sets knitr figure defaults and loads the R libraries.
% NOTE(review): the chunk header was garbled to "<>=" in this copy; the
% label and include=FALSE below are a reconstruction -- confirm the options.
<<setup, include=FALSE>>=
opts_chunk$set(dev="cairo_pdf",out.width="\\textwidth",out.height="0.8\\textheight",out.extra="keepaspectratio")
#opts_chunk$set(cache=TRUE, autodep=TRUE)
options(device = function(file, width = 8, height = 7, ...) {
  cairo_pdf(tempfile(), width = width, height = height, ...)
})
options(digits=2)
library("data.table")
library("ggplot2")
library("reshape2")
library("grid")
library("xtable")
@

% Optional opening comic, only if the image file is available.
% \IfFileExists takes three arguments; the empty third one keeps it from
% picking up the following \frame as its "file missing" branch.
\IfFileExists{figures/relevant_xkcd.png}{\frame[plain]{\centering
    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{figures/relevant_xkcd.png}

    \url{https://xkcd.com/1513/}}}{}

\frame[plain]{\titlepage
  \begin{center}
    Code and slides are here:
    \qrcode[padding]{http://dla2.us/p/dmnd2015}
    \url{http://dla2.us/p/dmnd2015}
  \end{center}
}

\section{The Problem}

\begin{frame}{The Problem}
  \begin{itemize}
  \item mRNA sequences
  \item No clear reference genome/database
  \item How to annotate them?
  \end{itemize}
  % Overlay 2: example of an organism without a reference genome.
  \only<2>{
    \begin{textblock*}{64mm}(32mm,0.15\textheight)
      \begin{exampleblock}{No Reference Genome}
        \begin{center}
          \includegraphics[height=0.3\textheight,keepaspectratio]{figures/spalax.jpg}
        \end{center}
        \begin{itemize}
        \item Blind mole rat (\textit{Spalax})
        \item No reference genome
        \item Close to mouse/rat, but not close enough to use
          mouse/rat directly
        \item Placenta sequences; how to annotate them?
        \end{itemize}
      \end{exampleblock}
    \end{textblock*}
  }
  % Overlay 3: example of environmental samples.
  \only<3>{
    \begin{textblock*}{64mm}(32mm,0.25\textheight)
      \begin{exampleblock}<3>{Environmental Samples}
        \begin{itemize}
        \item Permafrost
        \item Many un-cultured, un-sequenced bacteria
        \item How to annotate what genes they are expressing?
        \end{itemize}
      \end{exampleblock}
    \end{textblock*}
  }
\end{frame}

\begin{frame}{Basic Solution}
  \begin{itemize}
  \item Annotate mRNA against proteins in the nr or in a suitable
    reference proteome
  \item Six possible translations of nucleotide into amino acid
  \item Take all possible sub-sequences
  \item Hash into reference, extend match
  \item Pick best match
  \end{itemize}
\end{frame}

\section{The Previous Contenders}

\begin{frame}{Previous Contenders}
  \begin{itemize}
  \item Blastx
  \item Rapsearch
  \item mBlast
  \item many, many, more
  \end{itemize}
\end{frame}

\section{Diamond Advances}

\begin{frame}{Diamond Methodology Advances}
  \begin{itemize}
  \item Seed and Extend
  \item Reduced Alphabet
  \item Spaced Seeds with Specific Seed Shape
  \item Double Indexing
  \end{itemize}
\end{frame}

\begin{frame}{Seed and Extend}
  \begin{itemize}
  \item Calculate an index
  \item Look up matching indices in the database
  \item Local string alignment using Smith-Waterman
  \item Looks like blast, right?
  \end{itemize}
\end{frame}

\begin{frame}{Reduced Alphabet}
  \begin{itemize}
  \item \colorbox{red}{LVI}
    \colorbox{red!10!white}{M}
    \colorbox{yellow}{C}
    \colorbox{black!50!white}{G}
    \colorbox{red!50!blue}{\textcolor{white}{STA}}
    \colorbox{green}{P}
    \colorbox{red!50!blue!50!white}{F}
    \colorbox{red!50!blue!20!white}{Y}
    \colorbox{black}{\textcolor{white}{W}}
    \colorbox{blue}{\textcolor{white}{KREDNQ}}
    %- [KREDQN] [C] [G] [H] [ILV] [M] [F] [Y] [W] [P] [STA].
  \item Smaller index sizes --- less memory usage
  \item Greater sensitivity --- seed more likely to match
  \item More likelihood of useless extensions --- only the seed matched
  \end{itemize}
  % \par before the closing brace is needed for \centering to take effect.
  {\centering
    \includegraphics[height=0.5\textheight,width=0.8\textwidth,keepaspectratio]{figures/reduced_alphabet.png}
    \cite{Murphy.ea2000:Simplifiedaminoacidalphabets}
    \par}
\end{frame}

\begin{frame}{Spaced Seeds with Specific Seed Shape}
  \begin{itemize}
  \item Spaced seeds are longer seeds in which only a subset of the
    positions are used
  \item For example, if
    \begin{itemize}
    \item the sequence was ABCDEFGHI
    \item the seed shape was 11100010
    \item then you would query into the index with ABCG
    \end{itemize}
  \item Originally presented in PatternHunter\cite{Ma.ea2002:PatternHunterfasterandmore}
  \item Why is this better than consecutive seeds?
  \end{itemize}
\end{frame}

\begin{frame}{Consecutive Seeds vs Spaced Seeds}
  % The table below samples positions 1, 2, 3 and 6 of each window
  % (e.g. ABCF from ABCDEF), i.e. seed shape 111001, and substitutes X for J.
  \begin{itemize}
  \item Target Sequence: ABCDEFGHIJK
  \item Sequenced Sequence: ABC\textcolor{red}{Z}EF\textcolor{red}{Y}HI\textcolor{red}{X}K
  \item Seed Shape: 111001 (4) and Consecutive: 1111 (4)
  \end{itemize}
  % ABCZEFGHIYK
  % ABCDEFGHIJK
  % 11000010---
  % -11000010--
  % --11000010-
  % ---11000010
  \begin{columns}
    \column{0.6\textwidth}
    \begin{block}{Pathological example}
      \begin{tabular}{c c c}
        Shift & Spaced & Consecutive \\
        0 & ABCF=ABCF & ABCD≠ABCZ \\
        1 & BCDG≠BCZY & BCDE≠BCZE \\
        2 & CDEH≠CZEH & CDEF≠CZEF \\
        3 & DEFI≠ZEFI & DEFG≠ZEFY \\
        4 & EFGJ≠EFYX & EFGH≠EFYH \\
        5 & FGHK≠FYHK & FGHI≠FYHI \\
        6 & & GHIJ≠YHIX \\
        7 & & HIJK≠HIXK \\
      \end{tabular}
    \end{block}
    \column{0.4\textwidth}
    \begin{itemize}
    \item Spaced seed matches once
    \item Consecutive seed never matches
    \item Consecutive seed does more comparisons and may match
      repeatedly
    \end{itemize}
  \end{columns}
\end{frame}

\begin{frame}{Optimal Spaced Seed}
  \begin{itemize}
  \item Fewest overlaps with shifted seed
  \item Longer seeds are better
  \item Equivalent weight
  \item Use dynamic programming to calculate optimal seed for given
    length
  \end{itemize}
  \begin{columns}
    \column{0.6\textwidth}
    % Each seed below has twelve 1-positions, the number shown in parentheses.
    \begin{block}{DIAMOND Seeds (Fast)}
      \begin{itemize}
      \item 111101011101111 (12)
      \item 111011001100101111 (12)
      \item 1111001001010001001111 (12)
      \item 111100101000010010010111 (12)
      \end{itemize}
    \end{block}
  \end{columns}
\end{frame}

\begin{frame}{Double Indexing}
  \begin{itemize}
  \item Blastx indexes the database
  \item Blastx runs the queries in input order
  \item DIAMOND indexes both the database and the queries
  \item DIAMOND runs queries in index order
  \item Why is this faster?
  \end{itemize}
\end{frame}

\begin{frame}{Double Indexing: Why it's faster}
  \begin{itemize}
  \item Cache architecture
    \begin{itemize}
    \item On CPU Cache -- L1,L2
    \item Shared CPU Cache L3
    \item Much faster than main memory
    \end{itemize}
  \item Each cache miss must hit main memory (must hit northbridge,
    which has significantly more latency than main cache, and takes
    hundreds of cycles)
  \item Dictionary Example: Is it faster to look up
    \begin{itemize}
    \item “apple”, “xylophone”, “appliance”, “xylem”
    \item or “apple”, “appliance”, “xylem”, “xylophone”?
    \end{itemize}
  \end{itemize}
\end{frame}

\section{Comparison}

\subsection{Speed}

\begin{frame}{Speed of DIAMOND}
  \begin{center}
    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_a.jpg}
  \end{center}
\end{frame}

\subsection{Accuracy}

\begin{frame}{Accuracy of DIAMOND: Any success}
  \begin{center}
    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_b.jpg}
  \end{center}
\end{frame}

\begin{frame}{Accuracy of DIAMOND: Matches blastx}
  \begin{center}
    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_c.jpg}
  \end{center}
\end{frame}

\section{Usage}

\begin{frame}{DIAMOND Usage}
  % The --db value in the query matches the file name given to makedb.
  \begin{itemize}
  \item Make the diamond database:

    \texttt{diamond makedb --in foo.fasta --db foo.dmnd;}
  \item Run the diamond query:

    \texttt{diamond blastx --db foo.dmnd --threads 24 --query bar.fasta --daa bar\_diamond.txt}
  \end{itemize}
\end{frame}

\subsection{Output}

\begin{frame}{DIAMOND Output}
  \begin{itemize}
  \item Standard BLASTx output
  \item Equivalent evalues and bit scores
  \item An example from \textit{Spalax} (the top two proteins are isoforms):
  \end{itemize}
  % \tiny so that the twelve-column table fits on the slide.
  \tiny
  \begin{tabular}{c c c c c c c c c c c c}
    query & match & \% ident & length & \# mm & gap & qst & qstp & sstart & sstop & evalue & score \\
    c18\_g1\_i1 & ...065786 & 94.5 & 361 & 20 & 0 & 2 & 1084 & 992 & 1352 & 5.7e-203 & 704.9 \\
    c18\_g1\_i1 & ...081540 & 94.5 & 361 & 20 & 0 & 2 & 1084 & 940 & 1300 & 5.7e-203 & 704.9 \\
    c18\_g1\_i1 & ...142322 & 48.8 & 361 & 178 & 3 & 5 & 1078 & 944 & 1300 & 5.9e-99 & 359.4 \\
    c18\_g1\_i1 & ...039711 & 48.8 & 361 & 178 & 3 & 5 & 1078 & 936 & 1292 & 5.9e-99 & 359.4 \\
    c18\_g1\_i1 & ...141518 & 43.0 & 230 & 124 & 3 & 5 & 685 & 936 & 1161 & 1.7e-50 & 198.4 \\
  \end{tabular}
\end{frame}

\section*{References}

\begin{frame}[plain]{References}
  \begin{center}
    \mbox{}\vspace{-\baselineskip}
    \printbibliography[heading=none]
  \end{center}
\end{frame}

\end{document}