From 64e613877692f5f83838ba3ab6ca4b9871ec42dc Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Sun, 18 Oct 2015 16:59:41 -0700 Subject: [PATCH] add lots of slides discussing diamond and how it works --- diamond_presentation_2015.Rnw | 258 ++++++++++++++++++++++++++++++---- 1 file changed, 230 insertions(+), 28 deletions(-) diff --git a/diamond_presentation_2015.Rnw b/diamond_presentation_2015.Rnw index d3fda96..6a90f32 100644 --- a/diamond_presentation_2015.Rnw +++ b/diamond_presentation_2015.Rnw @@ -28,41 +28,24 @@ % The textpos package is necessary to position textblocks at arbitary % places on the page. Use showboxes option to show outlines of textboxes. % \usepackage[absolute]{textpos} -\usepackage[absolute,showboxes]{textpos} +\usepackage[absolute,overlay]{textpos} +\usepackage{mathtools,cancel} + +\renewcommand{\CancelColor}{\color{red}} %change cancel color to red \usepackage{multirow} \usepackage{array} \mode{ \usetheme{CambridgeUS} + \usecolortheme{crane} % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html \definecolor{ilboldblue}{HTML}{002058} \definecolor{ilboldorange}{HTML}{E87722} \definecolor{ilblue}{HTML}{606EB2} \definecolor{ilorange}{HTML}{D45D00} - \setbeamercolor{alerted text}{fg=ilboldorange} - \setbeamercolor*{palette primary}{fg=ilblue,bg=ilorange} - \setbeamercolor*{palette secondary}{fg=ilblue!20!white,bg=ilorange} - \setbeamercolor*{palette tertiary}{bg=ilblue,fg=ilorange} - \setbeamercolor*{palette quaternary}{fg=ilblue,bg=ilorange} - - \setbeamercolor*{sidebar}{fg=ilorange,bg=ilboldblue} - - \setbeamercolor*{palette sidebar primary}{fg=ilblue!10!white,bg=ilorange} - \setbeamercolor*{palette sidebar secondary}{fg=ilorange} - \setbeamercolor*{palette sidebar tertiary}{fg=ilblue} - \setbeamercolor*{palette sidebar quaternary}{fg=ilorange} - - % \setbeamercolor*{titlelike}{parent=palette primary} - \setbeamercolor{titlelike}{parent=palette primary,fg=ilboldblue,bg=ilorange} - 
\setbeamercolor{frametitle}{fg=ilboldorange,bg=ilblue!80!white} - \setbeamercolor{frametitle right}{fg=ilboldblue,bg=ilorange} - - \setbeamercolor*{separation line}{} - \setbeamercolor*{fine separation line}{} - \setbeamercovered{transparent} \logo{\begin{tikzpicture}% Pale figure - {\node[opacity=0.8]{\IfFileExists{figures/igb_wordmark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/igb_wordmark}}{}% + {\node[opacity=0.6]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}% };}% \end{tikzpicture}} } @@ -93,27 +76,246 @@ library("xtable") \url{https://xkcd.com/1513/}}} -\frame[plain]{\titlepage} +\frame[plain]{\titlepage + \begin{center} + Code and slides are here: + + \qrcode[padding]{http://dla2.us/p/dmnd2015} + + \url{http://dla2.us/p/dmnd2015} + \end{center} + } \section{The Problem} +\begin{frame}{The Problem} + \begin{itemize} + \item mRNA sequences + \item No clear reference genome/database + \item How to annotate them? + \end{itemize} + \only<2>{ + \begin{textblock*}{64mm}(32mm,0.15\textheight) + \begin{exampleblock}{No Reference Genome} + \begin{center} + \includegraphics[height=0.3\textheight,keepaspectratio]{figures/spalax.jpg} + \end{center} + \begin{itemize} + \item Blind mole rat (\textit{Spalax}) + \item No reference genome + \item Close to mouse/rat, but not close enough to use + mouse/rat directly + \item Placenta sequences; how to annotate them? + \end{itemize} + \end{exampleblock} + \end{textblock*} + } + \only<3>{ + \begin{textblock*}{64mm}(32mm,0.25\textheight) + \begin{exampleblock}<3>{Environmental Samples} + \begin{itemize} + \item Permafrost + \item Many un-cultured, un-sequenced bacteria + \item How to annotate what genes they are expressing? 
+ \end{itemize} + \end{exampleblock} + \end{textblock*} + } +\end{frame} + +\begin{frame}{Basic Solution} + \begin{itemize} + \item Annotate mRNA against proteins in the nr or in a suitable + reference proteome + \item Six possible translations of nucleotide into amino acid + \item Take all possible sub-sequences + \item Hash into reference, extend match + \item Pick best match + \end{itemize} +\end{frame} \section{The Previous Contenders} +\begin{frame}{Previous Contenders} + \begin{itemize} + \item Blastx + \item Rapsearch + \item mBlast + \item many, many, more + \end{itemize} +\end{frame} + + +\section{Diamond Advances} +\begin{frame}{Diamond Methodology Advances} + \begin{itemize} + \item Seed and Extend + \item Reduced Alphabet + \item Spaced Seeds with Specific Seed Shape + \item Double Indexing + \end{itemize} +\end{frame} + +\begin{frame}{Seed and Extend} + \begin{itemize} + \item Calculate an index + \item Look up matching indices in the database + \item Local string alignment using Smith-Waterman + \item Looks like blast, right? + \end{itemize} +\end{frame} + +\begin{frame}{Reduced Alphabet} + + \begin{itemize} + \item \colorbox{red}{LVI} \colorbox{red!10!white}{M} \colorbox{yellow}{C} \colorbox{black!50!white}{G} \colorbox{red!50!blue}{\textcolor{white}{STA}} + \colorbox{green}{P} \colorbox{red!50!blue!50!white}{F} \colorbox{red!50!blue!20!white}{Y} \colorbox{black}{\textcolor{white}{W}} \colorbox{blue}{\textcolor{white}{KREDNQ}} +%- [KREDQN] [C] [G] [H] [ILV] [M] [F] [Y] [W] [P] [STA]. 
+ \item Smaller index sizes --- less memory usage + \item Greater sensitivity --- seed more likely to match + \item More likelihood of useless extensions --- only the seed matched + \end{itemize} + {\centering + \includegraphics[height=0.5\textheight,width=0.8\textwidth,keepaspectratio]{figures/reduced_alphabet.png} + + \cite{Murphy.ea2000:Simplifiedaminoacidalphabets} + } + +\end{frame} -\section{Diamond Methodology} -\begin{frame}{Diamond Methodology} +\begin{frame}{Spaced Seeds with Specific Seed Shape} \begin{itemize} - \item - \end{itemize} + \item Spaced seeds are longer seeds in which only a subset of the + positions are used + \item For example, if + \begin{itemize} + \item the sequence was ABCDEFGHI + \item the seed shape was + 11100010 + \item then you would query into the index with ABCG + \end{itemize} + \item Originally presented in PatternHunter\cite{Ma.ea2002:PatternHunterfasterandmore} + \item Why is this better than consecutive seeds? +\end{itemize} +\end{frame} + +\begin{frame}{Consecutive Seeds vs Spaced Seeds} + \begin{itemize} + \item Target Sequence: ABCDEFGHIJK + \item Sequenced Sequence: ABC\textcolor{red}{Z}EF\textcolor{red}{Y}HI\textcolor{red}{X}K + \item Seed Shape: 111001 (4) and Consecutive: 1111 (4) + \end{itemize} +% ABCZEFGHIYK +% ABCDEFGHIJK +% 11000010--- +% -11000010-- +% --11000010- +% ---11000010 + + \begin{columns} + \column{0.6\textwidth} +\begin{block}{Pathological example} + \begin{tabular}{c c c} + Shift & Spaced & Consecutive \\ + 0 & ABCF=ABCF & ABCD≠ABCZ \\ + 1 & BCDG≠BCZY & BCDE≠BCZE \\ + 2 & CDEH≠CZEH & CDEF≠CZEF \\ + 3 & DEFI≠ZEFI & DEFG≠ZEFY \\ + 4 & EFGJ≠EFYX & EFGH≠EFYH \\ + 5 & FGHK≠FYHK & FGHI≠FYHI \\ + 6 & & GHIJ≠YHIX \\ + 7 & & HIJK≠HIXK \\ + \end{tabular} +\end{block} + \column{0.4\textwidth} + \begin{itemize} + \item Spaced seed matches once + \item Consecutive seed never matches + \item Consecutive seed does more comparisons and may match + repeatedly + \end{itemize} +\end{columns} +\end{frame} + 
+\begin{frame}{Optimal Spaced Seed} + \begin{itemize} + \item Fewest overlaps with shifted seed + \item Longer seeds are better + \item Equivalent weight + \item Use dynamic programming to calculate optimal seed for given + length + \end{itemize} + \begin{columns} + \column{0.6\textwidth} + \begin{block}{DIAMOND Seeds (Fast)} + \begin{itemize} + \item 111101011101111 (12) + \item 111011001100101111 (12) + \item 1111001001010001001111 (12) + \item 111100101000010010010111 (12) + \end{itemize} + \end{block} +\end{columns} +\end{frame} + +\begin{frame}{Double Indexing} + \begin{itemize} + \item Blastx indexes the database + \item Blastx runs the queries in input order + \item DIAMOND indexes both the database and the queries + \item DIAMOND runs queries in index order + \item Why is this faster? + \end{itemize} +\end{frame} + +\begin{frame}{Double Indexing: Why it's faster} + \begin{itemize} + \item Cache architecture + \begin{itemize} + \item On CPU Cache -- L1,L2 + \item Shared CPU Cache L3 + \item Much faster than main memory + \end{itemize} + \item Each cache miss must hit main memory (must hit northbridge, + which has significantly more latency than main cache, and takes + hundreds of cycles) + \item Dictionary Example: Is it faster to look up + \begin{itemize} + \item “apple”, “xylophone”, “appliance”, “xylem” + \item or “apple”, “appliance”, “xylem”, “xylophone”? 
+ \end{itemize} + \end{itemize} +\end{frame} + +\section{Usage} + +\begin{frame}{DIAMOND Usage} + \begin{itemize} + \item Make the diamond database: + \texttt{diamond makedb --in foo.fasta --db foo.dmnd;} + \item Run the diamond query: + \texttt{diamond blastx --db foo.dmnd --threads 24 --query bar.fasta --daa bar\_diamond.txt} + \end{itemize} \end{frame} \section{Comparison} \subsection{Speed} +\begin{frame}{Speed of DIAMOND} + \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_a.jpg} +\end{frame} + \subsection{Accuracy} +\begin{frame}{Accuracy of DIAMOND: Any success} + \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_b.jpg} +\end{frame} + +\begin{frame}{Accuracy of DIAMOND: Matches blastx} + \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_c.jpg} +\end{frame} + \section*{References} -- 2.39.2