]> git.donarmstrong.com Git - diamond_presentation.git/commitdiff
add lots of slides discussing diamond and how it works
authorDon Armstrong <don@donarmstrong.com>
Sun, 18 Oct 2015 23:59:41 +0000 (16:59 -0700)
committerDon Armstrong <don@donarmstrong.com>
Sun, 18 Oct 2015 23:59:41 +0000 (16:59 -0700)
diamond_presentation_2015.Rnw

index d3fda9616138212e7cf015184419f2607b7c7205..6a90f3232aab9ac1f1b3ad9978bdad2a35e50359 100644 (file)
 % The textpos package is necessary to position textblocks at arbitrary 
 % places on the page.  Use showboxes option to show outlines of textboxes.
 % \usepackage[absolute]{textpos}
-\usepackage[absolute,showboxes]{textpos}
+\usepackage[absolute,overlay]{textpos}
+\usepackage{mathtools,cancel}
+
+\renewcommand{\CancelColor}{\color{red}} %change cancel color to red
 
 \usepackage{multirow}
 \usepackage{array}
 
 \mode<presentation>{ 
   \usetheme{CambridgeUS}
+  \usecolortheme{crane}
   % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html
   \definecolor{ilboldblue}{HTML}{002058}
   \definecolor{ilboldorange}{HTML}{E87722}
   \definecolor{ilblue}{HTML}{606EB2}
   \definecolor{ilorange}{HTML}{D45D00}
-  \setbeamercolor{alerted text}{fg=ilboldorange}
-  \setbeamercolor*{palette primary}{fg=ilblue,bg=ilorange}
-  \setbeamercolor*{palette secondary}{fg=ilblue!20!white,bg=ilorange}
-  \setbeamercolor*{palette tertiary}{bg=ilblue,fg=ilorange}
-  \setbeamercolor*{palette quaternary}{fg=ilblue,bg=ilorange}
-  
-  \setbeamercolor*{sidebar}{fg=ilorange,bg=ilboldblue}
-  
-  \setbeamercolor*{palette sidebar primary}{fg=ilblue!10!white,bg=ilorange}
-  \setbeamercolor*{palette sidebar secondary}{fg=ilorange}
-  \setbeamercolor*{palette sidebar tertiary}{fg=ilblue}
-  \setbeamercolor*{palette sidebar quaternary}{fg=ilorange}
-  
-  % \setbeamercolor*{titlelike}{parent=palette primary}
-  \setbeamercolor{titlelike}{parent=palette primary,fg=ilboldblue,bg=ilorange}
-  \setbeamercolor{frametitle}{fg=ilboldorange,bg=ilblue!80!white}
-  \setbeamercolor{frametitle right}{fg=ilboldblue,bg=ilorange}
-  
-  \setbeamercolor*{separation line}{}
-  \setbeamercolor*{fine separation line}{}
-  \setbeamercovered{transparent}  
   \logo{\begin{tikzpicture}% Pale figure
-      {\node[opacity=0.8]{\IfFileExists{figures/igb_wordmark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/igb_wordmark}}{}%
+      {\node[opacity=0.6]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}%
         };}%
     \end{tikzpicture}}
 }
@@ -93,27 +76,246 @@ library("xtable")
 
     \url{https://xkcd.com/1513/}}}
 
-\frame[plain]{\titlepage}
+\frame[plain]{\titlepage
+  \begin{center}
+    Code and slides are here: 
+    
+    \qrcode[padding]{http://dla2.us/p/dmnd2015}
+    
+    \url{http://dla2.us/p/dmnd2015}
+   \end{center}
+ }
 
 
 
 \section{The Problem}
+\begin{frame}{The Problem}
+  \begin{itemize}
+  \item mRNA sequences
+  \item No clear reference genome/database
+  \item How to annotate them?
+  \end{itemize}
+  \only<2>{
+    \begin{textblock*}{64mm}(32mm,0.15\textheight)
+      \begin{exampleblock}{No Reference Genome}
+        \begin{center}
+          \includegraphics[height=0.3\textheight,keepaspectratio]{figures/spalax.jpg}
+        \end{center}
+        \begin{itemize}
+        \item Blind mole rat (\textit{Spalax})
+        \item No reference genome
+        \item Close to mouse/rat, but not close enough to use
+          mouse/rat directly
+        \item Placenta sequences; how to annotate them?
+        \end{itemize}
+      \end{exampleblock}
+    \end{textblock*}
+  }
+    \only<3>{
+    \begin{textblock*}{64mm}(32mm,0.25\textheight)
+      \begin{exampleblock}<3>{Environmental Samples}
+        \begin{itemize}
+        \item Permafrost
+        \item Many un-cultured, un-sequenced bacteria
+        \item How to annotate what genes they are expressing?
+        \end{itemize}
+      \end{exampleblock}
+    \end{textblock*}
+    }
+\end{frame}
+
+\begin{frame}{Basic Solution}
+  \begin{itemize}
+  \item Annotate mRNA against proteins in the nr or in a suitable
+    reference proteome
+  \item Six possible translations of nucleotide into amino acid
+  \item Take all possible sub-sequences
+  \item Hash into reference, extend match
+  \item Pick best match
+  \end{itemize}
+\end{frame}
 
 \section{The Previous Contenders}
+\begin{frame}{Previous Contenders}
+  \begin{itemize}
+  \item Blastx
+  \item Rapsearch
+  \item mBlast
+  \item many, many more
+  \end{itemize}
+\end{frame}
+
+
+\section{Diamond Advances}
+\begin{frame}{Diamond Methodology Advances}
+  \begin{itemize}
+  \item Seed and Extend
+  \item Reduced Alphabet
+  \item Spaced Seeds with Specific Seed Shape
+  \item Double Indexing
+  \end{itemize}  
+\end{frame}
+
+\begin{frame}{Seed and Extend}
+  \begin{itemize}
+  \item Calculate an index
+  \item Look up matching indices in the database
+  \item Local string alignment using Smith-Waterman
+  \item Looks like blast, right?
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Reduced Alphabet}
+
+  \begin{itemize}
+  \item \colorbox{red}{LVI} \colorbox{red!10!white}{M} \colorbox{yellow}{C} \colorbox{black!50!white}{G} \colorbox{red!50!blue}{\textcolor{white}{STA}} 
+    \colorbox{green}{P} \colorbox{red!50!blue!50!white}{F} \colorbox{red!50!blue!20!white}{Y} \colorbox{black}{\textcolor{white}{W}} \colorbox{blue}{\textcolor{white}{KREDNQ}}
+%-    [KREDQN] [C] [G] [H] [ILV] [M] [F] [Y] [W] [P] [STA].
+  \item Smaller index sizes --- less memory usage
+  \item Greater sensitivity --- seed more likely to match
+  \item More likelihood of useless extensions --- only the seed matched
+  \end{itemize}
+  {\centering
+    \includegraphics[height=0.5\textheight,width=0.8\textwidth,keepaspectratio]{figures/reduced_alphabet.png}
+    
+    \cite{Murphy.ea2000:Simplifiedaminoacidalphabets}
+  }
+  
+\end{frame}
 
-\section{Diamond Methodology}
-\begin{frame}{Diamond Methodology}
+\begin{frame}{Spaced Seeds with Specific Seed Shape}
   \begin{itemize}
-  \item 
-  \end{itemize}    
+  \item Spaced seeds are longer seeds in which only a subset of the
+    positions are used
+  \item For example, if
+    \begin{itemize}
+    \item the sequence was ABCDEFGHI
+    \item the seed shape was
+      11100010
+    \item then you would query into the index with ABCG
+    \end{itemize}
+  \item Originally presented in PatternHunter\cite{Ma.ea2002:PatternHunterfasterandmore}
+  \item Why is this better than consecutive seeds?
+\end{itemize}
+\end{frame}
+
+\begin{frame}{Consecutive Seeds vs Spaced Seeds}
+  \begin{itemize}
+  \item Target Sequence: ABCDEFGHIJK
+  \item Sequenced Sequence: ABC\textcolor{red}{Z}EF\textcolor{red}{Y}HI\textcolor{red}{W}K
+  \item Seed Shape: 111001 (4) and Consecutive: 1111 (4)
+  \end{itemize}
+% ABCZEFGHIYK
+% ABCDEFGHIJK
+% 11000010---
+% -11000010--
+% --11000010-
+% ---11000010
+
+  \begin{columns}
+    \column{0.6\textwidth}
+\begin{block}{Pathological example}
+  \begin{tabular}{c c c}
+    Shift & Spaced & Consecutive \\
+    0     & ABCF=ABCF & ABCD≠ABCZ \\
+    1     & BCDG≠BCZY & BCDE≠BCZE \\
+    2     & CDEH≠CZEH & CDEF≠CZEF \\
+    3     & DEFI≠ZEFI & DEFG≠ZEFY \\
+    4     & EFGJ≠EFYW & EFGH≠EFYH \\
+    5     & FGHK≠FYHK & FGHI≠FYHI \\
+    6     &           & GHIJ≠YHIW \\
+    7     &           & HIJK≠HIWK \\
+  \end{tabular}
+\end{block}
+  \column{0.4\textwidth}
+  \begin{itemize}
+  \item Spaced seed matches once
+  \item Consecutive seed never matches
+  \item Consecutive seed does more comparisons and may match
+    repeatedly
+  \end{itemize}
+\end{columns}
+\end{frame}
+
+\begin{frame}{Optimal Spaced Seed}
+  \begin{itemize}
+  \item Fewest overlaps with shifted seed
+  \item Longer seeds are better
+  \item Equivalent weight
+  \item Use dynamic programming to calculate optimal seed for given
+    length
+  \end{itemize}
+  \begin{columns}
+    \column{0.6\textwidth}
+  \begin{block}{DIAMOND Seeds (Fast)}
+    \begin{itemize}
+    \item 111101011101111 (12)
+    \item 111011001100101111 (12)
+    \item 1111001001010001001111 (12)
+    \item 111100101000010010010111 (12)
+    \end{itemize}
+  \end{block}
+\end{columns}
+\end{frame}
+
+\begin{frame}{Double Indexing}
+  \begin{itemize}
+  \item Blastx indexes the database
+  \item Blastx runs the queries in input order
+  \item DIAMOND indexes both the database and the queries
+  \item DIAMOND runs queries in index order
+  \item Why is this faster?
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Double Indexing: Why it's faster}
+  \begin{itemize}
+  \item Cache architecture
+    \begin{itemize}
+    \item On CPU Cache -- L1,L2
+    \item Shared CPU Cache L3
+    \item Much faster than main memory
+    \end{itemize}
+  \item Each cache miss must go to main memory (through the
+    northbridge), which has significantly more latency than the CPU
+    cache and takes hundreds of cycles
+  \item Dictionary Example: Is it faster to look up 
+    \begin{itemize}
+    \item “apple”, “xylophone”, “appliance”, “xylem”
+    \item or “apple”, “appliance”, “xylem”, “xylophone”?
+    \end{itemize}
+  \end{itemize}
+\end{frame}
+
+\section{Usage}
+
+\begin{frame}{DIAMOND Usage}
+  \begin{itemize}
+  \item Make the diamond database: 
+    \texttt{diamond makedb --in foo.fasta --db foo.dmnd;}
+  \item Run the diamond query: 
+    \texttt{diamond blastx --db foo.dmnd --threads 24 --query bar.fasta --daa bar\_diamond.txt}
+  \end{itemize}
 \end{frame}
 
 \section{Comparison}
 
 \subsection{Speed}
 
+\begin{frame}{Speed of DIAMOND}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_a.jpg}
+\end{frame}
+
 \subsection{Accuracy}
 
+\begin{frame}{Accuracy of DIAMOND: Any success}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_b.jpg}
+\end{frame}
+
+\begin{frame}{Accuracy of DIAMOND: Matches blastx}
+  \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_c.jpg}
+\end{frame}
+
 
 \section*{References}