move usage around

[diamond_presentation.git] / diamond_presentation_2015.Rnw
diff --git a/diamond_presentation_2015.Rnw b/diamond_presentation_2015.Rnw

index 2ed4cf2d707eeec79f4f755aec091b6a37eda775..d6d307e1c4e780b66e6ebce3737dd8691b750bf8 100644 (file)
--- a/diamond_presentation_2015.Rnw
+++ b/diamond_presentation_2015.Rnw
@@ -28,41 +28,24 @@
  % The textpos package is necessary to position textblocks at arbitrary 
  % places on the page.  Use showboxes option to show outlines of textboxes.
  % \usepackage[absolute]{textpos}
  % The textpos package is necessary to position textblocks at arbitrary 
  % places on the page.  Use showboxes option to show outlines of textboxes.
  % \usepackage[absolute]{textpos}
-\usepackage[absolute,showboxes]{textpos}
+\usepackage[absolute,overlay]{textpos}
+\usepackage{mathtools,cancel}
+
+\renewcommand{\CancelColor}{\color{red}} %change cancel color to red
  
  \usepackage{multirow}
  \usepackage{array}
  
  \mode<presentation>{ 
    \usetheme{CambridgeUS}
  
  \usepackage{multirow}
  \usepackage{array}
  
  \mode<presentation>{ 
    \usetheme{CambridgeUS}
+  \usecolortheme{crane}
    % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html
    \definecolor{ilboldblue}{HTML}{002058}
    \definecolor{ilboldorange}{HTML}{E87722}
    \definecolor{ilblue}{HTML}{606EB2}
    \definecolor{ilorange}{HTML}{D45D00}
    % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html
    \definecolor{ilboldblue}{HTML}{002058}
    \definecolor{ilboldorange}{HTML}{E87722}
    \definecolor{ilblue}{HTML}{606EB2}
    \definecolor{ilorange}{HTML}{D45D00}
-  \setbeamercolor{alerted text}{fg=ilboldorange}
-  \setbeamercolor*{palette primary}{fg=ilblue,bg=ilorange}
-  \setbeamercolor*{palette secondary}{fg=ilblue!20!white,bg=ilorange}
-  \setbeamercolor*{palette tertiary}{bg=ilblue,fg=ilorange}
-  \setbeamercolor*{palette quaternary}{fg=ilblue,bg=ilorange}
-  
-  \setbeamercolor*{sidebar}{fg=ilorange,bg=ilboldblue}
-  
-  \setbeamercolor*{palette sidebar primary}{fg=ilblue!10!white,bg=ilorange}
-  \setbeamercolor*{palette sidebar secondary}{fg=ilorange}
-  \setbeamercolor*{palette sidebar tertiary}{fg=ilblue}
-  \setbeamercolor*{palette sidebar quaternary}{fg=ilorange}
-  
-  % \setbeamercolor*{titlelike}{parent=palette primary}
-  \setbeamercolor{titlelike}{parent=palette primary,fg=ilboldblue,bg=ilorange}
-  \setbeamercolor{frametitle}{fg=ilboldorange,bg=ilblue!80!white}
-  \setbeamercolor{frametitle right}{fg=ilboldblue,bg=ilorange}
-  
-  \setbeamercolor*{separation line}{}
-  \setbeamercolor*{fine separation line}{}
-  \setbeamercovered{transparent}  
    \logo{\begin{tikzpicture}% Pale figure
    \logo{\begin{tikzpicture}% Pale figure
-      {\node[opacity=0.8]{\IfFileExists{figures/igb_wordmark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/igb_wordmark}}{}%
+      {\node[opacity=0.6]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}%
          };}%
      \end{tikzpicture}}
  }
          };}%
      \end{tikzpicture}}
  }
@@ -93,22 +76,273 @@ library("xtable")
  
      \url{https://xkcd.com/1513/}}}
  
  
      \url{https://xkcd.com/1513/}}}
  
-\frame[plain]{\titlepage}
+\frame[plain]{\titlepage
+  \begin{center}
+    Code and slides are here: 
+    
+    \qrcode[padding]{http://dla2.us/p/dmnd2015}
+    
+    \url{http://dla2.us/p/dmnd2015}
+   \end{center}
+ }
  
  
  
  \section{The Problem}
  
  
  
  \section{The Problem}
+\begin{frame}{The Problem}
+  \begin{itemize}
+  \item mRNA sequences
+  \item No clear reference genome/database
+  \item How to annotate them?
+  \end{itemize}
+  \only<2>{
+    \begin{textblock*}{64mm}(32mm,0.15\textheight)
+      \begin{exampleblock}{No Reference Genome}
+        \begin{center}
+          \includegraphics[height=0.3\textheight,keepaspectratio]{figures/spalax.jpg}
+        \end{center}
+        \begin{itemize}
+        \item Blind mole rat (\textit{Spalax})
+        \item No reference genome
+        \item Close to mouse/rat, but not close enough to use
+          mouse/rat directly
+        \item Placenta sequences; how to annotate them?
+        \end{itemize}
+      \end{exampleblock}
+    \end{textblock*}
+  }
+    \only<3>{
+    \begin{textblock*}{64mm}(32mm,0.25\textheight)
+      \begin{exampleblock}<3>{Environmental Samples}
+        \begin{itemize}
+        \item Permafrost
+        \item Many un-cultured, un-sequenced bacteria
+        \item How to annotate what genes they are expressing?
+        \end{itemize}
+      \end{exampleblock}
+    \end{textblock*}
+    }
+\end{frame}
+
+\begin{frame}{Basic Solution}
+  \begin{itemize}
+  \item Annotate mRNA against proteins in the nr or in a suitable
+    reference proteome
+  \item Six possible translations of nucleotide into amino acid
+  \item Take all possible sub-sequences
+  \item Hash into reference, extend match
+  \item Pick best match
+  \end{itemize}
+\end{frame}
  
  \section{The Previous Contenders}
  
  \section{The Previous Contenders}
+\begin{frame}{Previous Contenders}
+  \begin{itemize}
+  \item Blastx
+  \item Rapsearch
+  \item mBlast
+  \item many, many more
+  \end{itemize}
+\end{frame}
+
+
+\section{Diamond Advances}
+\begin{frame}{Diamond Methodology Advances}
+  \begin{itemize}
+  \item Seed and Extend
+  \item Reduced Alphabet
+  \item Spaced Seeds with Specific Seed Shape
+  \item Double Indexing
+  \end{itemize}  
+\end{frame}
+
+\begin{frame}{Seed and Extend}
+  \begin{itemize}
+  \item Calculate an index
+  \item Look up matching indices in the database
+  \item Local string alignment using Smith-Waterman
+  \item Looks like blast, right?
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Reduced Alphabet}
+
+  \begin{itemize}
+  \item \colorbox{red}{LVI} \colorbox{red!10!white}{M} \colorbox{yellow}{C} \colorbox{black!50!white}{G} \colorbox{red!50!blue}{\textcolor{white}{STA}} 
+    \colorbox{green}{P} \colorbox{red!50!blue!50!white}{F} \colorbox{red!50!blue!20!white}{Y} \colorbox{black}{\textcolor{white}{W}} \colorbox{blue}{\textcolor{white}{KREDNQ}}
+%-    [KREDQN] [C] [G] [H] [ILV] [M] [F] [Y] [W] [P] [STA].
+  \item Smaller index sizes --- less memory usage
+  \item Greater sensitivity --- seed more likely to match
+  \item More likelihood of useless extensions --- only the seed matched
+  \end{itemize}
+  {\centering
+    \includegraphics[height=0.5\textheight,width=0.8\textwidth,keepaspectratio]{figures/reduced_alphabet.png}
+    
+    \cite{Murphy.ea2000:Simplifiedaminoacidalphabets}\par
+  }
+  
+\end{frame}
+
+\begin{frame}{Spaced Seeds with Specific Seed Shape}
+  \begin{itemize}
+  \item Spaced seeds are longer seeds in which only a subset of the
+    positions are used
+  \item For example, if
+    \begin{itemize}
+    \item the sequence was ABCDEFGHI
+    \item the seed shape was
+      11100010
+    \item then you would query into the index with ABCG
+    \end{itemize}
+  \item Originally presented in PatternHunter~\cite{Ma.ea2002:PatternHunterfasterandmore}
+  \item Why is this better than consecutive seeds?
+\end{itemize}
+\end{frame}
+
+\begin{frame}{Consecutive Seeds vs Spaced Seeds}
+  \begin{itemize}
+  \item Target Sequence: ABCDEFGHIJK
+  \item Sequenced Sequence: ABC\textcolor{red}{Z}EF\textcolor{red}{Y}HI\textcolor{red}{W}K
+  \item Seed Shape: 111001 (4) and Consecutive: 1111 (4)
+  \end{itemize}
+% ABCZEFGHIYK
+% ABCDEFGHIJK
+% 11000010---
+% -11000010--
+% --11000010-
+% ---11000010
+
+  \begin{columns}
+    \column{0.6\textwidth}
+\begin{block}{Pathological example}
+  \begin{tabular}{c c c}
+    Shift & Spaced & Consecutive \\
+    0     & ABCF=ABCF & ABCD≠ABCZ \\
+    1     & BCDG≠BCZY & BCDE≠BCZE \\
+    2     & CDEH≠CZEH & CDEF≠CZEF \\
+    3     & DEFI≠ZEFI & DEFG≠ZEFY \\
+    4     & EFGJ≠EFYW & EFGH≠EFYH \\
+    5     & FGHK≠FYHK & FGHI≠FYHI \\
+    6     &           & GHIJ≠YHIW \\
+    7     &           & HIJK≠HIWK \\
+  \end{tabular}
+\end{block}
+  \column{0.4\textwidth}
+  \begin{itemize}
+  \item Spaced seed matches once
+  \item Consecutive seed never matches
+  \item Consecutive seed does more comparisons and may match
+    repeatedly
+  \end{itemize}
+\end{columns}
+\end{frame}
  
  
-\section{Diamond Methodology}
+\begin{frame}{Optimal Spaced Seed}
+  \begin{itemize}
+  \item Fewest overlaps with shifted seed
+  \item Longer seeds are better
+  \item Equivalent weight
+  \item Use dynamic programming to calculate optimal seed for given
+    length
+  \end{itemize}
+  \begin{columns}
+    \column{0.6\textwidth}
+  \begin{block}{DIAMOND Seeds (Fast)}
+    \begin{itemize}
+    \item 111101011101111 (12)
+    \item 111011001100101111 (12)
+    \item 1111001001010001001111 (12)
+    \item 111100101000010010010111 (12)
+    \end{itemize}
+  \end{block}
+\end{columns}
+\end{frame}
+
+\begin{frame}{Double Indexing}
+  \begin{itemize}
+  \item Blastx indexes the database
+  \item Blastx runs the queries in input order
+  \item DIAMOND indexes both the database and the queries
+  \item DIAMOND runs queries in index order
+  \item Why is this faster?
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Double Indexing: Why it's faster}
+  \begin{itemize}
+  \item Cache architecture
+    \begin{itemize}
+    \item On-CPU cache --- L1, L2
+    \item Shared CPU cache --- L3
+    \item Much faster than main memory
+    \end{itemize}
+  \item Each cache miss must go to main memory (through the
+    northbridge), which has significantly more latency than the
+    cache and takes hundreds of cycles
+  \item Dictionary Example: Is it faster to look up 
+    \begin{itemize}
+    \item “apple”, “xylophone”, “appliance”, “xylem”
+    \item or “apple”, “appliance”, “xylem”, “xylophone”?
+    \end{itemize}
+  \end{itemize}
+\end{frame}
  
  \section{Comparison}
  
  \subsection{Speed}
  
  
  \section{Comparison}
  
  \subsection{Speed}
  
+\begin{frame}{Speed of DIAMOND}
+  \begin{center}
+    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_a.jpg}
+  \end{center}
+\end{frame}
+
  \subsection{Accuracy}
  
  \subsection{Accuracy}
  
+\begin{frame}{Accuracy of DIAMOND: Any success}
+  \begin{center}
+    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_b.jpg}
+  \end{center}
+\end{frame}
+
+\begin{frame}{Accuracy of DIAMOND: Matches blastx}
+  \begin{center}
+    \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{diamond_paper_figures/diamond_F1_c.jpg}
+  \end{center}
+\end{frame}
+
+\section{Usage}
+
+\begin{frame}{DIAMOND Usage}
+  \begin{itemize}
+  \item Make the diamond database: 
+    \texttt{diamond makedb --in foo.fasta --db foo.dmnd;}
+  \item Run the diamond query: 
+    \texttt{diamond blastx --db foo.dmnd --threads 24 --query bar.fasta --daa bar\_diamond.txt}
+  \end{itemize}
+\end{frame}
+
+
+\subsection{Output}
+\begin{frame}{DIAMOND Output}
+  \begin{itemize}
+  \item Standard BLASTx output
+  \item Equivalent evalues and bit scores
+  \item An example from \textit{Spalax} (the top two proteins are
+    isoforms):
+  \end{itemize}
+  \tiny
+  \begin{tabular}{c c c c c c c c c c c c}
+    query & match & \% ident & length & \# mm & gap & qst & qstp & sstart & sstop & evalue & score \\
+c18\_g1\_i1 & ...065786 & 94.5 & 361 & 20 & 0 & 2 & 1084 & 992 & 1352 & 5.7e-203 & 704.9 \\
+c18\_g1\_i1 & ...081540 & 94.5 & 361 & 20 & 0 & 2 & 1084 & 940 & 1300 & 5.7e-203 & 704.9 \\
+c18\_g1\_i1 & ...142322 & 48.8 & 361 & 178 & 3 & 5 & 1078 & 944 & 1300 & 5.9e-99 & 359.4 \\
+c18\_g1\_i1 & ...039711 & 48.8 & 361 & 178 & 3 & 5 & 1078 & 936 & 1292 & 5.9e-99 & 359.4 \\
+c18\_g1\_i1 & ...141518 & 43.0 & 230 & 124 & 3 & 5 & 685 & 936 & 1161 & 1.7e-50 & 198.4 \\
+\end{tabular}
+\end{frame}
+
+
  
  \section*{References}
  
  
  \section*{References}