From: Don Armstrong Date: Wed, 19 Oct 2016 20:08:32 +0000 (-0700) Subject: fix simons typo X-Git-Tag: hpcbio_presentation_oct20~3 X-Git-Url: https://git.donarmstrong.com/?p=presentations%2Fgenome_diversity_oct_2016.git;a=commitdiff_plain;h=58f0c01047b0ccf9ae37695e2c78dc10c8b76506 fix simons typo --- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3dc4cbf --- /dev/null +++ b/.gitignore @@ -0,0 +1,26 @@ +auto +figure +*.aux +*.bbl +*.bcf +*.blg +*.fdb_latexmk +*.fls +*.log +*.out +simons_genome_diversity_oct_2016.pdf +simons_genome_diversity_oct_2016.tex +*.run.xml +*.rip +cache +*.lof +*.lot +*.toc +*.fff +*.ttt +references.bib +*.nav +*.snm +_minted* +*.pyg +*.vrb diff --git a/Makefile b/Makefile index a655bfc..b4f3840 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ #!/usr/bin/make -f -all: simmons_genome_diversity_oct_2016.pdf +all: simons_genome_diversity_oct_2016.pdf R ?= R @@ -15,8 +15,8 @@ R ?= R %.tex: %.Rnw $(R) --encoding=utf-8 -e "library('knitr'); knit('$<')" -simmons_genome_diversity_oct_2016.pdf: \ -simmons_genome_diversity_oct_2016.tex genome_diversity_paper figures +simons_genome_diversity_oct_2016.pdf: \ +simons_genome_diversity_oct_2016.tex genome_diversity_paper figures %.pdf: %.tex $(wildcard *.bib) $(wildcard *.tex) latexmk -f -pdf -pdflatex='xelatex -shell-escape -8bit -interaction=nonstopmode %O %S' -bibtex -use-make $< diff --git a/simmons_genome_diversity_oct_2016.Rnw b/simmons_genome_diversity_oct_2016.Rnw deleted file mode 100644 index da88aec..0000000 --- a/simmons_genome_diversity_oct_2016.Rnw +++ /dev/null @@ -1,277 +0,0 @@ -\documentclass[ignorenonframetext]{beamer} -\usepackage{fontspec} -\setmainfont{FreeSerif} -\setsansfont{FreeSans} -\setmonofont{FreeMono} -\usepackage{url} -\usepackage{fancyhdr} -\usepackage{graphicx} -\usepackage[bf]{caption} -\usepackage{rotating} -\usepackage{wrapfig} -\usepackage{fancybox} -\usepackage{booktabs} -% \usepackage{multirow} -\usepackage{acronym} -\usepackage{qrcode} 
-\usepackage[backend=biber,natbib=true,hyperref=true,style=nature]{biblatex} -\addbibresource{references.bib} -% \usepackage[nomargin,inline,draft]{fixme} -% \newcommand{\DLA}[1]{\textcolor{red}{\fxnote{DLA: #1}}} -% \usepackage[hyperfigures,bookmarks,colorlinks,citecolor=black,filecolor=black,linkcolor=black,urlcolor=black]{hyperref} -\usepackage{texshade} -\usepackage{tikz} -\usepackage{nameref} -\usepackage{zref-xr,zref-user} -\renewcommand*{\bibfont}{\tiny} - -% The textpos package is necessary to position textblocks at arbitary -% places on the page. Use showboxes option to show outlines of textboxes. -% \usepackage[absolute]{textpos} -\usepackage[absolute,overlay]{textpos} -\usepackage{mathtools,cancel} - -\renewcommand{\CancelColor}{\color{red}} %change cancel color to red - -\usepackage{multirow} -\usepackage{array} - -\usepackage{minted} -\usepackage{tcolorbox} -\usepackage{etoolbox} -\BeforeBeginEnvironment{minted}{\begin{tcolorbox}}% -\AfterEndEnvironment{minted}{\end{tcolorbox}}% - -\mode{ - \usetheme{CambridgeUS} - \usecolortheme{crane} - % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html - \definecolor{ilboldblue}{HTML}{002058} - \definecolor{ilboldorange}{HTML}{E87722} - \definecolor{ilblue}{HTML}{606EB2} - \definecolor{ilorange}{HTML}{D45D00} - \logo{\begin{tikzpicture}% Pale figure - {\node[opacity=0.1]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}% - };}% - \end{tikzpicture}} -} - -% remove navigation symbols -\setbeamertemplate{navigation symbols}{} - -\title[Ancestry]{Simmons Genome Diversity} -\author[Don Armstrong]{Don L. 
Armstrong} -\institute[IGB]{Institute for Genomic Biology, Computing Genomes - for Reproductive Health, University of Illinois, Urbana-Champaign} - -\begin{document} - -<>= -opts_chunk$set(dev="cairo_pdf",out.width="\\textwidth",out.height="0.8\\textheight",out.extra="keepaspectratio") -#opts_chunk$set(cache=TRUE, autodep=TRUE) -options(device = function(file, width = 8, height = 7, ...) { - cairo_pdf(tempfile(), width = width, height = height, ...) -}) -options(digits=2) -library("data.table") -library("ggplot2") -library("reshape2") -library("grid") -library("xtable") - -@ - -\IfFileExists{figures/relevant_xkcd.png}{\frame[plain]{\centering \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{figures/relevant_xkcd.png} - - \url{https://xkcd.com/1706/}}} - -\frame[plain]{\titlepage - \begin{center} - Code and slides are here: - - \qrcode[padding]{http://dla2.us/p/genomdiv2016} - - \url{http://dla2.us/p/genomdiv2016} - \end{center} -} - - -\frame[plain]{ - \begin{center} - \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/title.png} - \end{center} -} - -\begin{frame}{Sampling} - \begin{itemize} - \item 142 populations from Africa, America, Oceania, South Asia, - East Asia, and West Eurasia (mostly indigenous) - \item 300 samples sequenced at 34-83 fold coverage by Illumina - \item Aligned using BWA-MEM - \item Genotyped using special version of GATK and Fermikit - \item Data available in EBI ($n=279$, PRJEB9586) and dbGAP ($n=21$, ?) - \end{itemize} -\end{frame} - - -\begin{frame}[fragile]{Alignment Pipeline} - \begin{itemize} - \item Aligned to the “decoy” version of the human reference - (hs37d5); supposedly improves alignment in misassembled regions or - regions with CNVs? 
- \item PCR-free data, though they marked optical duplicates marked - using samblaster - \end{itemize} -\begin{minted}{bash} -./htscmd bamshuf -Oun128 in.bam tmp-pre \ -| ./htscmd bam2fq -as aln-se.fq.gz - \ -| ./trimadap \ -| ./bwa mem -pt8 hs37d5.fa - \ -| ./samblaster \ -| samtools view -uS - \ -| samtools sort -@4 -m512M - out-pre -\end{minted} -\end{frame} - -\begin{frame}[fragile]{Genotyping} - \begin{itemize} - \item Reference-bias; novel variants, GATK assumes reference is more - likely which may not be the case. Use prior of - $(0.4995,0.001,0.4995)$ instead of default - $(0.9985,0.001,0.0005)$. - \begin{itemize} - \item Unclear what the effect of this change is on the calling - \item Maybe worth thinking about? - \end{itemize} - \item Also used Fermikit; apparently has comparable call rates to - GATK and platypus - \end{itemize} -\begin{minted}{bash} - java -Xmx2g -jar GenomeAnalysisTK.jar \ - -T UnifiedGenotyper -I srt.aln.bam \ - -L CHR_ID -R hs37d5.fa -dcov 600 -glm SNP \ - -out_mode EMIT_ALL_SITES -stand_call_conf 5.0 \ - -stand_emit_conf 5.0 -inputPrior 0.0010 \ - -inputPrior 0.4995 -D dbsnp_138.b37.vcf \ - -o CHR_ID.vcf -A GCContent -A BaseCounts -\end{minted} -\end{frame} - -\begin{frame}{Fermikit vs Platypus vs GATK} - \begin{columns} - \column{0.5\textwidth} - \includegraphics[width=\textwidth,height=0.7\textheight,keepaspectratio]{genome_diversity_paper/supplemental-014.png} - \column{0.5\textwidth} - FermiKit and Platypus call 3.17M more sites than GATK, but unclear - whether those are real sites or not; they go into this in much more - detail than I've digested yet. 
-\end{columns} -\end{frame} - -\begin{frame}{Relatedness of Populations} - \begin{columns} - \column{0.7\textwidth} - \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig1a_1.png} - \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig1a_2.png} - \column{0.3\textwidth} - \begin{itemize} - \item Neighbor joining tree based on pairwise divergence per nucleotide - \item Deepest splits are in African populations - \end{itemize} - \end{columns} -\end{frame} - -\begin{frame}{PCA and Relatedness} - \begin{center} - \includegraphics[width=\textwidth,height=0.7\textheight,keepaspectratio]{genome_diversity_paper/fig_ed4.png} - \end{center} - \begin{itemize} - \item Greatest variation seen in the African populations (orange) - \item Other populations are much more similar to eachother in - general - \item Hapmap likely under-measured variation in Africa - \end{itemize} -\end{frame} - -\begin{frame} - \begin{center} - \includegraphics[width=\textwidth,height=0.6\textheight,keepaspectratio]{genome_diversity_paper/fig1b.png} - \end{center} - \begin{itemize} - \item Pygmy populations have lower X heterozygosity than other African populations - \item Seen even after removing the third of X which is subject to selection - \item Suggests that it's driven by demographic history, and the - reduced diversity is due to male-driven admixture (also in non-Africans) - \end{itemize} -\end{frame} - -\begin{frame}{Neanderthal Ancestry} - \begin{center} - \includegraphics[width=\textwidth,height=0.6\textheight,keepaspectratio]{genome_diversity_paper/fig1c.png} - \end{center} - \begin{itemize} - \item No populations studied have a higher Neanderthal ancestry than - East Asians - \end{itemize} -\end{frame} - -\begin{frame}{Denisovan Ancestry} - \begin{center} - \includegraphics[width=\textwidth,height=0.6\textheight,keepaspectratio]{genome_diversity_paper/fig1d.png} - \end{center} - \begin{itemize} 
- \item Oceanian groups have as much as 5\% Denisovan ancestry - \item Eurasian differences in ancestry; some South Asians may have - higher Denisovan than other Eurasians - \end{itemize} -\end{frame} -g - -\begin{frame}{Variation missed by hapmap} - \begin{center} - \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig_ed1.png} - \end{center} - \begin{itemize} - \item Hapmap is missing up to 8\% of the heterozygous sites in parts - of Africa - \end{itemize} -\end{frame} - -\begin{frame}{Cross-coalescence rate} - \begin{center} - \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig2a.png} - \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig2b.png} - \end{center} -\end{frame} - -\begin{frame}{Cross-coalescence rate} - \begin{center} - \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig2c.png} - \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig2d.png} - \end{center} -\end{frame} - -\begin{frame}{Effective Population Size} - \begin{center} - \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig2e.png} - \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig2f.png} - \end{center} -\end{frame} - -\begin{frame}{Best-fitting admixture Graph} - \begin{center} - \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig3.png} - \end{center} -\end{frame} - - -\section*{References} - -\begin{frame}[plain]{References} - \begin{center} - \mbox{}\vspace{-\baselineskip} - \printbibliography[heading=none] - \end{center} -\end{frame} - -\end{document} diff --git a/simons_genome_diversity_oct_2016.Rnw b/simons_genome_diversity_oct_2016.Rnw new file mode 100644 index 0000000..da88aec --- /dev/null 
+++ b/simons_genome_diversity_oct_2016.Rnw @@ -0,0 +1,277 @@ +\documentclass[ignorenonframetext]{beamer} +\usepackage{fontspec} +\setmainfont{FreeSerif} +\setsansfont{FreeSans} +\setmonofont{FreeMono} +\usepackage{url} +\usepackage{fancyhdr} +\usepackage{graphicx} +\usepackage[bf]{caption} +\usepackage{rotating} +\usepackage{wrapfig} +\usepackage{fancybox} +\usepackage{booktabs} +% \usepackage{multirow} +\usepackage{acronym} +\usepackage{qrcode} +\usepackage[backend=biber,natbib=true,hyperref=true,style=nature]{biblatex} +\addbibresource{references.bib} +% \usepackage[nomargin,inline,draft]{fixme} +% \newcommand{\DLA}[1]{\textcolor{red}{\fxnote{DLA: #1}}} +% \usepackage[hyperfigures,bookmarks,colorlinks,citecolor=black,filecolor=black,linkcolor=black,urlcolor=black]{hyperref} +\usepackage{texshade} +\usepackage{tikz} +\usepackage{nameref} +\usepackage{zref-xr,zref-user} +\renewcommand*{\bibfont}{\tiny} + +% The textpos package is necessary to position textblocks at arbitrary +% places on the page. Use showboxes option to show outlines of textboxes. 
+% \usepackage[absolute]{textpos} +\usepackage[absolute,overlay]{textpos} +\usepackage{mathtools,cancel} + +\renewcommand{\CancelColor}{\color{red}} %change cancel color to red + +\usepackage{multirow} +\usepackage{array} + +\usepackage{minted} +\usepackage{tcolorbox} +\usepackage{etoolbox} +\BeforeBeginEnvironment{minted}{\begin{tcolorbox}}% +\AfterEndEnvironment{minted}{\end{tcolorbox}}% + +\mode{ + \usetheme{CambridgeUS} + \usecolortheme{crane} + % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html + \definecolor{ilboldblue}{HTML}{002058} + \definecolor{ilboldorange}{HTML}{E87722} + \definecolor{ilblue}{HTML}{606EB2} + \definecolor{ilorange}{HTML}{D45D00} + \logo{\begin{tikzpicture}% Pale figure + {\node[opacity=0.1]{\IfFileExists{figures/uofi_mark.pdf}{\includegraphics[width=2cm,height=1cm,keepaspectratio]{figures/uofi_mark}}{}% + };}% + \end{tikzpicture}} +} + +% remove navigation symbols +\setbeamertemplate{navigation symbols}{} + +\title[Ancestry]{Simons Genome Diversity} +\author[Don Armstrong]{Don L. Armstrong} +\institute[IGB]{Institute for Genomic Biology, Computing Genomes + for Reproductive Health, University of Illinois, Urbana-Champaign} + +\begin{document} + +<>= +opts_chunk$set(dev="cairo_pdf",out.width="\\textwidth",out.height="0.8\\textheight",out.extra="keepaspectratio") +#opts_chunk$set(cache=TRUE, autodep=TRUE) +options(device = function(file, width = 8, height = 7, ...) { + cairo_pdf(tempfile(), width = width, height = height, ...) 
+}) +options(digits=2) +library("data.table") +library("ggplot2") +library("reshape2") +library("grid") +library("xtable") + +@ + +\IfFileExists{figures/relevant_xkcd.png}{\frame[plain]{\centering \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{figures/relevant_xkcd.png} + + \url{https://xkcd.com/1706/}}} + +\frame[plain]{\titlepage + \begin{center} + Code and slides are here: + + \qrcode[padding]{http://dla2.us/p/genomdiv2016} + + \url{http://dla2.us/p/genomdiv2016} + \end{center} +} + + +\frame[plain]{ + \begin{center} + \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/title.png} + \end{center} +} + +\begin{frame}{Sampling} + \begin{itemize} + \item 142 populations from Africa, America, Oceania, South Asia, + East Asia, and West Eurasia (mostly indigenous) + \item 300 samples sequenced at 34-83 fold coverage by Illumina + \item Aligned using BWA-MEM + \item Genotyped using special version of GATK and Fermikit + \item Data available in EBI ($n=279$, PRJEB9586) and dbGAP ($n=21$, ?) + \end{itemize} +\end{frame} + + +\begin{frame}[fragile]{Alignment Pipeline} + \begin{itemize} + \item Aligned to the “decoy” version of the human reference + (hs37d5); supposedly improves alignment in misassembled regions or + regions with CNVs? + \item PCR-free data, though they marked optical duplicates + using samblaster + \end{itemize} +\begin{minted}{bash} +./htscmd bamshuf -Oun128 in.bam tmp-pre \ +| ./htscmd bam2fq -as aln-se.fq.gz - \ +| ./trimadap \ +| ./bwa mem -pt8 hs37d5.fa - \ +| ./samblaster \ +| samtools view -uS - \ +| samtools sort -@4 -m512M - out-pre +\end{minted} +\end{frame} + +\begin{frame}[fragile]{Genotyping} + \begin{itemize} + \item Reference-bias; novel variants, GATK assumes reference is more + likely which may not be the case. Use prior of + $(0.4995,0.001,0.4995)$ instead of default + $(0.9985,0.001,0.0005)$. 
+ \begin{itemize} + \item Unclear what the effect of this change is on the calling + \item Maybe worth thinking about? + \end{itemize} + \item Also used Fermikit; apparently has comparable call rates to + GATK and platypus + \end{itemize} +\begin{minted}{bash} + java -Xmx2g -jar GenomeAnalysisTK.jar \ + -T UnifiedGenotyper -I srt.aln.bam \ + -L CHR_ID -R hs37d5.fa -dcov 600 -glm SNP \ + -out_mode EMIT_ALL_SITES -stand_call_conf 5.0 \ + -stand_emit_conf 5.0 -inputPrior 0.0010 \ + -inputPrior 0.4995 -D dbsnp_138.b37.vcf \ + -o CHR_ID.vcf -A GCContent -A BaseCounts +\end{minted} +\end{frame} + +\begin{frame}{Fermikit vs Platypus vs GATK} + \begin{columns} + \column{0.5\textwidth} + \includegraphics[width=\textwidth,height=0.7\textheight,keepaspectratio]{genome_diversity_paper/supplemental-014.png} + \column{0.5\textwidth} + FermiKit and Platypus call 3.17M more sites than GATK, but unclear + whether those are real sites or not; they go into this in much more + detail than I've digested yet. 
+\end{columns} +\end{frame} + +\begin{frame}{Relatedness of Populations} + \begin{columns} + \column{0.7\textwidth} + \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig1a_1.png} + \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig1a_2.png} + \column{0.3\textwidth} + \begin{itemize} + \item Neighbor joining tree based on pairwise divergence per nucleotide + \item Deepest splits are in African populations + \end{itemize} + \end{columns} +\end{frame} + +\begin{frame}{PCA and Relatedness} + \begin{center} + \includegraphics[width=\textwidth,height=0.7\textheight,keepaspectratio]{genome_diversity_paper/fig_ed4.png} + \end{center} + \begin{itemize} + \item Greatest variation seen in the African populations (orange) + \item Other populations are much more similar to each other in + general + \item Hapmap likely under-measured variation in Africa + \end{itemize} +\end{frame} + +\begin{frame} + \begin{center} + \includegraphics[width=\textwidth,height=0.6\textheight,keepaspectratio]{genome_diversity_paper/fig1b.png} + \end{center} + \begin{itemize} + \item Pygmy populations have lower X heterozygosity than other African populations + \item Seen even after removing the third of X which is subject to selection + \item Suggests that it's driven by demographic history, and the + reduced diversity is due to male-driven admixture (also in non-Africans) + \end{itemize} +\end{frame} + +\begin{frame}{Neanderthal Ancestry} + \begin{center} + \includegraphics[width=\textwidth,height=0.6\textheight,keepaspectratio]{genome_diversity_paper/fig1c.png} + \end{center} + \begin{itemize} + \item No populations studied have a higher Neanderthal ancestry than + East Asians + \end{itemize} +\end{frame} + +\begin{frame}{Denisovan Ancestry} + \begin{center} + \includegraphics[width=\textwidth,height=0.6\textheight,keepaspectratio]{genome_diversity_paper/fig1d.png} + \end{center} + \begin{itemize} 
+ \item Oceanian groups have as much as 5\% Denisovan ancestry + \item Eurasian differences in ancestry; some South Asians may have + higher Denisovan than other Eurasians + \end{itemize} +\end{frame} + + +\begin{frame}{Variation missed by hapmap} + \begin{center} + \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig_ed1.png} + \end{center} + \begin{itemize} + \item Hapmap is missing up to 8\% of the heterozygous sites in parts + of Africa + \end{itemize} +\end{frame} + +\begin{frame}{Cross-coalescence rate} + \begin{center} + \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig2a.png} + \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig2b.png} + \end{center} +\end{frame} + +\begin{frame}{Cross-coalescence rate} + \begin{center} + \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig2c.png} + \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig2d.png} + \end{center} +\end{frame} + +\begin{frame}{Effective Population Size} + \begin{center} + \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig2e.png} + \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig2f.png} + \end{center} +\end{frame} + +\begin{frame}{Best-fitting admixture Graph} + \begin{center} + \includegraphics[width=0.5\textwidth,height=0.8\textheight,keepaspectratio]{genome_diversity_paper/fig3.png} + \end{center} +\end{frame} + + +\section*{References} + +\begin{frame}[plain]{References} + \begin{center} + \mbox{}\vspace{-\baselineskip} + \printbibliography[heading=none] + \end{center} +\end{frame} + +\end{document}