% Using make for science -- beamer slides, with optional article-mode setup.
%
% Build notes:
%   * fontspec/ucharclasses require a Unicode engine; the slides themselves
%     say the PDF is built with latexmk and XeLaTeX.
%   * minted calls out to Pygments, so the engine normally has to be run with
%     -shell-escape (NOTE(review): confirm against the project's Makefile).
%   * Frames that contain minted/\mintinline are marked [fragile]; in those
%     frames \end{frame} has to stay alone on its own, unindented line.
%   * The trailing "%$" comments are kept from the original; presumably they
%     only rebalance "$" for editor syntax highlighting.
\documentclass[ignorenonframetext]{beamer}

% --- Fonts (Unicode engine) -------------------------------------------------
\usepackage[Symbols,MiscellaneousSymbols]{ucharclasses}
\usepackage{fontspec}
% \usepackage{bidi}
\setmainfont{FreeSerif}
\setsansfont{FreeSans}
\setmonofont{FreeMono}

% --- Packages used in every mode --------------------------------------------
\usepackage{array}
\usepackage{fancyref}
\usepackage{booktabs}
\usepackage{threeparttable}
\usepackage[backend=biber,natbib=true,hyperref=true,style=numeric-comp]{biblatex}
\bibliography{references}
\usepackage[nomargin,inline,draft]{fixme}
\usepackage{texshade}
\usepackage{tikz}
\usepackage{nameref}
\usepackage{zref-xr,zref-user}
\usepackage{metalogo}
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}

% --- Article mode only: page layout and article-specific packages -----------
\mode<article>
{
  \usepackage[x11names,svgnames,usenames,dvipsnames]{xcolor}
  \usepackage[noxcolor]{beamerarticle}
  \usepackage{fancyhdr}
  \usepackage{graphicx}
  \usepackage[bf]{caption}
  \usepackage{rotating}
  \usepackage{setspace}
  \usepackage{acronym}
  \usepackage{dcolumn}
  \usepackage{adjustbox}
  \usepackage{longtable}
  \usepackage{geometry}
  \usepackage{pdflscape}
  \usepackage[hyperfigures,bookmarks,colorlinks]{hyperref}
  \oddsidemargin 0.0in
  \textwidth 6.5in
  \raggedbottom
  \clubpenalty = 10000
  \widowpenalty = 10000
  \pagestyle{fancy}
}

% --- Code listings: every minted environment is wrapped in a tcolorbox ------
\usepackage{minted}
\usepackage{tcolorbox}
\usepackage{etoolbox}
\BeforeBeginEnvironment{minted}{\begin{tcolorbox}}%
\AfterEndEnvironment{minted}{\end{tcolorbox}}%

% --- Presentation mode only: theme, colours and logo ------------------------
\mode<presentation>{
  \usetheme{CambridgeUS}
  % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html
  \definecolor{ilboldblue}{HTML}{002058}
  \definecolor{ilboldorange}{HTML}{E87722}
  \definecolor{ilblue}{HTML}{606EB2}
  \definecolor{ilorange}{HTML}{D45D00}
  \setbeamercolor{alerted text}{fg=ilboldblue}
  \setbeamercolor*{palette primary}{fg=ilblue,bg=ilorange}
  \setbeamercolor*{palette secondary}{fg=ilblue!20!white,bg=ilorange}
  \setbeamercolor*{palette tertiary}{bg=ilblue,fg=ilorange}
  \setbeamercolor*{palette quaternary}{fg=ilblue,bg=ilorange}
  \setbeamercolor*{sidebar}{fg=ilorange,bg=ilboldblue}
  \setbeamercolor*{palette sidebar primary}{fg=ilblue!10!white,bg=ilorange}
  \setbeamercolor*{palette sidebar secondary}{fg=ilorange}
  \setbeamercolor*{palette sidebar tertiary}{fg=ilblue}
  \setbeamercolor*{palette sidebar quaternary}{fg=ilorange}
  % \setbeamercolor*{titlelike}{parent=palette primary}
  \setbeamercolor{titlelike}{parent=palette primary,fg=ilboldblue,bg=ilorange}
  \setbeamercolor{frametitle}{fg=ilboldorange,bg=ilblue!80!white}
  \setbeamercolor{frametitle right}{fg=ilboldblue,bg=ilorange}
  \setbeamercolor*{separation line}{}
  \setbeamercolor*{fine separation line}{}
  \setbeamercovered{transparent}
  % The logo is optional: nothing is drawn when ./logo.pdf is missing.
  \logo{\begin{tikzpicture}% Pale figure
      {\node[opacity=0.7]{\IfFileExists{./logo.pdf}{\includegraphics[height=1.5cm]{logo.pdf}}{}%
        };}%
    \end{tikzpicture}}
}

\title{Using make for science}
\author{Don Armstrong}
\date{\today}
\subject{make for science}

\begin{document}

\frame[plain]{\titlepage}

\mode<article>{\maketitle}

\section{What make was made for}

\begin{frame}{What was make originally made to do?}
  \begin{itemize}
  \item Compiling and installing software from source
  \item Replacement of operating system specific compilation and
    installation shell scripts
  \item Re-compile when dependencies of the software were modified
  \end{itemize}
\end{frame}

\subsection{Brief history of makes}

\begin{frame}{Brief history of make-alikes}
  \begin{itemize}
  \item \href{http://pubs.opengroup.org/onlinepubs/009695399/utilities/make.html}{POSIX Make}
    (standardization of basic features of make)
  \item \href{http://www.gnu.org/software/make/manual/}{GNU Make}
    (standard make on Linux and OS X)
  \item \href{https://www.freebsd.org/cgi/man.cgi?query=make(1)}{BSD Make}
    (pmake or bmake)
  \item \href{https://msdn.microsoft.com/en-us/library/dd9y37ha.aspx}{nmake}
    (Part of Visual Studio)
  \item \href{http://plan9.bell-labs.com/sys/doc/mk.html}{Mk}
    (Plan 9 replacement of make)
  \end{itemize}
\end{frame}

\subsection{Other solutions in this problem space}

\begin{frame}{Other non-make dependency builders}
  \begin{itemize}
  \item Ant (popular for Java software)
  \item Cabal (popular for Haskell)
  \item Maven (also Java)
  \item Rake (Ruby build tool)
  \item Gradle (Groovy DSL)
  \item Leiningen (Clojure)
  \item Tweaker (task definitions in any language)
  \item \href{https://en.wikipedia.org/wiki/List_of_build_automation_software}{Wikipedia
      List of build automation software}
  \end{itemize}
\end{frame}

\section{Introduction to Makefiles}

\begin{frame}[fragile]{Simple Makefile}
\begin{minted}[showtabs]{make}
hello_world:
	echo "hello world" > hello_world
\end{minted}
\end{frame}

\subsection{General Syntax}

\begin{frame}[fragile]{General Syntax}
\begin{minted}[showtabs]{make}
TARGETS: PREREQUISITES
	RECIPE
\end{minted}
  \begin{itemize}
  \item TARGETS are file names separated by spaces
  \item PREREQUISITES are file names separated by spaces.
  \item RECIPE lines start with a tab, are executed by the shell and
    describe how to make the TARGETS (generally from the PREREQUISITES)
  \item A TARGET is out of date if it does not exist or if it is older
    than any of the prerequisites.
  \end{itemize}
\end{frame}

\subsection{Variables}

\begin{frame}[fragile]{Some Variables}
  \begin{itemize}
  \item Two flavors of variables
    \begin{itemize}
    \item \mintinline{make}{FOO=bar} -- recursively expanded
      variables; references to other variables are expanded at the
      time this variable is expanded
    \item \mintinline{make}{FOO:=bar} -- simply expanded variables;
      the value is assigned at the moment the variable is created
    \end{itemize}
  \item Variables can come from the environment and can be overridden
    on the command line: \mintinline{shell}{make FOO=bleargh} or
    \mintinline{shell}{FOO=blah make}.
  \item \mintinline{make}{$@} -- target name %$
  \item \mintinline{make}{$*} -- current stem %$
  \item \mintinline{make}{$^} -- all prerequisites %$
  \item \mintinline{make}{$<} -- first prerequisite %$
  \item \mintinline{make}{$(FOO)} -- how variables are referenced %$
  \end{itemize}
\end{frame}

\subsection{Functions}

\begin{frame}[fragile]{Some Functions}
  \begin{itemize}
  \item \mintinline{make}{$(patsubst %.sam,%.bam,foo.sam bar.sam)} %$
    -- returns foo.bam bar.bam.
  \item \mintinline{make}{$(filter-out %.bam,foo.sam bar.bam)} %$
    -- returns foo.sam
  \item \mintinline{make}{$(words foo.sam bar.bam)} %$
    -- returns the number of words in its argument (2)
  \item \mintinline{make}{$(wordlist 1,2,foo.sam bar.bam bleargh.foo)} %$
    -- returns the words in its last argument starting with the 1st
    and ending with the second.
  \end{itemize}
\end{frame}

\subsection{Rules}

\subsubsection{Default Target}

\begin{frame}[fragile]{How does make know what to build?}
\begin{minted}[showtabs]{make}
first_target:
	touch $@

second_target: first_target
	touch $@
\end{minted}
  \begin{itemize}
  \item By default, make builds the first target.
  \item You can specify a specific target to build on the command line
    (\mintinline{shell}{make first_target}).
  \item You can change the default target by using the variable
    \mintinline{make}{.DEFAULT_GOAL := second_target}
  \end{itemize}
\end{frame}

\subsubsection{Special Targets}

\begin{frame}[fragile]{Special Targets}
\begin{minted}[showtabs]{make}
.PHONY: clean

clean:
	rm -f first_target second_target
\end{minted}
  \begin{itemize}
  \item \mintinline{make}{.PHONY} -- any time make considers this
    target, it is run unconditionally, even if a file exists.
  \item \mintinline{make}{.ONESHELL} -- when a target is built, all
    lines will be given to a single invocation of the shell.
  \item Lots of other special targets which are not described here.
  \end{itemize}
\end{frame}

\subsubsection{Pattern Rules}

\begin{frame}[fragile]{Pattern Rules}
\begin{minted}[showtabs]{make}
%.fasta: %.fasta.gz
	gzip -dc $< > $@

%.bam: %.sam
	samtools view -b -o $@ $<
\end{minted}
  \begin{itemize}
  \item \% is the pattern stem which is accessible by
    \mintinline{make}{$*} %$
    within rules
  \item The first rule uncompresses fasta files
  \item The second rule turns sam files into bam files
  \end{itemize}
\end{frame}

\section{Examples}

\subsection{This Presentation}

\begin{frame}[fragile]{How this presentation is made}
  \inputminted[showtabs]{make}{Makefile}
  \begin{itemize}
  \item all is the default target
  \item Make .tex files from the knitr source.
  \item The third rule uses latexmk to build the pdf using \XeLaTeX.
  \end{itemize}
\end{frame}

\subsection{Calling records from SRA}

\begin{frame}{Calling records from SRA: The problem}
  \begin{itemize}
  \item ≈200 tissue samples from Roadmap Epigenomics
  \item No consistent workflow
  \item Reanalyze them all using STAR and cufflinks
  \end{itemize}
\end{frame}

\begin{frame}[fragile]{Calling records from SRA: Downloading}
\begin{minted}[showtabs,breaklines]{make}
SRX=SRX007165
SRRS=SRR020291 SRR020290 SRR020288 SRR020287 SRR020289
NREADS=1
SRR_FILES=$(patsubst %,%.sra,$(SRRS))

get_srr: $(SRR_FILES)

$(SRR_FILES): %.sra:
	rsync -avP "rsync://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/$(shell echo -n $*|sed 's/\(SRR[0-9][0-9][0-9]\).*/\1/')/$*/$*.sra" $@;
\end{minted}
%$
  \begin{itemize}
  \item First three lines are actually generated by other code and
    included
  \item Download all of the SRA files using rsync
  \end{itemize}
\end{frame}

\begin{frame}[fragile]{Calling records from SRA: Dumping fastq}
\begin{minted}[showtabs,breaklines]{make}
ifeq ($(NREADS),1)
FASTQ_FILES:=$(patsubst %,%.fastq.gz,$(SRRS))
else
FASTQ_FILES:=$(patsubst %,%_1.fastq.gz,$(SRRS)) $(patsubst %,%_2.fastq.gz,$(SRRS))
endif

make_fastq: $(FASTQ_FILES)

ifeq ($(NREADS),1)
$(FASTQ_FILES): %.fastq.gz: %.sra
else
%_1.fastq.gz %_2.fastq.gz: %.sra
endif
	$(MODULE) load sratoolkit/2.3.5-2; \
	fastq-dump --split-3 --gzip $^;
\end{minted}
%$
  \begin{itemize}
  \item Call fastq-dump to dump the fastq files
  \item Handles NREADS of 1 and 2 differently
  \end{itemize}
\end{frame}

\begin{frame}[fragile]{Calling records from SRA: Align with STAR}
\begin{minted}[showtabs,breaklines]{make}
$(SRX)_star.bam:
	$(MODULE) load STAR/2.4.2a; \
	mkdir -p $(SRX)_star; \
	STAR --outFileNamePrefix $(SRX)_star/ \
		--outSAMtype BAM SortedByCoordinate \
		--runThreadN $(CORES) \
		--outSAMstrandField intronMotif \
		--genomeDir $(STAR_INDEX_DIR) \
		--readFilesCommand "gzip -dc" \
		--readFilesIn $(TOPHAT_FASTQ_ARGUMENT);
	ln $(SRX)_star/Aligned.*.bam $@ -s
\end{minted}
%$
  \begin{itemize}
  \item Call STAR with lots of options to do the alignment
  \end{itemize}
\end{frame}

\begin{frame}[fragile]{Calling records from SRA: Call with cufflinks}
\begin{minted}[showtabs,breaklines]{make}
call: $(SRX)_genes.fpkm_tracking

$(SRX)_genes.fpkm_tracking: $(SRX)_star.bam $(BOWTIE_INDEX_DIR)$(GTF)
	$(MODULE) load cufflinks/2.2.1; \
	cufflinks -p $(CORES) -G $(wordlist 2,2,$^) $<
	for file in genes.fpkm_tracking isoforms.fpkm_tracking skipped.gtf transcripts.gtf; do \
		mv $${file} $(SRX)_$${file}; \
	done;
\end{minted}
%$
  \begin{itemize}
  \item Use cufflinks to call
  \end{itemize}
\end{frame}

\begin{frame}[fragile]{Run it on biocluster}
\begin{minted}{shell}
for a in SRX*/Makefile; do
    (cd $(dirname $a);
     MAKE_TARGET=call qsub -q default \
       -v MAKE_TARGET -S /bin/bash \
       -d "$(pwd)" -l "nodes=1:ppn=8,mem=70G" \
       ~donarm/uiuc_igb_scripts/run_make
    );
done;
\end{minted}
%$
  \begin{itemize}
  \item Use qsub and some shell scripting to run everything
  \item Uses a special script which calls make with a specific target
  \end{itemize}
\end{frame}

\section{Why not make?}

\begin{frame}{Why not make?}
  \begin{itemize}
  \item Timestamps, not MD5sums
  \item Complicated workflows
  \end{itemize}
\end{frame}

\subsection{Timestamps}

\begin{frame}{Dealing with timestamps}
\end{frame}

\subsection{Complicated Workflows}

\begin{frame}{Complicated Workflow}
\end{frame}

\section{Further Resources}

\begin{frame}{Further Resources}
\end{frame}

\end{document}