\documentclass[ignorenonframetext]{beamer}
\usepackage[Symbols,MiscellaneousSymbols]{ucharclasses}
\usepackage{fontspec}
% \usepackage{bidi}
\setmainfont{FreeSerif}
\setsansfont{FreeSans}
\setmonofont{FreeMono}
\usepackage{array}
\usepackage{fancyref}
\usepackage{booktabs}
\usepackage{threeparttable}
\usepackage[backend=biber,natbib=true,hyperref=true,style=numeric-comp]{biblatex}
\bibliography{references}
\usepackage[nomargin,inline,draft]{fixme}
\usepackage{texshade}
\usepackage{tikz}
\usepackage{nameref}
\usepackage{zref-xr,zref-user}
\usepackage{metalogo}
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\mode<article>
{
  \usepackage[x11names,svgnames,usenames,dvipsnames]{xcolor}
  \usepackage[noxcolor]{beamerarticle}
  \usepackage{fancyhdr}
  \usepackage{graphicx}
  \usepackage[bf]{caption}
  \usepackage{rotating}
  \usepackage{setspace}
  \usepackage{acronym}
  \usepackage{dcolumn}
  \usepackage{adjustbox}
  \usepackage{longtable}
  \usepackage{geometry}
  \usepackage{pdflscape}
  \usepackage[hyperfigures,bookmarks,colorlinks]{hyperref}
  \oddsidemargin 0.0in
  \textwidth 6.5in
  \raggedbottom
  \clubpenalty = 10000
  \widowpenalty = 10000
  \pagestyle{fancy}
}
\usepackage{minted}
\usepackage{tcolorbox}
\usepackage{etoolbox}
\BeforeBeginEnvironment{minted}{\begin{tcolorbox}}%
\AfterEndEnvironment{minted}{\end{tcolorbox}}%
\mode<presentation>{
  \usetheme{CambridgeUS}
  % http://identitystandards.illinois.edu/graphicstandardsmanual/generalguidelines/colors.html
  \definecolor{ilboldblue}{HTML}{002058}
  \definecolor{ilboldorange}{HTML}{E87722}
  \definecolor{ilblue}{HTML}{606EB2}
  \definecolor{ilorange}{HTML}{D45D00}
  \setbeamercolor{alerted text}{fg=ilboldblue}
  \setbeamercolor*{palette primary}{fg=ilblue,bg=ilorange}
  \setbeamercolor*{palette secondary}{fg=ilblue!20!white,bg=ilorange}
  \setbeamercolor*{palette tertiary}{bg=ilblue,fg=ilorange}
  \setbeamercolor*{palette quaternary}{fg=ilblue,bg=ilorange}
  \setbeamercolor*{sidebar}{fg=ilorange,bg=ilboldblue}
  \setbeamercolor*{palette sidebar primary}{fg=ilblue!10!white,bg=ilorange}
  \setbeamercolor*{palette sidebar secondary}{fg=ilorange}
  \setbeamercolor*{palette sidebar tertiary}{fg=ilblue}
  \setbeamercolor*{palette sidebar quaternary}{fg=ilorange}
  % \setbeamercolor*{titlelike}{parent=palette primary}
  \setbeamercolor{titlelike}{parent=palette primary,fg=ilboldblue,bg=ilorange}
  \setbeamercolor{frametitle}{fg=ilboldorange,bg=ilblue!80!white}
  \setbeamercolor{frametitle right}{fg=ilboldblue,bg=ilorange}
  \setbeamercolor*{separation line}{}
  \setbeamercolor*{fine separation line}{}
  \setbeamercovered{transparent}
  \logo{\begin{tikzpicture}% Pale figure
{\node[opacity=0.7]{\IfFileExists{./logo.pdf}{\includegraphics[height=1cm,width=1cm,keepaspectratio]{logo.pdf}}{}%
};}%
\end{tikzpicture}}
}
\title{Using make for science}
\author{Don Armstrong}
\date{\today}
\subject{make for science}
\begin{document}
\IfFileExists{./relevant_xkcd.png}{\frame[plain]{\centering \includegraphics[width=\textwidth,height=\textheight,keepaspectratio]{./relevant_xkcd.png}}}{}
\frame[plain]{\titlepage}
\mode<article>
{\maketitle}

\section{What make was made for}
\begin{frame}{What was make originally made to do?}
\begin{itemize}
\item Compiling and installing software from source
\item Replacement of operating system specific compilation and installation shell scripts
\item Re-compile when dependencies of the software were modified
\end{itemize}
\end{frame}

\subsection{Brief history of makes}
\begin{frame}{Brief history of make-alikes}
\begin{itemize}
\item \href{http://pubs.opengroup.org/onlinepubs/009695399/utilities/make.html}{POSIX Make} (standardization of basic features of make)
\item \href{http://www.gnu.org/software/make/manual/}{GNU Make} (standard make on Linux and OS X)
\item \href{https://www.freebsd.org/cgi/man.cgi?query=make(1)}{BSD Make} (pmake or bmake)
\item \href{https://msdn.microsoft.com/en-us/library/dd9y37ha.aspx}{nmake} (part of Visual Studio)
\item \href{http://plan9.bell-labs.com/sys/doc/mk.html}{Mk} (Plan 9 replacement of make)
\end{itemize}
\end{frame}

\subsection{Other solutions in this problem space}
\begin{frame}{Other non-make dependency builders}
\begin{itemize}
\item Ant (popular for Java software)
\item Cabal (popular for Haskell)
\item Maven (also Java)
\item Rake (Ruby build tool)
\item Gradle (Groovy DSL)
\item Leiningen (Clojure)
\item Tweaker (task definitions in any language)
\item Ruffus (Pipeline library for Python)
\item \href{https://en.wikipedia.org/wiki/List_of_build_automation_software}{Wikipedia List of build automation software}
\end{itemize}
\end{frame}

\subsection{Why use GNU make?}
\begin{frame}{Why use GNU make?}
\begin{itemize}
\item Ubiquitous -- any machine which you can run command line tools on has GNU make available.
\item Large community -- lots of people use GNU make. It's not going to go away tomorrow.
\item Simple rules -- all of the rules are in a simple text file which is easily edited and version controlled
\item Reasonable debugging -- you can see the commands that make is going to run fairly easily: \mintinline{shell}{make -n target;}
\item Parallel -- make can make targets in parallel: \mintinline{shell}{make -j8 all;}
\item Language agnostic -- make doesn't care what language your code is written in
\end{itemize}
\end{frame}

\section{Introduction to Makefiles}
\begin{frame}[fragile]{Simple Makefile}
\begin{minted}[showtabs]{make}
hello_world:
	echo "hello world" > hello_world
\end{minted}
\end{frame}

\subsection{General Syntax}
\begin{frame}[fragile]{General Syntax}
\begin{minted}[showtabs]{make}
TARGETS: PREREQUISITES
	RECIPE
\end{minted}
\begin{itemize}
\item TARGETS are file names separated by spaces.
\item PREREQUISITES are file names separated by spaces.
\item RECIPE lines start with a tab, are executed by the shell and describe how to make the TARGETS (generally from the PREREQUISITES)
\item A TARGET is out of date if it does not exist or if it is older than any of the prerequisites.
\end{itemize}
\end{frame}

\subsection{Variables}
\begin{frame}[fragile]{Some Variables}
\begin{itemize}
\item Two flavors of variables
  \begin{itemize}
  \item \mintinline{make}{FOO=bar} -- recursively expanded variables; references to other variables are expanded at the time this variable is expanded
  \item \mintinline{make}{FOO:=bar} -- simply expanded variables; the value is assigned at the moment the variable is created
  \end{itemize}
\item Variables can come from the environment and can be overridden on the command line: \mintinline{shell}{FOO=blah make} or \mintinline{shell}{make FOO=bleargh}.
\item \mintinline{make}{$@} -- target name %$
\item \mintinline{make}{$*} -- current stem %$
\item \mintinline{make}{$^} -- all prerequisites %$
\item \mintinline{make}{$<} -- first prerequisite %$
\item \mintinline{make}{$(FOO)} -- how variables are referenced %$
\end{itemize}
\end{frame}

\subsection{Functions}
\begin{frame}[fragile]{Some Functions}
\begin{itemize}
\item \mintinline{make}{$(patsubst %.sam,%.bam,foo.sam bar.sam)} %$
  -- returns foo.bam bar.bam.
\item \mintinline{make}{$(filter-out %.bam,foo.sam bar.bam)} %$
  -- returns foo.sam
\item \mintinline{make}{$(words foo.sam bar.bam)} %$
  -- returns the number of words in its argument (2)
\item \mintinline{make}{$(wordlist 1,2,foo.sam bar.bam bleargh.foo)} %$
  -- returns the words in its last argument starting with the first and ending with the second.
\end{itemize}
\end{frame}

\subsection{Rules}
\subsubsection{Default Target}
\begin{frame}[fragile]{How does make know what to build?}
\begin{minted}[showtabs]{make}
first_target:
	touch $@

second_target: first_target
	touch $@
\end{minted}
\begin{itemize}
\item By default, make builds the first target.
\item You can specify a specific target to build on the command line (\mintinline{shell}{make first_target}).
\item You can change the default target by using the variable \mintinline{make}{.DEFAULT_GOAL := second_target}
\end{itemize}
\end{frame}

\subsubsection{Special Targets}
\begin{frame}[fragile]{Special Targets}
\begin{minted}[showtabs]{make}
.PHONY: clean
clean:
	rm -f first_target second_target
\end{minted}
\begin{itemize}
\item \mintinline{make}{.PHONY} -- any time make considers this target, it is run unconditionally, even if a file exists.
\item \mintinline{make}{.ONESHELL} -- when a target is built, all lines will be given to a single invocation of the shell.
\item Lots of other special targets which are not described here.
\end{itemize}
\end{frame}

\subsubsection{Pattern Rules}
\begin{frame}[fragile]{Pattern Rules}
\begin{minted}[showtabs]{make}
%.fasta: %.fasta.gz
	gzip -dc $< > $@

%.bam: %.sam
	samtools view -b -o $@ $<
\end{minted}
\begin{itemize}
\item \% is the pattern stem which is accessible by \mintinline{make}{$*} %$
  within rules
\item The first rule uncompresses fasta files
\item The second rule turns sam files into bam files
\end{itemize}
\end{frame}

\section{Examples}
\subsection{This Presentation}
\begin{frame}[fragile]{How this presentation is made}
\inputminted[showtabs,breaklines,firstline=3]{make}{Makefile}
\end{frame}

\begin{frame}[fragile]{How this presentation is made}
\begin{itemize}
\item all is the default
\item Download the optional relevant\_xkcd.png
\item Make .tex files from the knitr source.
\item The third rule uses latexmk to build the pdf using \XeLaTeX.
\end{itemize}
\end{frame}

\subsection{Calling records from SRA}
\begin{frame}{Calling records from SRA: The problem}
\begin{itemize}
\item ≈200 tissue samples from Roadmap Epigenomics
\item No consistent workflow
\item Reanalyze them all using STAR and cufflinks
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Calling records from SRA: Downloading}
\begin{minted}[showtabs,breaklines]{make}
SRX=SRX007165
SRRS=SRR020291 SRR020290
NREADS=1
SRR_FILES=$(patsubst %,%.sra,$(SRRS))

get_srr: $(SRR_FILES)

$(SRR_FILES): %.sra:
	rsync -avP "rsync://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/$(shell echo -n $*|sed 's/\(SRR[0-9][0-9][0-9]\).*/\1/')/$*/$*.sra" $@;
\end{minted}
%$
\begin{itemize}
\item First three lines are actually generated by other code and included
\item Download all of the SRA files using rsync
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Calling records from SRA: Dumping fastq}
\begin{minted}[showtabs,breaklines]{make}
ifeq ($(NREADS),1)
FASTQ_FILES:=$(patsubst %,%.fastq.gz,$(SRRS))
else
FASTQ_FILES:=$(patsubst %,%_1.fastq.gz,$(SRRS)) $(patsubst %,%_2.fastq.gz,$(SRRS))
endif

make_fastq: $(FASTQ_FILES)
\end{minted}
%$
\begin{itemize}
\item Use ifeq/else/endif to handle paired reads differently from unpaired reads
\item FASTQ\_FILES is the full set of fastq files dumped from the SRAs.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Calling records from SRA: Dumping fastq \#2}
\begin{minted}[showtabs,breaklines]{make}
ifeq ($(NREADS),1)
$(FASTQ_FILES): %.fastq.gz: %.sra
else
%_1.fastq.gz %_2.fastq.gz: %.sra
endif
	$(MODULE) load sratoolkit/2.3.5-2; \
	fastq-dump --split-3 --gzip $^;
\end{minted}
%$
\begin{itemize}
\item Handles NREADS of 1 and 2 differently
\item Call fastq-dump to dump the fastq files
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Calling records from SRA: Align with STAR}
\begin{minted}[showtabs,breaklines]{make}
$(SRX)_star.bam:
	$(MODULE) load STAR/2.4.2a; \
	mkdir -p $(SRX)_star; \
	STAR --outFileNamePrefix $(SRX)_star/ \
	  --outSAMtype BAM SortedByCoordinate \
	  --runThreadN $(CORES) \
	  --outSAMstrandField intronMotif \
	  --genomeDir $(STAR_INDEX_DIR) \
	  --readFilesCommand "gzip -dc" \
	  --readFilesIn $(TOPHAT_FASTQ_ARGUMENT);
	ln $(SRX)_star/Aligned.*.bam $@ -s
\end{minted}
%$
\begin{itemize}
\item Call STAR with lots of options to do the alignment
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Calling records from SRA: Call with cufflinks}
\begin{minted}[showtabs,breaklines]{make}
call: $(SRX)_genes.fpkm_tracking

$(SRX)_genes.fpkm_tracking: $(SRX)_star.bam $(BOWTIE_INDEX_DIR)$(GTF)
	$(MODULE) load cufflinks/2.2.1; \
	cufflinks -p $(CORES) -G $(wordlist 2,2,$^) $<
	for file in genes.fpkm_tracking isoforms.fpkm_tracking skipped.gtf transcripts.gtf; do \
	  mv $${file} $(SRX)_$${file}; \
	done;
\end{minted}
%$
\begin{itemize}
\item Use cufflinks to call
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Run it on biocluster}
\begin{minted}{shell}
~donarm/uiuc_igb_scripts/dqsub --mem 70G \
  --ppn 8 make call;
\end{minted}
\begin{itemize}
\item dqsub is my own qsub wrapper which avoids me having to write little scripts for everything
\item \url{http://git.donarmstrong.com/?p=uiuc_igb_scripts.git;a=blob;f=dqsub}
\end{itemize}
\end{frame}

\section{Why not make?}
\begin{frame}{Why not make?}
\begin{itemize}
\item Timestamps, not MD5sums
\item Complicated workflows
\item Interaction of rules can be complicated to understand
\item Yet Another Language
\end{itemize}
\end{frame}

\subsection{Timestamps}
\begin{frame}[fragile]{Dealing with timestamps}
\begin{minted}[showtabs,breaklines]{make}
TARGET: PREREQ1 PREREQ2
	if [ -e $@.tgt.md5sum ] && [ -e $@ ] \
	   && md5sum --status --check \
	   $@.tgt.md5sum; then \
	  touch $@; \
	else \
	  RECIPE FOR $@; \
	  md5sum $^ > $@.tgt.md5sum; \
	fi;
\end{minted}
% $
\begin{itemize}
\item Make builds things on the basis of timestamps
\item But what if the contents haven't changed and it's expensive to rebuild?
\item Use md5sum!
\end{itemize}
\end{frame}

\subsection{Complicated Workflows}
\begin{frame}[fragile]{What about complicated workflows?}
\begin{itemize}
\item If your workflow is really complicated, what then?
\item Use some other language to write your workflow in
\item Use a simple makefile which just runs the workflow
\end{itemize}
\begin{minted}[showtabs,breaklines]{make}
complicated_workflow_done: req1 req2 req3
	./complicated_workflow.sh $^;
	touch $@;
\end{minted}
\end{frame}

\section{Further Resources}
\begin{frame}{Further Resources}
\begin{itemize}
\item GNU Make Manual: \url{https://www.gnu.org/software/make/manual/}
\item Mailing lists: \url{http://www.gnu.org/software/make/}
\item Stack Overflow: \url{http://stackoverflow.com/questions/tagged/make}
\item Myself: \href{mailto:don@donarmstrong.com}{don@donarmstrong.com}
\item This presentation: \url{http://git.donarmstrong.com/using\_make\_for\_science.git}
\end{itemize}
\end{frame}
\end{document}