X-Git-Url: https://git.donarmstrong.com/?p=using_make_for_science.git;a=blobdiff_plain;f=using_make_for_science.Rnw;h=8ff99d10871fb6470e71e3d43a677c3f6518bc0e;hp=79034541d234b605c9e182ba2f66137b58d8dce3;hb=965452df6fbe769c2733543b8880db096a1fb850;hpb=bdd63bfdd95e9ca6308584f316552d891fdd75f8 diff --git a/using_make_for_science.Rnw b/using_make_for_science.Rnw index 7903454..8ff99d1 100644 --- a/using_make_for_science.Rnw +++ b/using_make_for_science.Rnw @@ -17,6 +17,7 @@ \usepackage{tikz} \usepackage{nameref} \usepackage{zref-xr,zref-user} +\usepackage{metalogo} \IfFileExists{upquote.sty}{\usepackage{upquote}}{} \mode
{ \usepackage[x11names,svgnames,usenames,dvipsnames]{xcolor} @@ -42,6 +43,10 @@ } \usepackage{minted} +\usepackage{tcolorbox} +\usepackage{etoolbox} +\BeforeBeginEnvironment{minted}{\begin{tcolorbox}}% +\AfterEndEnvironment{minted}{\end{tcolorbox}}% \mode{ \usetheme{CambridgeUS} @@ -128,12 +133,31 @@ \item Gradle (Rake DSL) \item Leiningen (Clojure) \item Tweaker (task definitions in any language) + \item Ruffus (Pipeline library for Python) \item \href{https://en.wikipedia.org/wiki/List_of_build_automation_software}{Wikipedia List of build automation software} \end{itemize} \end{frame} +\subsection{Why use GNU make?} +\begin{frame}{Why use GNU make?} + \begin{itemize} + \item Ubiquitous -- any machine which you can run command line tools + on has GNU make available. + \item Large community -- lots of people use GNU make. It's well + understood and you can get questions answered + \item Simple rules -- all of the rules are in a simple text file + which is easily edited and version controlled + \item Reasonable debugging -- you can see the commands that make is + going to run fairly easily: \mintinline{shell}{make -n tgt;} + \item Parallel -- make can make targets in parallel: + \mintinline{shell}{make -j8 all;} + \item Language agnostic -- make doesn't care what language your code + is written in + \end{itemize} +\end{frame} + \section{Introduction to Makefiles} \begin{frame}[fragile]{Simple Makefile} @@ -141,46 +165,293 @@ hello_world: echo "hello world" > hello_world \end{minted} + \end{frame} \subsection{General Syntax} - \begin{frame}[fragile]{General Syntax} \begin{minted}[showtabs]{make} -hello_world: - echo "hello world" > hello_world +TARGETS: PREREQUISITES + RECIPE \end{minted} +\begin{itemize} +\item TARGETS are file names separated by spaces +\item PREREQUISITES are file names separated by spaces. 
+\item RECIPE lines start with a tab, are executed by the shell and + describe how to make the TARGETS (generally from the PREREQUISITES) +\item A TARGET is out of date if it does not exist or if it is older + than any of the prerequisites. +\end{itemize} \end{frame} \subsection{Variables} +\begin{frame}[fragile]{Some Variables} +\begin{itemize} +\item Two flavors of variables + \begin{itemize} + \item \mintinline{make}{FOO=bar} -- recursively expanded variables; + references to other variables are expanded at the time this + variable is expanded + \item \mintinline{make}{FOO:=bar} -- simply expanded variables; the + value is assigned at the moment the variable is created + \end{itemize} +\item Variables can come from the environment and can be overridden on + the command line: \mintinline{shell}{make FOO=bleargh} or + \mintinline{shell}{FOO=blah make}. +\item \mintinline{make}{$@} -- target name %$ +\item \mintinline{make}{$*} -- current stem %$ +\item \mintinline{make}{$^} -- all prerequisites %$ +\item \mintinline{make}{$<} -- first prerequisite %$ +\item \mintinline{make}{$(FOO)} -- how variables are referenced %$ +\end{itemize} +\end{frame} + +\subsection{Functions} + +\begin{frame}[fragile]{Some Functions} + \begin{itemize} + \item \mintinline{make}{$(patsubst %.sam,%.bam,foo.sam bar.sam)} %$ + -- returns foo.bam bar.bam. + \item \mintinline{make}{$(filter-out %.bam,foo.sam bar.bam)} %$ + -- returns foo.sam + \item \mintinline{make}{$(words foo.sam bar.bam)} %$ + -- returns the number of words in its argument (2) + \item \mintinline{make}{$(wordlist 1,2,foo.sam bar.bam bleargh.foo)} %$ + -- returns the words in its last argument starting with the 1st + and ending with the 2nd (foo.sam bar.bam). 
+ \end{itemize} +\end{frame} + \subsection{Rules} \subsubsection{Default Target} +\begin{frame}[fragile]{How does make know what to build?} +\begin{minted}[showtabs]{make} +first_tgt: + touch $@ +second_tgt: first_tgt + touch $@ +\end{minted} + \begin{itemize} + \item By default, make builds the first target. + \item You can specify a particular target to build on the command line + (\mintinline{shell}{make first_tgt}). + \item You can change the default target by using the variable + \mintinline{make}{.DEFAULT_GOAL := second_tgt} + \end{itemize} +\end{frame} + + \subsubsection{Special Targets} +\begin{frame}[fragile]{Special Targets} +\begin{minted}[showtabs]{make} +.PHONY: clean + +clean: + rm -f first_tgt second_tgt +\end{minted} + \begin{itemize} + \item \mintinline{make}{.PHONY} -- any time make considers this + target, its recipe is run unconditionally, even if a file with that name exists. + \item \mintinline{make}{.ONESHELL} -- when a target is built, all + lines will be given to a single invocation of the shell. + \item Lots of other special targets which are not described here. + \end{itemize} +\end{frame} + + \subsubsection{Pattern Rules} -\subsection{Functions} +\begin{frame}[fragile]{Pattern Rules} +\begin{minted}[showtabs]{make} +%.fasta: %.fasta.gz + gzip -dc $< > $@ + +%.bam: %.sam + samtools view -b -o $@ $< +\end{minted} + \begin{itemize} + \item \% is the pattern stem which is accessible by + \mintinline{make}{$*} within rules %$ + \item The first rule uncompresses fasta files + \item The second rule turns sam files into bam files + \end{itemize} +\end{frame} \section{Examples} \subsection{This Presentation} -\subsection{Can you dig it?} +\begin{frame}[fragile]{How this presentation is made} +\inputminted[showtabs]{make}{Makefile} + \begin{itemize} + \item all is the default target + \item Make .tex files from the knitr source. + \item The third rule uses latexmk to build the pdf using \XeLaTeX. 
+ \end{itemize} +\end{frame} \subsection{Calling records from SRA} +\begin{frame}{Calling records from SRA: The problem} + \begin{itemize} + \item ≈200 tissue samples from Roadmap Epigenomics + \item No consistent workflow + \item Reanalyze them all using STAR and cufflinks + \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Calling records from SRA: Downloading} +\begin{minted}[showtabs,breaklines]{make} +SRX=SRX007165 +SRRS=SRR020291 SRR020290 +NREADS=1 +SRR_FILES=$(patsubst %,%.sra,$(SRRS)) + +get_srr: $(SRR_FILES) + +$(SRR_FILES): %.sra: + rsync -avP "rsync://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/$(shell echo -n $*|sed 's/\(SRR[0-9][0-9][0-9]\).*/\1/')/$*/$*.sra" $@; +\end{minted} +%$ + \begin{itemize} + \item First three lines are actually generated by other code and + included + \item Download all of the SRA files using rsync + \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Calling records from SRA: Dumping fastq} +\begin{minted}[showtabs,breaklines]{make} +ifeq ($(NREADS),1) +FASTQ_FILES:=$(patsubst %,%.fastq.gz,$(SRRS)) +else +FASTQ_FILES:=$(patsubst %,%_1.fastq.gz,$(SRRS)) $(patsubst %,%_2.fastq.gz,$(SRRS)) +endif + +make_fastq: $(FASTQ_FILES) + +ifeq ($(NREADS),1) +$(FASTQ_FILES): %.fastq.gz: %.sra +else +%_1.fastq.gz %_2.fastq.gz: %.sra +endif + $(MODULE) load sratoolkit/2.3.5-2; \ + fastq-dump --split-3 --gzip $^; +\end{minted} +%$ + \begin{itemize} + \item Call fastq-dump to dump the fastq files + \item Handles NREADS of 1 and 2 differently + \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Calling records from SRA: Align with STAR} +\begin{minted}[showtabs,breaklines]{make} +$(SRX)_star.bam: + $(MODULE) load STAR/2.4.2a; \ + mkdir -p $(SRX)_star; \ + STAR --outFileNamePrefix $(SRX)_star/ \ + --outSAMtype BAM SortedByCoordinate \ + --runThreadN $(CORES) \ + --outSAMstrandField intronMotif \ + --genomeDir $(STAR_INDEX_DIR) \ + --readFilesCommand "gzip -dc" \ + --readFilesIn $(TOPHAT_FASTQ_ARGUMENT); + ln 
$(SRX)_star/Aligned.*.bam $@ -s +\end{minted} +%$ + \begin{itemize} + \item Call STAR with lots of options to do the alignment + \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Calling records from SRA: Call with cufflinks} +\begin{minted}[showtabs,breaklines]{make} +call: $(SRX)_genes.fpkm_tracking + +$(SRX)_genes.fpkm_tracking: $(SRX)_star.bam $(BOWTIE_INDEX_DIR)$(GTF) + $(MODULE) load cufflinks/2.2.1; \ + cufflinks -p $(CORES) -G $(wordlist 2,2,$^) $< + for file in genes.fpkm_tracking isoforms.fpkm_tracking skipped.gtf transcripts.gtf; do \ + mv $${file} $(SRX)_$${file}; \ + done; +\end{minted} +%$ + \begin{itemize} + \item Use cufflinks to call gene and isoform expression levels (FPKM) + \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Run it on biocluster} + \begin{minted}{shell} +~donarm/uiuc_igb_scripts/dqsub --mem 70G \ + --ppn 8 make call; +\end{minted} +\begin{itemize} +\item dqsub is my own qsub wrapper which avoids me having to write + little scripts for everything +\item \url{http://git.donarmstrong.com/?p=uiuc_igb_scripts.git;a=blob;f=dqsub} +\end{itemize} +\end{frame} + \section{Why not make?} +\begin{frame}{Why not make?} + \begin{itemize} + \item Timestamps, not MD5sums + \item Complicated workflows + \end{itemize} +\end{frame} + \subsection{Timestamps} +\begin{frame}[fragile]{Dealing with timestamps} + \begin{minted}[showtabs,breaklines]{make} +TARGET: PREREQ1 PREREQ2 + if [ -e $@.tgt.md5sum ] && [ -e $@ ] \ + && md5sum --status --check \ + $@.tgt.md5sum; then \ + touch $@; \ + else \ + RECIPE FOR $@; \ + md5sum $^ > $@.tgt.md5sum; \ + fi; +\end{minted} +% $ +\begin{itemize} +\item Make builds things on the basis of timestamps +\item But what if the contents haven't changed and it's expensive to + rebuild? +\item Use md5sum! +\end{itemize} +\end{frame} + \subsection{Complicated Workflows} +\begin{frame}{What about complicated workflows?} + \begin{itemize} + \item If your workflow is really complicated, what then? 
+ \item Use some other language to write your workflow in + \item Use a simple makefile which just runs the workflow + \end{itemize} +\end{frame} + \section{Further Resources} +\begin{frame}{Further Resources} + \begin{itemize} + \item GNU Make Manual: \url{https://www.gnu.org/software/make/manual/} + \item Mailing lists: \url{http://www.gnu.org/software/make/} + \item Stack Overflow: \url{http://stackoverflow.com/questions/tagged/make} + \item Myself: \href{mailto:don@donarmstrong.com}{don@donarmstrong.com} + \item This presentation: \url{http://git.donarmstrong.com/using\_make\_for\_science.git} + \end{itemize} +\end{frame} \end{document}