X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=using_make_for_science.Rnw;h=4513c1101655e344357f5326fd3cb35eb6388caf;hb=18243c36fa98d8dccb0e7a496b842c66edd72b59;hp=4e0ae036915988317ebc8e1a3fcabf37265ad0ef;hpb=f7262f641eff7f5477fb20cbcdd69601150b74eb;p=using_make_for_science.git diff --git a/using_make_for_science.Rnw b/using_make_for_science.Rnw index 4e0ae03..4513c11 100644 --- a/using_make_for_science.Rnw +++ b/using_make_for_science.Rnw @@ -77,7 +77,7 @@ \setbeamercolor*{fine separation line}{} \setbeamercovered{transparent} \logo{\begin{tikzpicture}% Pale figure - {\node[opacity=0.7]{\IfFileExists{./logo.pdf}{\includegraphics[height=1.5cm]{logo.pdf}}{}% + {\node[opacity=0.7]{\IfFileExists{./logo.pdf}{\includegraphics[height=1cm,width=1cm,keepaspectratio]{logo.pdf}}{}% };}% \end{tikzpicture}} } @@ -89,6 +89,8 @@ \subject{make for science} \begin{document} +\IfFileExists{./relevant_xkcd.png}{\frame[plain]{\centering \includegraphics[width=\textwidth,height=\textheight,keepaspectratio]{./relevant_xkcd.png}}} + \frame[plain]{\titlepage} \mode
{\maketitle} @@ -133,12 +135,31 @@ \item Gradle (Rake DSL) \item Leiningen (Clojure) \item Tweaker (task definitions in any language) + \item Ruffus (Pipeline library for Python) \item \href{https://en.wikipedia.org/wiki/List_of_build_automation_software}{Wikipedia List of build automation software} \end{itemize} \end{frame} +\subsection{Why use GNU make?} +\begin{frame}{Why use GNU make?} + \begin{itemize} + \item Ubiquitous -- any machine on which you can run command-line tools + has GNU make available. + \item Large community -- lots of people use GNU make. It's not going + to go away tomorrow. + \item Simple rules -- all of the rules are in a simple text file + which is easily edited and version controlled + \item Reasonable debugging -- you can see the commands that make is + going to run fairly easily: \mintinline{shell}{make -n target;} + \item Parallel -- make can make targets in parallel: + \mintinline{shell}{make -j8 all;} + \item Language agnostic -- make doesn't care what language your code + is written in + \end{itemize} +\end{frame} + \section{Introduction to Makefiles} \begin{frame}[fragile]{Simple Makefile} @@ -193,7 +214,7 @@ TARGETS: PREREQUISITES \begin{frame}[fragile]{Some Functions} \begin{itemize} - \item \mintinline{make}{$(patsubst %.bam,%.sam,foo.sam bar.sam)} %$ + \item \mintinline{make}{$(patsubst %.sam,%.bam,foo.sam bar.sam)} %$ -- returns foo.bam bar.bam. 
\item \mintinline{make}{$(filter-out %.bam,foo.sam bar.bam)} %$ -- returns foo.sam @@ -250,7 +271,7 @@ clean: \begin{frame}[fragile]{Special Targets} \begin{minted}[showtabs]{make} -%.fasta.gz: %.fasta +%.fasta: %.fasta.gz gzip -dc $< > $@ %.bam: %.sam @@ -269,9 +290,12 @@ clean: \subsection{This Presentation} \begin{frame}[fragile]{How this presentation is made} -\inputminted[showtabs]{make}{Makefile} +\inputminted[showtabs,breaklines,firstline=3]{make}{Makefile} +\end{frame} +\begin{frame}[fragile]{How this presentation is made} \begin{itemize} - \item all is the default target + \item all is the default + \item Download the optional relevant\_xkcd.png \item Make .tex files from the knitr source. \item The third rule uses latexmk to build the pdf using \XeLaTeX. \end{itemize} @@ -290,7 +314,7 @@ clean: \begin{frame}[fragile]{Calling records from SRA: Downloading} \begin{minted}[showtabs,breaklines]{make} SRX=SRX007165 -SRRS=SRR020291 SRR020290 SRR020288 SRR020287 SRR020289 +SRRS=SRR020291 SRR020290 NREADS=1 SRR_FILES=$(patsubst %,%.sra,$(SRRS)) @@ -298,7 +322,8 @@ get_srr: $(SRR_FILES) $(SRR_FILES): %.sra: rsync -avP "rsync://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/$(shell echo -n $*|sed 's/\(SRR[0-9][0-9][0-9]\).*/\1/')/$*/$*.sra" $@; -\end{minted} %$ +\end{minted} +%$ \begin{itemize} \item First three lines are actually generated by other code and included @@ -315,7 +340,17 @@ FASTQ_FILES:=$(patsubst %,%_1.fastq.gz,$(SRRS)) $(patsubst %,%_2.fastq.gz,$(SRR endif make_fastq: $(FASTQ_FILES) +\end{minted} +%$ +\begin{itemize} +\item Use ifeq/else/endif to handle paired reads differently from + unpaired reads +\item FASTQ\_FILES is the full set of fastq files dumped from the SRAs. 
+\end{itemize} +\end{frame} +\begin{frame}[fragile]{Calling records from SRA: Dumping fastq #2} +\begin{minted}[showtabs,breaklines]{make} ifeq ($(NREADS),1) $(FASTQ_FILES): %.fastq.gz: %.sra else @@ -323,10 +358,11 @@ else endif $(MODULE) load sratoolkit/2.3.5-2; \ fastq-dump --split-3 --gzip $^; -\end{minted} %$ +\end{minted} +%$ \begin{itemize} - \item Call fastq-dump to dump the fastq files \item Handles NREADS of 1 and 2 differently + \item Call fastq-dump to dump the fastq files \end{itemize} \end{frame} @@ -343,7 +379,8 @@ $(SRX)_star.bam: --readFilesCommand "gzip -dc" \ --readFilesIn $(TOPHAT_FASTQ_ARGUMENT); ln $(SRX)_star/Aligned.*.bam $@ -s -\end{minted} %$ +\end{minted} +%$ \begin{itemize} \item Call STAR with lots of options to do the alignment \end{itemize} @@ -359,7 +396,8 @@ $(SRX)_genes.fpkm_tracking: $(SRX)_star.bam $(BOWTIE_INDEX_DIR)$(GTF) for file in genes.fpkm_tracking isoforms.fpkm_tracking skipped.gtf transcripts.gtf; do \ mv $${file} $(SRX)_$${file}; \ done; -\end{minted} %$ +\end{minted} +%$ \begin{itemize} \item Use cufflinks to call \end{itemize} @@ -367,18 +405,13 @@ $(SRX)_genes.fpkm_tracking: $(SRX)_star.bam $(BOWTIE_INDEX_DIR)$(GTF) \begin{frame}[fragile]{Run it on biocluster} \begin{minted}{shell} -for a in SRX*/Makefile; do - (cd $(dirname $a); - MAKE_TARGET=call qsub -q default \ - -v MAKE_TARGET -S /bin/bash \ - -d "$(pwd)" -l "nodes=1:ppn=8,mem=70G" \ - ~donarm/uiuc_igb_scripts/run_make - ); -done; -\end{minted} %$ +~donarm/uiuc_igb_scripts/dqsub --mem 70G \ + --ppn 8 make call; +\end{minted} \begin{itemize} -\item Use qsub and some shell scripting to run everything -\item Uses a special script which calls make with a specific target +\item dqsub is my own qsub wrapper which avoids me having to write + little scripts for everything +\item \url{http://git.donarmstrong.com/?p=uiuc_igb_scripts.git;a=blob;f=dqsub} \end{itemize} \end{frame} @@ -388,23 +421,59 @@ done; \begin{itemize} \item Timestamps, not MD5sums \item Complicated 
workflows + \item Interaction of rules can be complicated to understand + \item Yet Another Language \end{itemize} \end{frame} \subsection{Timestamps} -\begin{frame}{Dealing with timestamps} +\begin{frame}[fragile]{Dealing with timestamps} + \begin{minted}[showtabs,breaklines]{make} +TARGET: PREREQ1 PREREQ2 + if [ -e $@.tgt.md5sum ] && [ -e $@ ] \ + && md5sum --status --check \ + $@.tgt.md5sum; then \ + touch $@; \ + else \ + RECIPE FOR $@; \ + md5sum $^ > $@.tgt.md5sum; \ + fi; +\end{minted} +% $ +\begin{itemize} +\item Make builds things on the basis of timestamps +\item But what if the contents haven't changed and it's expensive to + rebuild? +\item Use md5sum! +\end{itemize} \end{frame} \subsection{Complicated Workflows} -\begin{frame}{Complicated Workflow} - +\begin{frame}[fragile]{What about complicated workflows?} + \begin{itemize} + \item If your workflow is really complicated, what then? + \item Use some other language to write your workflow in + \item Use a simple makefile which just runs the workflow + \end{itemize} + \begin{minted}[showtabs,breaklines]{make} +complicated_workflow_done: req1 req2 req3 + ./complicated_workflow.sh $^; + touch $@; +\end{minted} \end{frame} \section{Further Resources} \begin{frame}{Further Resources} + \begin{itemize} + \item GNU Make Manual: \url{https://www.gnu.org/software/make/manual/} + \item Mailing lists: \url{http://www.gnu.org/software/make/} + \item Stack Overflow: \url{http://stackoverflow.com/questions/tagged/make} + \item Myself: \href{mailto:don@donarmstrong.com}{don@donarmstrong.com} + \item This presentation: \url{http://git.donarmstrong.com/using\_make\_for\_science.git} + \end{itemize} \end{frame} \end{document}