X-Git-Url: https://git.donarmstrong.com/?p=using_make_for_science.git;a=blobdiff_plain;f=using_make_for_science.Rnw;h=12ce5e7f07bf7c07053283d722fc0a1e2b9733b5;hp=f418d490cc5469f5e1a98d21f88295d5c410ccfe;hb=bc49471caf76b388c9949075085030e7ce28badc;hpb=7bef02dcc059cc1de1390d9232da4e3868dda207 diff --git a/using_make_for_science.Rnw b/using_make_for_science.Rnw index f418d49..12ce5e7 100644 --- a/using_make_for_science.Rnw +++ b/using_make_for_science.Rnw @@ -77,7 +77,7 @@ \setbeamercolor*{fine separation line}{} \setbeamercovered{transparent} \logo{\begin{tikzpicture}% Pale figure - {\node[opacity=0.7]{\IfFileExists{./logo.pdf}{\includegraphics[height=1.5cm]{logo.pdf}}{}% + {\node[opacity=0.7]{\IfFileExists{./logo.pdf}{\includegraphics[height=1cm,width=1cm,keepaspectratio]{logo.pdf}}{}% };}% \end{tikzpicture}} } @@ -89,6 +89,8 @@ \subject{make for science} \begin{document} +\IfFileExists{./relevant_xkcd.png}{\frame[plain]{\centering \includegraphics[width=\textwidth,height=\textheight,keepaspectratio]{./relevant_xkcd.png}}} + \frame[plain]{\titlepage} \mode
{\maketitle} @@ -133,12 +135,31 @@ \item Gradle (Rake DSL) \item Leiningen (Clojure) \item Tweaker (task definitions in any language) + \item Ruffus (Pipeline library for Python) \item \href{https://en.wikipedia.org/wiki/List_of_build_automation_software}{Wikipedia List of build automation software} \end{itemize} \end{frame} +\subsection{Why use GNU make?} +\begin{frame}{Why use GNU make?} + \begin{itemize} + \item Ubiquitous -- any machine on which you can run command line tools + has GNU make available. + \item Large community -- lots of people use GNU make. It's not going + to go away tomorrow. + \item Simple rules -- all of the rules are in a simple text file + which is easily edited and version controlled + \item Reasonable debugging -- you can see the commands that make is + going to run fairly easily: \mintinline{shell}{make -n target;} + \item Parallel -- make can make targets in parallel: + \mintinline{shell}{make -j8 all;} + \item Language agnostic -- make doesn't care what language your code + is written in + \end{itemize} +\end{frame} + \section{Introduction to Makefiles} \begin{frame}[fragile]{Simple Makefile} @@ -269,9 +290,12 @@ clean: \subsection{This Presentation} \begin{frame}[fragile]{How this presentation is made} -\inputminted[showtabs]{make}{Makefile} +\inputminted[showtabs,breaklines,firstline=3]{make}{Makefile} +\end{frame} +\begin{frame}[fragile]{How this presentation is made} \begin{itemize} - \item all is the default target + \item all is the default target + \item Download the optional relevant\_xkcd.png \item Make .tex files from the knitr source. \item The third rule uses latexmk to build the pdf using \XeLaTeX.
\end{itemize} @@ -290,7 +314,7 @@ clean: \begin{frame}[fragile]{Calling records from SRA: Downloading} \begin{minted}[showtabs,breaklines]{make} SRX=SRX007165 -SRRS=SRR020291 SRR020290 SRR020288 SRR020287 SRR020289 +SRRS=SRR020291 SRR020290 NREADS=1 SRR_FILES=$(patsubst %,%.sra,$(SRRS)) @@ -298,7 +322,8 @@ get_srr: $(SRR_FILES) $(SRR_FILES): %.sra: rsync -avP "rsync://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/$(shell echo -n $*|sed 's/\(SRR[0-9][0-9][0-9]\).*/\1/')/$*/$*.sra" $@; -\end{minted} %$ +\end{minted} +%$ \begin{itemize} \item First three lines are actually generated by other code and included @@ -315,7 +340,17 @@ FASTQ_FILES:=$(patsubst %,%_1.fastq.gz,$(SRRS)) $(patsubst %,%_2.fastq.gz,$(SRR endif make_fastq: $(FASTQ_FILES) +\end{minted} +%$ +\begin{itemize} +\item Use ifeq/else/endif to handle paired reads differently from + unpaired reads +\item FASTQ\_FILES is the full set of fastq files dumped from the SRAs. +\end{itemize} +\end{frame} +\begin{frame}[fragile]{Calling records from SRA: Dumping fastq #2} +\begin{minted}[showtabs,breaklines]{make} ifeq ($(NREADS),1) $(FASTQ_FILES): %.fastq.gz: %.sra else @@ -323,10 +358,11 @@ else endif $(MODULE) load sratoolkit/2.3.5-2; \ fastq-dump --split-3 --gzip $^; -\end{minted} %$ +\end{minted} +%$ \begin{itemize} - \item Call fastq-dump to dump the fastq files \item Handles NREADS of 1 and 2 differently + \item Call fastq-dump to dump the fastq files \end{itemize} \end{frame} @@ -336,14 +372,15 @@ $(SRX)_star.bam: $(MODULE) load STAR/2.4.2a; \ mkdir -p $(SRX)_star; \ STAR --outFileNamePrefix $(SRX)_star/ \ - --outSAMtype BAM SortedByCoordinate \ - --runThreadN $(CORES) \ - --outSAMstrandField intronMotif \ - --genomeDir $(STAR_INDEX_DIR) \ - --readFilesCommand "gzip -dc" \ - --readFilesIn $(TOPHAT_FASTQ_ARGUMENT); - ln $(SRX)_star/Aligned.sortedByCoord.out.bam $@ -s -\end{minted} %$ + --outSAMtype BAM SortedByCoordinate \ + --runThreadN $(CORES) \ + --outSAMstrandField intronMotif \ + 
--genomeDir $(STAR_INDEX_DIR) \ + --readFilesCommand "gzip -dc" \ + --readFilesIn $(TOPHAT_FASTQ_ARGUMENT); + ln $(SRX)_star/Aligned.*.bam $@ -s +\end{minted} +%$ \begin{itemize} \item Call STAR with lots of options to do the alignment \end{itemize} @@ -359,31 +396,84 @@ $(SRX)_genes.fpkm_tracking: $(SRX)_star.bam $(BOWTIE_INDEX_DIR)$(GTF) for file in genes.fpkm_tracking isoforms.fpkm_tracking skipped.gtf transcripts.gtf; do \ mv $${file} $(SRX)_$${file}; \ done; -\end{minted} %$ +\end{minted} +%$ \begin{itemize} \item Use cufflinks to call \end{itemize} \end{frame} \begin{frame}[fragile]{Run it on biocluster} - \begin{minted}[shell] -for a in SRX*/Makefile; do - (cd $(dirname $a); - MAKE_TARGET=call qsub -q default -v MAKE_TARGET -S /bin/bash \ - -d "$(pwd)" -l "nodes=1:ppn=8,mem=12G" \ - ~donarm/uiuc_igb_scripts/run_make - ); -done; -\end{minted} %$ + \begin{minted}{shell} +~donarm/uiuc_igb_scripts/dqsub --mem 70G \ + --ppn 8 make call; +\end{minted} +\begin{itemize} +\item dqsub is my own qsub wrapper which avoids me having to write + little scripts for everything +\item \url{http://git.donarmstrong.com/?p=uiuc_igb_scripts.git;a=blob;f=dqsub} +\end{itemize} \end{frame} \section{Why not make?} +\begin{frame}{Why not make?} + \begin{itemize} + \item Timestamps, not MD5sums + \item Complicated workflows + \item Interaction of rules can be complicated to understand + \item Yet Another Language + \end{itemize} +\end{frame} + \subsection{Timestamps} +\begin{frame}[fragile]{Dealing with timestamps} + \begin{minted}[showtabs,breaklines]{make} +TARGET: PREREQ1 PREREQ2 + if [ -e $@.tgt.md5sum ] && [ -e $@ ] \ + && md5sum --status --check \ + $@.tgt.md5sum; then \ + touch $@; \ + else \ + RECIPE FOR $@; \ + md5sum $^ > $@.tgt.md5sum; \ + fi; +\end{minted} +% $ +\begin{itemize} +\item Make builds things on the basis of timestamps +\item But what if the contents haven't changed and it's expensive to + rebuild? +\item Use md5sum!
+\end{itemize} +\end{frame} + \subsection{Complicated Workflows} +\begin{frame}[fragile]{What about complicated workflows?} + \begin{itemize} + \item If your workflow is really complicated, what then? + \item Use some other language to write your workflow in + \item Use a simple makefile which just runs the workflow + \end{itemize} + \begin{minted}[showtabs,breaklines]{make} +complicated_workflow_done: req1 req2 req3 + ./complicated_workflow.sh $^; + touch $@; +\end{minted} +\end{frame} + \section{Further Resources} +\begin{frame}{Further Resources} + \begin{itemize} + \item GNU Make Manual: \url{https://www.gnu.org/software/make/manual/} + \item Mailing lists: \url{http://www.gnu.org/software/make/} + \item Stack Overflow: \url{http://stackoverflow.com/questions/tagged/make} + \item Myself: \href{mailto:don@donarmstrong.com}{don@donarmstrong.com} + \item This presentation: \url{http://git.donarmstrong.com/using\_make\_for\_science.git} + \end{itemize} +\end{frame} \end{document}