]> git.donarmstrong.com Git - don.git/commitdiff
Merge branch 'master' of ssh://linnode.donarmstrong.com/sites/donarmstrong/don
authorDon Armstrong <don@donarmstrong.com>
Wed, 14 Jan 2015 01:29:01 +0000 (17:29 -0800)
committerDon Armstrong <don@donarmstrong.com>
Wed, 14 Jan 2015 01:29:01 +0000 (17:29 -0800)
posts/adding_toc_to_pdfs_in_R.mdwn [new file with mode: 0644]

diff --git a/posts/adding_toc_to_pdfs_in_R.mdwn b/posts/adding_toc_to_pdfs_in_R.mdwn
new file mode 100644 (file)
index 0000000..0b67f73
--- /dev/null
@@ -0,0 +1,83 @@
+[[!meta title="Adding a Table of Contents to PDFs from R"]]
+
+I routinely generate very large PDFs from R which have hundreds (or
+thousands) of pages, and navigating these pages can be very difficult.
+Unfortunately, neither R's pdf() nor its cairopdf() driver supports
+creating a Table of Contents (or Index) while plots are being written
+out. In the case of cairo, the underlying library doesn't
+[support it either](http://osdir.com/ml/lib.cairo/2005-08/msg00506.html),
+so this isn't something that can easily be added to R directly. For
+months I had been thinking about sitting down and writing the support
+into cairo and R's cairo package... but real life kept getting in the way.
+
+Fast forward to a week ago, when I realized that `pdftk` does support
+dumping the table of contents and updating the table of contents using
+`dump_data_utf8` and `update_info_utf8`! Armed with that knowledge,
+and a bit of hackery, we can save an index, and then update the PDF
+once the graphics device has been closed.
+
+The R code then looks like the following:
+
+     # Session-wide state shared with save.bookmark():
+     #   ..device.set.up - TRUE once the Cairo onSave hook has been installed
+     #   ..current.page  - last page number reported by that hook (0 = none yet)
+     # Both are plain top-level assignments; `<<-` is only needed inside the
+     # functions that update them.
+     ..device.set.up <- FALSE
+     ..current.page <- 0
+     
+     # Append one bookmark entry to `bookmarks` and return the extended list.
+     #
+     # text      - title stored for the bookmark
+     # bookmarks - list of entries collected so far
+     # level     - bookmark level stored in the entry (defaults to 1)
+     # page      - page number stored in the entry; when NULL, one past the
+     #             page number most recently reported by the onSave hook
+     save.bookmark <- function(text,bookmarks=list(),level=1,page=NULL) {
+         # On the first call only, hook the current device so that every saved
+         # page records its number in the shared ..current.page variable.
+         if (!..device.set.up) {
+             remember.page <- function(device,page) {
+                 ..current.page <<- page
+             }
+             Cairo.onSave(device = dev.cur(), onSave = remember.page)
+             ..device.set.up <<- TRUE
+         }
+         # An omitted `page` falls back to its NULL default, so a single
+         # NULL check covers both the missing and the explicit-NULL case.
+         if (is.null(page)) {
+             page <- ..current.page + 1
+         }
+         entry <- list(text=text, level=level, page=page)
+         bookmarks[[length(bookmarks) + 1]] <- entry
+         bookmarks
+     }
+     
+     # Write the collected bookmarks into an existing PDF using pdftk.
+     #
+     # pdf.file  - path of the PDF to update; the file is replaced in place
+     # bookmarks - list of entries as built by save.bookmark(), each with
+     #             `text`, `level` and `page` elements
+     #
+     # Returns (invisibly) TRUE when the PDF was updated or there was nothing
+     # to add, and FALSE (after a warning) otherwise.
+     write.bookmarks <- function(pdf.file,bookmarks=list()) {
+         # Nothing to add. (Looping over 1:length(bookmarks) would try to read
+         # element 1 of an empty list and fail.)
+         if (length(bookmarks) == 0) {
+             return(invisible(TRUE))
+         }
+         # One pdftk info record per bookmark, concatenated in list order.
+         records <- lapply(bookmarks, function(bookmark) {
+             paste0("BookmarkBegin\n",
+                    "BookmarkTitle: ",bookmark$text,"\n",
+                    "BookmarkLevel: ",bookmark$level,"\n",
+                    "BookmarkPageNumber: ",bookmark$page,"\n")
+         })
+         pdf.bookmarks <- paste(unlist(records), collapse="")
+         temp.pdf <- tempfile(pattern=basename(pdf.file))
+         temp.pdf.info <- tempfile(pattern=paste0(basename(pdf.file),"info_utf8"))
+         # Remove the scratch files on every exit path; unlink() quietly
+         # ignores files that are already gone (e.g. after a rename).
+         on.exit(unlink(c(temp.pdf,temp.pdf.info)), add=TRUE)
+         cat(file=temp.pdf.info,pdf.bookmarks)
+         # system2() does not quote its arguments, so quote the paths to keep
+         # file names containing spaces or shell metacharacters intact.
+         system2("pdftk",
+                 c(shQuote(pdf.file),'update_info_utf8',shQuote(temp.pdf.info),
+                   'output',shQuote(temp.pdf)))
+         if (!file.exists(temp.pdf)) {
+             warning("unable to properly create bookmarks")
+             return(invisible(FALSE))
+         }
+         # file.rename() usually cannot move a file across file systems, which
+         # is common for tempdir(); fall back to copying over the original.
+         updated <- suppressWarnings(file.rename(temp.pdf,pdf.file)) ||
+             file.copy(temp.pdf,pdf.file,overwrite=TRUE)
+         if (!updated) {
+             warning("unable to properly create bookmarks")
+         }
+         invisible(updated)
+     }
+
+and can be used like so:
+
+     # Open the PDF device that the plots will be drawn on.
+     # NOTE(review): neither grDevices nor the Cairo package appears to export
+     # a function named cairopdf(); since save.bookmark() relies on
+     # Cairo.onSave(), Cairo::CairoPDF() is presumably what is meant -- confirm.
+     cairopdf(file="testing.pdf")
+     # Start with an empty bookmark list and record one bookmark before each plot.
+     bookmarks <- list()
+     bookmarks <- save.bookmark("First plot",bookmarks)
+     plot(1:5,6:10)
+     bookmarks <- save.bookmark("Second plot",bookmarks)
+     plot(6:10,1:5)
+     # Close the device before the PDF is post-processed.
+     dev.off()
+     # Rewrite testing.pdf with the collected bookmarks (runs pdftk).
+     write.bookmarks("testing.pdf",bookmarks)
+
+Et voilà: bookmarks and a table of contents for PDFs.
+
+This basic methodology can be extended to any language which writes
+PDFs and does not have a built-in method for generating a Table of
+Contents. Currently, the usage of `Cairo.onSave` is a horrible hack,
+and may conflict with anything else which uses the onSave hook, but
+hopefully R will report the current page number from Cairo in the
+future.
+
+[[!tag tech r]]