git.donarmstrong.com Git - bin.git/commitdiff
add make_ocr_pdf command
author Don Armstrong <don@donarmstrong.com>
Thu, 7 Jun 2018 20:33:08 +0000 (13:33 -0700)
committer Don Armstrong <don@donarmstrong.com>
Thu, 7 Jun 2018 20:33:08 +0000 (13:33 -0700)
make_ocr_pdf [new file with mode: 0755]

diff --git a/make_ocr_pdf b/make_ocr_pdf
new file mode 100755 (executable)
index 0000000..753d69f
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+# Copyright 2018 by Don Armstrong <don@donarmstrong.com>
+# Licensed under the terms of the GPL version 3 or any later version at your option.
+
+
+import argparse
+import subprocess
+import re
+
+parser = argparse.ArgumentParser(description="Make a PDF from TIFF files with OCR")
+
+parser.add_argument('--output','-f',default='output.pdf')  # destination PDF name; defaults to output.pdf
+parser.add_argument('tiff',nargs='+')  # one or more input TIFF files, passed to e2mtiff in the order given
+args = parser.parse_args()
+# Combine the input TIFF files into one multi-image TIFF named temp.tiff in the current directory. NOTE(review): the fixed name is used regardless of any existing file and temp.tiff is not removed afterwards.
+subprocess.run(['e2mtiff','--output','temp.tiff']+args.tiff,check=True)  # check=True raises CalledProcessError on a non-zero exit, so tesseract never runs on a failed merge
+# tesseract appends ".pdf" to the output base name itself, so strip a trailing ".pdf" from --output before handing it over.
+output_name = re.sub(r"\.pdf$","",args.output)
+subprocess.run(['tesseract','temp.tiff',output_name,'pdf'],check=True)  # OCR temp.tiff; the result lands in <output_name>.pdf
+