--- /dev/null
+#!/usr/bin/env python3
+
+import click
+from watchdog.observers import Observer
+from watchdog.events import FileSystemEventHandler, FileSystemEvent
+from pathlib import Path
+from typing import Union
+from filelock import Timeout, FileLock
+import subprocess
+from logging import error, info, debug, warning
+
+
class ScannerWorkflowEvent(FileSystemEventHandler):
    """Watchdog event handler that forwards closed PDF files to a workflow.

    Only ``on_closed`` is overridden, so a file is handed to the workflow
    when a "closed" event is reported for it rather than on creation or
    modification.

    NOTE(review): watchdog appears to emit "closed" events only on some
    backends (inotify on Linux) -- confirm for the target platform.
    """

    # Class-level default; always replaced per instance in __init__.
    scanner_workflow = None

    def __init__(self, scanner_workflow=None):
        """Remember the workflow that will process incoming PDFs.

        Args:
            scanner_workflow: Object exposing ``process_pdf(path)``.

        Raises:
            ValueError: If ``scanner_workflow`` is missing or falsy.
        """
        super().__init__()
        self.scanner_workflow = scanner_workflow
        if not self.scanner_workflow:
            # Previously raised the undefined name ``Error``, which surfaced
            # as a confusing NameError instead of this message.
            raise ValueError("No scanner_workflow passed to ScannerWorkflowEvent")

    def on_closed(self, event: FileSystemEvent):
        """Pass a closed ``.pdf`` file to the workflow.

        Directory events and paths not ending in ``.pdf`` are ignored.
        """
        if event.is_directory:
            return
        if not event.src_path.endswith(".pdf"):
            return
        self.scanner_workflow.process_pdf(event.src_path)
+
+
class ScannerWorkflow:
    """Watch an input directory and OCR every PDF that lands in it.

    Each PDF is moved from ``input_dir`` to ``process_dir``, run through the
    external ``ocrmypdf`` command with the result written to ``output_dir``,
    and then either deleted (success) or moved to ``failure_dir`` (failure).
    """

    # Class-level defaults; all of these are set per instance in __init__.
    base_dir = None
    failure_dir = None
    output_dir = None
    lock_file = None
    input_dir = None
    process_dir = None
    # Extra command-line options passed to every ocrmypdf invocation.
    ocrmypdf_opts = ["-r", "-q", "--deskew", "--clean"]

    def __init__(
        self,
        base_dir: Union[Path, str] = ".",
        input_dir: Union[Path, str] = "input",
        output_dir: Union[Path, str] = "output",
        failure_dir: Union[Path, str] = "failure",
        process_dir: Union[Path, str] = "process",
        lock_file: Union[Path, str] = ".lock",
    ):
        """Resolve all paths, create the directories and build the lock.

        Args:
            base_dir: Directory that relative paths are resolved against.
            input_dir: Where incoming PDFs are picked up.
            output_dir: Where OCRed PDFs are written.
            failure_dir: Where PDFs that could not be OCRed are moved.
            process_dir: Holding area for the PDF currently being OCRed.
            lock_file: Path of the file used by ``FileLock``.

        Every path other than ``base_dir`` is used as-is when absolute and
        joined onto ``base_dir`` otherwise.
        """

        def concat_if_not_abs(dir1: Path, dir2: Path):
            """Return ``dir2`` if absolute, else ``dir1 / dir2``."""
            if dir2.is_absolute():
                return dir2
            return dir1 / dir2

        super().__init__()
        self.base_dir = Path(base_dir)
        self.input_dir = concat_if_not_abs(self.base_dir, Path(input_dir))
        self.output_dir = concat_if_not_abs(self.base_dir, Path(output_dir))
        self.failure_dir = concat_if_not_abs(self.base_dir, Path(failure_dir))
        self.process_dir = concat_if_not_abs(self.base_dir, Path(process_dir))
        self.lock_file = concat_if_not_abs(self.base_dir, Path(lock_file))
        self.lock = FileLock(self.lock_file)
        for directory in (
            self.base_dir,
            self.input_dir,
            self.failure_dir,
            self.process_dir,
            self.output_dir,
        ):
            directory.mkdir(parents=True, exist_ok=True)

    def _move_to_failure(self, pdf_file: Path):
        """Move a PDF that could not be OCRed into ``failure_dir``."""
        try:
            pdf_file.rename(self.failure_dir / pdf_file.name)
        except OSError as exc:
            # Best effort: the file then simply stays where it is.
            error(f"Unable to move {pdf_file} to {self.failure_dir}: {exc}")

    def process_pdf(self, pdf_file: Union[Path, str]):
        """Process a single PDF.

        The file is moved into ``process_dir`` and OCRed into ``output_dir``
        under the same file name. On success the moved copy is deleted; on
        failure it is moved to ``failure_dir`` and the error is logged.

        Args:
            pdf_file: Path of the PDF to process.

        Returns:
            None. Failures are logged rather than raised.
        """
        pdf_file = Path(pdf_file)
        # move to the processing directory
        try:
            pdf_file = pdf_file.rename(self.process_dir / pdf_file.name)
        except FileNotFoundError:
            # The startup scan and the observer can both see the same file;
            # whoever loses the race finds it already moved away.
            warning(f"Skipping {pdf_file}: file no longer exists")
            return
        try:
            res = subprocess.run(
                [
                    "ocrmypdf",
                    *self.ocrmypdf_opts,
                    pdf_file,
                    self.output_dir / pdf_file.name,
                ],
                # Capture output so the failure log below has real content
                # (without this, stdout/stderr on the result are None).
                capture_output=True,
                text=True,
                errors="replace",
            )
        except OSError as exc:
            # e.g. the ocrmypdf executable is not installed / not on PATH.
            error(f"Unable to run ocrmypdf: {exc}")
            self._move_to_failure(pdf_file)
            return
        if res.returncode != 0:
            error(f"Unable to properly OCR pdf: {res.stdout} {res.stderr}")
            self._move_to_failure(pdf_file)
            return
        pdf_file.unlink()

    def event_loop(self):
        """Main event loop; called from the command line.

        Starts a recursive watchdog observer on ``input_dir``, processes the
        PDFs already present there, then blocks until the observer thread
        dies. The observer is always stopped and joined on the way out.
        """
        ev = ScannerWorkflowEvent(scanner_workflow=self)
        observer = Observer()
        # str() keeps this working with watchdog versions that expect a
        # string path rather than a Path object.
        observer.schedule(ev, str(self.input_dir), recursive=True)
        observer.start()
        try:
            # process any PDFs in input_dir; take a snapshot first because
            # process_pdf() moves entries out of the directory being listed.
            for file in list(self.input_dir.iterdir()):
                # Same filter as the event handler: regular .pdf files only.
                if file.is_file() and file.name.endswith(".pdf"):
                    self.process_pdf(file)
            while observer.is_alive():
                observer.join(1)
        finally:
            observer.stop()
            observer.join()
+
+
@click.command()
@click.option(
    "-i",
    "--input-dir",
    default="input",
    help="Directory to look for incoming PDFs",
)
@click.option(
    "-p",
    "--process-dir",
    default="process",
    help="Directory to store PDFs being processed",
)
@click.option(
    "-o",
    "--output-dir",
    default="output",
    help="Directory to output OCRed PDFs",
)
@click.option(
    "-f",
    "--failure-dir",
    default="failure",
    help="Directory to store failed PDFs",
)
@click.option(
    "-b",
    "--base-dir",
    default=".",
    help="Base directory",
)
@click.option(
    "-l",
    "--lock-file",
    default=".lock",
    help="Lock file to ensure only one instance is running",
)
def cli(input_dir, process_dir, output_dir, failure_dir, base_dir, lock_file):
    """OCR scanner output and save in directory"""
    # The docstring above doubles as the click --help text, so it is kept
    # short; details live in these comments instead.
    #
    # Builds a ScannerWorkflow from the options, takes its file lock (waiting
    # up to 10 seconds) and runs the event loop while holding the lock.
    # Exits with status 1 if the lock cannot be acquired in time.
    sw = ScannerWorkflow(
        input_dir=input_dir,
        process_dir=process_dir,
        output_dir=output_dir,
        failure_dir=failure_dir,
        base_dir=base_dir,
        lock_file=lock_file,
    )
    try:
        with sw.lock.acquire(timeout=10):
            sw.event_loop()
    except Timeout:
        # Report on stderr, and exit via SystemExit rather than the
        # interactive-only ``exit`` helper injected by the ``site`` module.
        click.echo("Another instance holds the lock", err=True)
        raise SystemExit(1) from None
+
+
# Run the command only when executed as a script, so that importing this
# module (tests, console-script entry points) does not start the watcher.
if __name__ == "__main__":
    cli()