From 76de8c4ca74449752786d28198c37022d01514d7 Mon Sep 17 00:00:00 2001
From: Don Armstrong
Date: Mon, 23 Jan 2023 21:57:55 -0800
Subject: [PATCH] add first draft of scanner workflow that calls ocrmypdf

---
 .gitignore          |   6 ++
 requirements.txt    |   3 +
 scanner_workflow.py | 195 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 204 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 requirements.txt
 create mode 100755 scanner_workflow.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9560b42
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+/failure
+/input
+/.lock
+/process
+/venv
+/output
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..56354a4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+click
+watchdog
+filelock
diff --git a/scanner_workflow.py b/scanner_workflow.py
new file mode 100755
index 0000000..a65decc
--- /dev/null
+++ b/scanner_workflow.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+"""Watch a directory for scanned PDFs and OCR each one with ocrmypdf."""
+
+import click
+from watchdog.observers import Observer
+from watchdog.events import FileSystemEventHandler, FileSystemEvent
+from pathlib import Path
+from typing import Union
+from filelock import Timeout, FileLock
+import subprocess
+from logging import error, info, debug, warning
+
+
+class ScannerWorkflowEvent(FileSystemEventHandler):
+    """Subclass of FileSystemEventHandler to handle OCRing PDFs"""
+
+    scanner_workflow = None
+
+    def __init__(self, scanner_workflow=None):
+        """Remember the workflow that detected PDFs are handed to.
+
+        Raises:
+            ValueError: if scanner_workflow is missing or falsy.
+        """
+        super().__init__()
+        self.scanner_workflow = scanner_workflow
+        if not self.scanner_workflow:
+            raise ValueError("No scanner_workflow passed to ScannerWorkflowEvent")
+
+    def on_closed(self, event: FileSystemEvent):
+        """Hand a closed file to the workflow if its path ends in ".pdf".
+
+        Directory events are ignored.
+        """
+        if event.is_directory:
+            return
+        if not event.src_path.endswith(".pdf"):
+            return
+        self.scanner_workflow.process_pdf(event.src_path)
+
+
+class ScannerWorkflow:
+    """Moves PDFs from input_dir through process_dir and OCRs them."""
+
+    base_dir = None
+    failure_dir = None
+    output_dir = None
+    lock_file = None
+    input_dir = None
+    process_dir = None
+    ocrmypdf_opts = ["-r", "-q", "--deskew", "--clean"]
+
+    def __init__(
+        self,
+        base_dir: Union[Path, str] = ".",
+        input_dir: Union[Path, str] = "input",
+        output_dir: Union[Path, str] = "output",
+        failure_dir: Union[Path, str] = "failure",
+        process_dir: Union[Path, str] = "process",
+        lock_file: Union[Path, str] = ".lock",
+    ):
+        """Resolve the working paths and create the directories.
+
+        Each path other than base_dir is joined onto base_dir unless it is
+        already absolute.
+        """
+
+        def concat_if_not_abs(dir1: Path, dir2: Path):
+            if dir2.is_absolute():
+                return dir2
+            else:
+                return dir1 / dir2
+
+        super().__init__()
+        self.base_dir = Path(base_dir)
+        self.input_dir = concat_if_not_abs(self.base_dir, Path(input_dir))
+        self.output_dir = concat_if_not_abs(self.base_dir, Path(output_dir))
+        self.failure_dir = concat_if_not_abs(self.base_dir, Path(failure_dir))
+        self.process_dir = concat_if_not_abs(self.base_dir, Path(process_dir))
+        self.lock_file = concat_if_not_abs(self.base_dir, Path(lock_file))
+        self.lock = FileLock(self.lock_file)
+        self.base_dir.mkdir(parents=True, exist_ok=True)
+        self.input_dir.mkdir(parents=True, exist_ok=True)
+        self.failure_dir.mkdir(parents=True, exist_ok=True)
+        self.process_dir.mkdir(parents=True, exist_ok=True)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+    def process_pdf(self, pdf_file: Union[Path, str]):
+        """Process a single PDF.
+
+        The file is moved into process_dir and run through ocrmypdf, which
+        writes its result to output_dir. On success the moved copy is
+        deleted; if ocrmypdf exits non-zero it is moved to failure_dir.
+        """
+        pdf_file = Path(pdf_file)
+        try:
+            # move to the processing directory
+            pdf_file = pdf_file.rename(self.process_dir / pdf_file.name)
+        except FileNotFoundError:
+            # The startup scan and the watcher can both report the same file;
+            # whichever arrives second finds it already moved.
+            debug(f"{pdf_file} is no longer present; skipping")
+            return
+        res = subprocess.run(
+            ["ocrmypdf", *self.ocrmypdf_opts, pdf_file, self.output_dir / pdf_file.name],
+            # without capture_output, res.stdout and res.stderr are None
+            capture_output=True,
+            text=True,
+            errors="replace",
+        )
+        if res.returncode != 0:
+            error(f"Unable to properly OCR pdf: {res.stdout} {res.stderr}")
+            # keep the unprocessed scan where it can be inspected and retried
+            pdf_file.rename(self.failure_dir / pdf_file.name)
+            return
+        pdf_file.unlink()
+
+    def event_loop(self):
+        """Main event loop; called from the command line."""
+        ev = ScannerWorkflowEvent(scanner_workflow=self)
+        observer = Observer()
+        observer.schedule(ev, self.input_dir, recursive=True)
+        observer.start()
+        # process any PDFs already waiting in input_dir, applying the same
+        # filter as ScannerWorkflowEvent.on_closed
+        for file in self.input_dir.iterdir():
+            if file.is_file() and file.name.endswith(".pdf"):
+                self.process_pdf(file)
+        try:
+            while observer.is_alive():
+                observer.join(1)
+        finally:
+            observer.stop()
+            observer.join()
+
+
+@click.command()
+@click.option(
+    "-i",
+    "--input-dir",
+    default="input",
+    help="Directory to look for incoming PDFs",
+)
+@click.option(
+    "-p",
+    "--process-dir",
+    default="process",
+    help="Directory to store PDFs being processed",
+)
+@click.option(
+    "-o",
+    "--output-dir",
+    default="output",
+    help="Directory to output OCRed PDFs",
+)
+@click.option(
+    "-f",
+    "--failure-dir",
+    default="failure",
+    help="Directory to store failed PDFs",
+)
+@click.option(
+    "-b",
+    "--base-dir",
+    default=".",
+    help="Base directory",
+)
+@click.option(
+    "-l",
+    "--lock-file",
+    default=".lock",
+    help="Lock file to ensure only one instance is running",
+)
+def cli(input_dir, process_dir, output_dir, failure_dir, base_dir, lock_file):
+    """OCR scanner output and save in directory"""
+    sw = ScannerWorkflow(
+        input_dir=input_dir,
+        process_dir=process_dir,
+        output_dir=output_dir,
+        failure_dir=failure_dir,
+        base_dir=base_dir,
+        lock_file=lock_file,
+    )
+    try:
+        with sw.lock.acquire(timeout=10):
+            sw.event_loop()
+    except Timeout:
+        print("Another instance holds the lock")
+        # exit() is injected by the site module and is not always defined
+        raise SystemExit(1)
+
+
+# Only start the watcher when run as a script, not when imported.
+if __name__ == "__main__":
+    cli()
-- 
2.39.2