git.donarmstrong.com Git - scanner_workflow.git/commitdiff
add first draft of scanner workflow that calls ocrmypdf
author Don Armstrong <don@donarmstrong.com>
Tue, 24 Jan 2023 05:57:55 +0000 (21:57 -0800)
committer Don Armstrong <don@donarmstrong.com>
Tue, 24 Jan 2023 05:57:55 +0000 (21:57 -0800)
.gitignore [new file with mode: 0644]
requirements.txt [new file with mode: 0644]
scanner_workflow.py [new file with mode: 0755]

diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..9560b42
--- /dev/null
@@ -0,0 +1,6 @@
+/failure
+/input
+/.lock
+/process
+/venv
+/output
diff --git a/requirements.txt b/requirements.txt
new file mode 100644 (file)
index 0000000..56354a4
--- /dev/null
@@ -0,0 +1,3 @@
+click
+watchdog
+filelock
diff --git a/scanner_workflow.py b/scanner_workflow.py
new file mode 100755 (executable)
index 0000000..a65decc
--- /dev/null
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+
+import click
+from watchdog.observers import Observer
+from watchdog.events import FileSystemEventHandler, FileSystemEvent
+from pathlib import Path
+from typing import Union
+from filelock import Timeout, FileLock
+import subprocess
+from logging import error, info, debug, warning
+
+
+class ScannerWorkflowEvent(FileSystemEventHandler):
+    """Filesystem event handler that forwards closed PDF files for OCR.
+
+    Each "closed" event for a non-directory path ending in ``.pdf`` is
+    passed to the ``process_pdf`` method of the workflow object supplied
+    at construction time.
+    """
+
+    scanner_workflow = None
+
+    def __init__(self, scanner_workflow=None):
+        """Store the workflow object that will process incoming PDFs.
+
+        Args:
+            scanner_workflow: Object providing ``process_pdf(path)``.
+
+        Raises:
+            ValueError: If ``scanner_workflow`` is missing or falsy.
+        """
+        super().__init__()
+        # Validate before storing so a half-initialised handler never exists.
+        # ValueError is a builtin, so this guard cannot itself fail with a
+        # NameError the way an undefined exception class would.
+        if not scanner_workflow:
+            raise ValueError("No scanner_workflow passed to ScannerWorkflowEvent")
+        self.scanner_workflow = scanner_workflow
+
+    def on_closed(self, event: FileSystemEvent):
+        """Hand a just-closed PDF file to the workflow; ignore everything else."""
+        if event.is_directory:
+            return
+        # The suffix match is case-sensitive: only names ending in ".pdf".
+        if not event.src_path.endswith(".pdf"):
+            return
+        self.scanner_workflow.process_pdf(event.src_path)
+
+
+class ScannerWorkflow:
+    """Watch an input directory and OCR the PDFs found there with ``ocrmypdf``.
+
+    A PDF is first moved from ``input_dir`` into ``process_dir``, then
+    ``ocrmypdf`` writes its result into ``output_dir``.  On success the copy
+    in ``process_dir`` is deleted; on failure it is moved to ``failure_dir``.
+    """
+
+    base_dir = None
+    failure_dir = None
+    output_dir = None
+    lock_file = None
+    input_dir = None
+    process_dir = None
+    # Command-line options passed to every ocrmypdf invocation.
+    ocrmypdf_opts = ["-r", "-q", "--deskew", "--clean"]
+
+    def __init__(
+        self,
+        base_dir: Union[Path, str] = ".",
+        input_dir: Union[Path, str] = "input",
+        output_dir: Union[Path, str] = "output",
+        failure_dir: Union[Path, str] = "failure",
+        process_dir: Union[Path, str] = "process",
+        lock_file: Union[Path, str] = ".lock",
+    ):
+        """Resolve all paths against ``base_dir`` and create the directories.
+
+        Args:
+            base_dir: Directory that relative paths below are joined onto.
+            input_dir: Directory watched for incoming PDFs.
+            output_dir: Directory that receives the OCRed PDFs.
+            failure_dir: Directory that receives PDFs which failed to OCR.
+            process_dir: Directory holding a PDF while it is being OCRed.
+            lock_file: Path of the lock file backing ``self.lock``.
+
+        Absolute paths are used unchanged; relative ones are placed under
+        ``base_dir``.
+        """
+
+        def concat_if_not_abs(dir1: Path, dir2: Path):
+            # Keep absolute paths as given; anchor relative ones at dir1.
+            if dir2.is_absolute():
+                return dir2
+            else:
+                return dir1 / dir2
+
+        super().__init__()
+        self.base_dir = Path(base_dir)
+        self.input_dir = concat_if_not_abs(self.base_dir, Path(input_dir))
+        self.output_dir = concat_if_not_abs(self.base_dir, Path(output_dir))
+        self.failure_dir = concat_if_not_abs(self.base_dir, Path(failure_dir))
+        self.process_dir = concat_if_not_abs(self.base_dir, Path(process_dir))
+        self.lock_file = concat_if_not_abs(self.base_dir, Path(lock_file))
+        self.lock = FileLock(self.lock_file)
+        # Create every working directory up front so later moves cannot fail
+        # because a target directory is missing.
+        self.base_dir.mkdir(parents=True, exist_ok=True)
+        self.input_dir.mkdir(parents=True, exist_ok=True)
+        self.failure_dir.mkdir(parents=True, exist_ok=True)
+        self.process_dir.mkdir(parents=True, exist_ok=True)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+    def process_pdf(self, pdf_file: Union[Path, str]):
+        """Process a single PDF.
+
+        The file is moved into ``process_dir`` and OCRed into ``output_dir``
+        under the same file name.  If ``ocrmypdf`` exits non-zero, its
+        captured output is logged and the file is moved to ``failure_dir``;
+        otherwise the copy in ``process_dir`` is removed.
+        """
+        pdf_file = Path(pdf_file)
+        # move to the processing directory
+        pdf_file = pdf_file.rename(self.process_dir / pdf_file.name)
+        # Capture the output so the error message below has something to
+        # report; without capturing, res.stdout and res.stderr are None.
+        res = subprocess.run(
+            ["ocrmypdf", *self.ocrmypdf_opts, pdf_file, self.output_dir / pdf_file.name],
+            capture_output=True,
+            text=True,
+            errors="replace",
+        )
+        if res.returncode != 0:
+            error(f"Unable to properly OCR pdf: {res.stdout} {res.stderr}")
+            # Park the failed file in failure_dir instead of leaving it
+            # behind in process_dir.
+            pdf_file.rename(self.failure_dir / pdf_file.name)
+            return
+        pdf_file.unlink()
+
+    def event_loop(self):
+        """Main event loop; called from the command line.
+
+        Starts a recursive observer on ``input_dir``, processes the PDFs
+        already present at the top level of that directory, then blocks
+        until the observer thread ends.  The observer is always stopped and
+        joined on the way out.
+        """
+        ev = ScannerWorkflowEvent(scanner_workflow=self)
+        observer = Observer()
+        observer.schedule(ev, self.input_dir, recursive=True)
+        observer.start()
+        try:
+            # process any PDFs in input_dir; the listing is materialised
+            # first because process_pdf moves entries out of the directory.
+            for file in list(self.input_dir.iterdir()):
+                # Same filter as the event handler: regular ".pdf" files only.
+                if file.is_file() and file.name.endswith(".pdf"):
+                    self.process_pdf(file)
+            while observer.is_alive():
+                observer.join(1)
+        finally:
+            observer.stop()
+            observer.join()
+
+
+# Command-line entry point.  Every option defaults to the same value as the
+# corresponding ScannerWorkflow constructor argument.
+@click.command()
+@click.option(
+    "-i",
+    "--input-dir",
+    default="input",
+    help="Directory to look for incoming PDFs",
+)
+@click.option(
+    "-p",
+    "--process-dir",
+    default="process",
+    help="Directory to store PDFs being processed",
+)
+@click.option(
+    "-o",
+    "--output-dir",
+    default="output",
+    help="Directory to output OCRed PDFs",
+)
+@click.option(
+    "-f",
+    "--failure-dir",
+    default="failure",
+    help="Directory to store failed PDFs",
+)
+@click.option(
+    "-b",
+    "--base-dir",
+    default=".",
+    help="Base directory",
+)
+@click.option(
+    "-l",
+    "--lock-file",
+    default=".lock",
+    help="Lock file to ensure only one instance is running",
+)
+def cli(input_dir, process_dir, output_dir, failure_dir, base_dir, lock_file):
+    """OCR scanner output and save in directory"""
+    # The docstring above doubles as the --help text, so implementation
+    # notes are kept in comments.
+    sw = ScannerWorkflow(
+        input_dir=input_dir,
+        process_dir=process_dir,
+        output_dir=output_dir,
+        failure_dir=failure_dir,
+        base_dir=base_dir,
+        lock_file=lock_file,
+    )
+    try:
+        # Wait at most 10 seconds for the lock, then run until the event
+        # loop returns; the lock is released when the with-block exits.
+        with sw.lock.acquire(timeout=10):
+            sw.event_loop()
+    except Timeout:
+        # Report on stderr so the diagnostic is not mixed into stdout, and
+        # raise SystemExit directly: the exit() helper is injected by the
+        # site module and is not guaranteed to exist.
+        click.echo("Another instance holds the lock", err=True)
+        raise SystemExit(1)
+
+
+# Run the command-line interface only when executed as a script, so that
+# importing this module does not start the event loop.
+if __name__ == "__main__":
+    cli()