4 from watchdog.observers import Observer
5 from watchdog.events import FileSystemEventHandler, FileSystemEvent
6 from pathlib import Path
7 from typing import Union
8 from filelock import Timeout, FileLock
10 from logging import error, info, debug, warning
class ScannerWorkflowEvent(FileSystemEventHandler):
    """Watchdog event handler that hands closed PDF files to a workflow.

    Each file-closed event whose path ends in ``.pdf`` is forwarded to the
    ``process_pdf`` method of the workflow object supplied at construction.
    """

    # Workflow whose process_pdf() is invoked; set per instance in __init__.
    scanner_workflow = None

    def __init__(self, scanner_workflow=None):
        """Store the workflow that will receive the PDFs.

        Args:
            scanner_workflow: Object exposing ``process_pdf(path)``.

        Raises:
            ValueError: If no (or a falsy) workflow is supplied.
        """
        super().__init__()
        self.scanner_workflow = scanner_workflow
        if not self.scanner_workflow:
            # The previous code raised an undefined name ``Error``, which
            # surfaced as a NameError and hid the real message.
            raise ValueError("No scanner_workflow passed to ScannerWorkflowEvent")

    def on_closed(self, event: FileSystemEvent):
        """Forward a closed ``.pdf`` file to the workflow.

        Directory events and paths without a ``.pdf`` suffix are ignored.

        Args:
            event: The file-system event reported by the observer.
        """
        if event.is_directory:
            return
        if not event.src_path.endswith(".pdf"):
            return
        self.scanner_workflow.process_pdf(event.src_path)
# ScannerWorkflow: resolves the working directories, creates them, owns the
# single-instance file lock, runs ocrmypdf on individual PDFs and drives the
# directory-watching loop.
# NOTE(review): this listing omits several original lines of the class (method
# headers, branch bodies, the subprocess call); comments below describe only
# what is visible and flag the gaps.
32 class ScannerWorkflow:
# Option flags spliced into every ocrmypdf command line right after the
# program name (see the argument list in process_pdf).
39 ocrmypdf_opts = ["-r", "-q", "--deskew", "--clean"]
# Constructor parameters (the `def` header itself is not visible here).
# Each accepts a Path or a str; the defaults are relative names.
43 base_dir: Union[Path, str] = ".",
44 input_dir: Union[Path, str] = "input",
45 output_dir: Union[Path, str] = "output",
46 failure_dir: Union[Path, str] = "failure",
47 process_dir: Union[Path, str] = "process",
48 lock_file: Union[Path, str] = ".lock",
# Local helper used below to combine base_dir with each configured location.
# It branches on whether dir2 is absolute; both branch bodies are not visible
# in this excerpt — presumably an absolute dir2 is returned unchanged and a
# relative one is joined onto dir1 (TODO confirm).
50 def concat_if_not_abs(dir1: Path, dir2: Path):
51 if dir2.is_absolute():
# Normalise every location to a Path and resolve it against base_dir via the
# helper above.
57 self.base_dir = Path(base_dir)
58 self.input_dir = concat_if_not_abs(self.base_dir, Path(input_dir))
59 self.output_dir = concat_if_not_abs(self.base_dir, Path(output_dir))
60 self.failure_dir = concat_if_not_abs(self.base_dir, Path(failure_dir))
61 self.process_dir = concat_if_not_abs(self.base_dir, Path(process_dir))
62 self.lock_file = concat_if_not_abs(self.base_dir, Path(lock_file))
# File lock built on the resolved lock-file path; cli() acquires it before
# doing any work.
63 self.lock = FileLock(self.lock_file)
# Create all working directories up front. parents=True builds missing
# ancestors; exist_ok=True makes an already-existing directory a no-op.
64 self.base_dir.mkdir(parents=True, exist_ok=True)
65 self.input_dir.mkdir(parents=True, exist_ok=True)
66 self.failure_dir.mkdir(parents=True, exist_ok=True)
67 self.process_dir.mkdir(parents=True, exist_ok=True)
68 self.output_dir.mkdir(parents=True, exist_ok=True)
# process_pdf: takes the path of one file (Path or str), moves it into
# process_dir and runs ocrmypdf on it, writing to output_dir.
70 def process_pdf(self, pdf_file: Union[Path, str]):
71 """Process a single PDF."""
72 pdf_file = Path(pdf_file)
73 # move to the processing directory
# pdf_file is rebound to the result of rename(), so everything below refers
# to the file at its new location inside process_dir.
74 pdf_file = pdf_file.rename(self.process_dir / pdf_file.name)
# ocrmypdf argument list: program name, the shared option flags, the moved
# file, then a same-named path under output_dir. The call that receives this
# list (and assigns `res`) is not visible in this excerpt.
76 ["ocrmypdf", *self.ocrmypdf_opts, pdf_file, self.output_dir / pdf_file.name]
# A non-zero return code is logged as an error together with the captured
# stdout and stderr. Any further failure/success handling that follows is not
# visible here.
78 if res.returncode != 0:
79 error(f"Unable to properly OCR pdf: {res.stdout} {res.stderr}")
# Event-loop method (its `def` line is not visible here): registers the
# watchdog handler on input_dir, sweeps files already present, then loops
# while the observer is alive.
84 """Main event loop; called from the command line."""
# Handler that calls back into this workflow's process_pdf().
85 ev = ScannerWorkflowEvent(scanner_workflow=self)
# Watch input_dir including its subdirectories (recursive=True). The creation
# and start of `observer` are not visible in this excerpt.
87 observer.schedule(ev, self.input_dir, recursive=True)
89 # process any PDFs in input_dir
# NOTE(review): every entry returned by iterdir() is passed to process_pdf(),
# without the directory / ".pdf" suffix checks that the event handler's
# on_closed() applies — subdirectories and non-PDF files in input_dir would be
# moved to process_dir too. Confirm whether that is intended.
90 for file in self.input_dir.iterdir():
91 self.process_pdf(file)
# Keep going for as long as the observer is alive; the loop body is not
# visible in this excerpt.
93 while observer.is_alive():
# Help texts of the six command-line options that feed cli()'s parameters
# (input, process, output and failure directories, base directory, lock file).
# NOTE(review): the option declarations these strings belong to are only
# partly visible in this excerpt.
105 help="Directory to look for incoming PDFs",
111 help="Directory to store PDFs being processed",
117 help="Directory to output OCRed PDFs",
123 help="Directory to store failed PDFs",
129 help="Base directory",
135 help="Lock file to ensure only one instance is running",
# Command-line entry point: builds a ScannerWorkflow from the option values
# and does its work while holding the workflow's file lock.
137 def cli(input_dir, process_dir, output_dir, failure_dir, base_dir, lock_file):
138 """OCR scanner output and save in directory"""
# Each option value is forwarded under the keyword of the same name; some of
# the keyword lines of this call are not visible in this excerpt.
139 sw = ScannerWorkflow(
141 process_dir=process_dir,
142 output_dir=output_dir,
143 failure_dir=failure_dir,
# Acquire the workflow's file lock, waiting at most 10 seconds; the lock is
# held for the duration of the with-block (whose body is not visible here).
148 with sw.lock.acquire(timeout=10):
# Message shown when the lock could not be obtained — presumably printed from
# a handler for filelock's Timeout, which the file imports; the handler line
# itself is not visible here (TODO confirm).
151 print("Another instance holds the lock")