]> git.donarmstrong.com Git - scanner_workflow.git/blob - scanner_workflow.py
add first draft of scanner workflow that calls ocrmypdf
[scanner_workflow.git] / scanner_workflow.py
1 #!/usr/bin/env python3
2
3 import click
4 from watchdog.observers import Observer
5 from watchdog.events import FileSystemEventHandler, FileSystemEvent
6 from pathlib import Path
7 from typing import Union
8 from filelock import Timeout, FileLock
9 import subprocess
10 from logging import error, info, debug, warning
11
12
class ScannerWorkflowEvent(FileSystemEventHandler):
    """Filesystem event handler that hands closed PDF files to a workflow.

    Each "closed" event for a non-directory path ending in ``.pdf`` is
    forwarded to ``process_pdf`` on the workflow supplied at construction.
    """

    scanner_workflow = None

    def __init__(self, scanner_workflow=None):
        """Store the workflow whose ``process_pdf`` will receive PDF paths.

        Args:
            scanner_workflow: Object exposing ``process_pdf(path)``.

        Raises:
            ValueError: If no (or a falsy) ``scanner_workflow`` is given.
        """
        super().__init__()
        if not scanner_workflow:
            # The original raised the undefined name ``Error`` here, which
            # surfaced as a confusing NameError instead of a real message.
            raise ValueError("No scanner_workflow passed to ScannerWorkflowEvent")
        self.scanner_workflow = scanner_workflow

    def on_closed(self, event: FileSystemEvent):
        """Forward a closed ``.pdf`` file to the workflow; ignore the rest."""
        if event.is_directory:
            return
        if not event.src_path.endswith(".pdf"):
            return
        self.scanner_workflow.process_pdf(event.src_path)
30
31
class ScannerWorkflow:
    """Move PDFs from an input directory through ``ocrmypdf``.

    A PDF is first moved into ``process_dir`` and ``ocrmypdf`` is run on it,
    writing to ``output_dir`` under the same file name.  On success the
    working copy is deleted; on failure it is moved to ``failure_dir``.
    """

    base_dir = None
    failure_dir = None
    output_dir = None
    lock_file = None
    input_dir = None
    process_dir = None
    # Options placed before the input/output arguments of every ocrmypdf call.
    ocrmypdf_opts = ["-r", "-q", "--deskew", "--clean"]

    def __init__(
        self,
        base_dir: Union[Path, str] = ".",
        input_dir: Union[Path, str] = "input",
        output_dir: Union[Path, str] = "output",
        failure_dir: Union[Path, str] = "failure",
        process_dir: Union[Path, str] = "process",
        lock_file: Union[Path, str] = ".lock",
    ):
        """Resolve all paths against ``base_dir`` and create the directories.

        Args:
            base_dir: Directory that relative paths below are joined onto.
            input_dir: Directory watched for incoming PDFs.
            output_dir: Directory receiving the OCRed PDFs.
            failure_dir: Directory receiving PDFs that could not be OCRed.
            process_dir: Directory holding PDFs while they are processed.
            lock_file: Path of the file used to build ``self.lock``.

        Absolute paths are used as given; relative ones are placed under
        ``base_dir``.
        """

        def concat_if_not_abs(dir1: Path, dir2: Path):
            # Absolute paths win; relative ones live under dir1.
            if dir2.is_absolute():
                return dir2
            return dir1 / dir2

        super().__init__()
        self.base_dir = Path(base_dir)
        self.input_dir = concat_if_not_abs(self.base_dir, Path(input_dir))
        self.output_dir = concat_if_not_abs(self.base_dir, Path(output_dir))
        self.failure_dir = concat_if_not_abs(self.base_dir, Path(failure_dir))
        self.process_dir = concat_if_not_abs(self.base_dir, Path(process_dir))
        self.lock_file = concat_if_not_abs(self.base_dir, Path(lock_file))
        self.lock = FileLock(self.lock_file)
        for directory in (
            self.base_dir,
            self.input_dir,
            self.failure_dir,
            self.process_dir,
            self.output_dir,
        ):
            directory.mkdir(parents=True, exist_ok=True)

    def _move_to_failure_dir(self, pdf_file: Path):
        """Move a PDF that could not be OCRed into ``failure_dir``."""
        try:
            pdf_file.rename(self.failure_dir / pdf_file.name)
        except OSError as exc:
            # Never raise from here: this may run inside a watcher callback.
            error(f"Unable to move {pdf_file} to {self.failure_dir}: {exc}")

    def process_pdf(self, pdf_file: Union[Path, str]):
        """Process a single PDF.

        Args:
            pdf_file: Path of the PDF to OCR.

        Returns:
            None.  Failures are logged rather than raised: a PDF that no
            longer exists is skipped, and a PDF that ``ocrmypdf`` cannot
            handle (or that cannot be handed to ``ocrmypdf`` at all) is moved
            to ``failure_dir``.
        """
        source = Path(pdf_file)
        # move to the processing directory
        try:
            working = source.rename(self.process_dir / source.name)
        except FileNotFoundError:
            # The start-up scan and the watcher can both report the same
            # file; whoever comes second finds it already moved away.
            warning(f"Skipping {source}: file no longer exists")
            return
        command = [
            "ocrmypdf",
            *self.ocrmypdf_opts,
            str(working),
            str(self.output_dir / working.name),
        ]
        try:
            # Capture output so the failure log below has real content
            # instead of "None None".
            res = subprocess.run(
                command, capture_output=True, text=True, errors="replace"
            )
        except OSError as exc:
            # e.g. ocrmypdf is not installed or not executable.
            error(f"Unable to run ocrmypdf on {working}: {exc}")
            self._move_to_failure_dir(working)
            return
        if res.returncode != 0:
            error(f"Unable to properly OCR pdf: {res.stdout} {res.stderr}")
            self._move_to_failure_dir(working)
            return
        working.unlink()

    def event_loop(self):
        """Main event loop; called from the command line.

        Starts a recursive watcher on ``input_dir``, processes the PDFs that
        are already there, then blocks until the watcher thread ends.  The
        watcher is always stopped on the way out.
        """
        ev = ScannerWorkflowEvent(scanner_workflow=self)
        observer = Observer()
        observer.schedule(ev, str(self.input_dir), recursive=True)
        observer.start()
        try:
            # process any PDFs already in input_dir; the listing is
            # materialised first because process_pdf moves files away.
            pending = sorted(
                entry for entry in self.input_dir.rglob("*.pdf") if entry.is_file()
            )
            for entry in pending:
                self.process_pdf(entry)
            while observer.is_alive():
                observer.join(1)
        finally:
            observer.stop()
            observer.join()
98
99
@click.command()
@click.option(
    "-i",
    "--input-dir",
    default="input",
    help="Directory to look for incoming PDFs",
)
@click.option(
    "-p",
    "--process-dir",
    default="process",
    help="Directory to store PDFs being processed",
)
@click.option(
    "-o",
    "--output-dir",
    default="output",
    help="Directory to output OCRed PDFs",
)
@click.option(
    "-f",
    "--failure-dir",
    default="failure",
    help="Directory to store failed PDFs",
)
@click.option(
    "-b",
    "--base-dir",
    default=".",
    help="Base directory",
)
@click.option(
    "-l",
    "--lock-file",
    default=".lock",
    help="Lock file to ensure only one instance is running",
)
def cli(input_dir, process_dir, output_dir, failure_dir, base_dir, lock_file):
    """OCR scanner output and save in directory"""
    # NOTE: the docstring above doubles as the --help text, so it is kept
    # short; details live in these comments instead.
    #
    # Build the workflow from the command-line options, then run its event
    # loop while holding the lock file so only one instance is active.
    sw = ScannerWorkflow(
        input_dir=input_dir,
        process_dir=process_dir,
        output_dir=output_dir,
        failure_dir=failure_dir,
        base_dir=base_dir,
        lock_file=lock_file,
    )
    try:
        # Wait at most 10 seconds for a competing instance to release the lock.
        with sw.lock.acquire(timeout=10):
            sw.event_loop()
    except Timeout:
        # Report on stderr and exit non-zero.  SystemExit is raised directly
        # because the bare ``exit`` helper is injected by the ``site`` module
        # and is not guaranteed to exist.
        click.echo("Another instance holds the lock", err=True)
        raise SystemExit(1)
153
154
# Run the command-line interface only when executed as a script, so that
# importing this module does not start the workflow.
if __name__ == "__main__":
    cli()