#!/usr/bin/env python3
+from __future__ import annotations
+
import click
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler, FileSystemEvent
import subprocess
from logging import error, info, debug, warning
import re
+from time import sleep
class ScannerWorkflowEvent(FileSystemEventHandler):
"""Subclass of FileSystemEventHandler to handle OCRing PDFs"""
- scanner_workflow = None
+ scanner_workflow: ScannerWorkflow
def __init__(self, scanner_workflow: ScannerWorkflow):
    """Create an event handler bound to a scanner workflow.

    Args:
        scanner_workflow: The workflow object this handler stores on
            ``self.scanner_workflow``.

    Raises:
        ValueError: If ``scanner_workflow`` is ``None``.
    """
    super().__init__()
    # Validate before storing so a rejected handler never carries a
    # half-initialised attribute. ValueError is still an Exception, so
    # callers that catch Exception keep working.
    if scanner_workflow is None:
        raise ValueError("No scanner_workflow passed to ScannerWorkflowEvent")
    self.scanner_workflow = scanner_workflow
def on_any_event(self, event: FileSystemEvent):
if event.is_directory:
)
return name
def pdf_file_path(self, name: str) -> str:
    """Return the output subdirectory for a scanned PDF file name.

    The name is matched against
    ``<scanner>_<MM><DD><YYYY>_<time>_<counter>.pdf`` and the four-digit
    year is returned as the subdirectory.

    Args:
        name: File name (not a full path) of the PDF; converted with
            ``str()`` before matching.

    Returns:
        The year component as a string, or ``""`` when the name does not
        match the pattern.
    """
    # fullmatch (rather than match) so names with trailing junk such as
    # "….pdf.part" are not mistaken for finished scans.
    res = re.fullmatch(
        r"(?P<scanner>[^_]+)_(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{4})_"
        r"(?P<time>\d+)_(?P<counter>\d+)\.pdf",
        str(name),
    )
    return res.group("year") if res else ""
+
def process_pdf(self, pdf_file: Union[Path, str]) -> None:
    """Process a single PDF.

    Steps:

    1. Run ``qpdf --check`` on the file, up to nine times. After each
       failed check, poll the file size every 10 seconds until it stops
       growing, then check again.
    2. Move the file into ``self.process_dir`` under the name returned by
       ``self.calculate_name``.
    3. Run ``ocrmypdf`` with ``self.ocrmypdf_opts``, writing into
       ``self.output_dir`` / (subdirectory from ``self.pdf_file_path``).
    4. Delete the moved input once OCR succeeds.

    Failures are logged and the method returns without raising; after a
    failed OCR run the input is left in ``self.process_dir``.

    Args:
        pdf_file: Path of the PDF to process.
    """
    pdf_file = Path(pdf_file)
    orig_pdf = pdf_file

    # check that the pdf is good, otherwise wait to see if it
    # might become good
    pdf_good = False
    for _ in range(1, 10):
        check = subprocess.run(["qpdf", "--check", pdf_file])
        if check.returncode == 0:
            pdf_good = True
            break
        try:
            file_size = pdf_file.stat().st_size
            # sleep in a loop for 10 seconds if the file size is still
            # increasing
            while True:
                sleep(10)
                new_size = pdf_file.stat().st_size
                if new_size <= file_size:
                    break
                file_size = new_size
        except FileNotFoundError:
            # The file was removed or renamed while we were waiting;
            # there is nothing left to process.
            error(f"PDF {orig_pdf} disappeared while waiting for it to settle")
            return
    if not pdf_good:
        error(f"PDF was not good, skipping {orig_pdf} for now")
        return

    # The output subdirectory is derived from the original name, so it
    # must be computed before the file is renamed.
    output_path = self.pdf_file_path(pdf_file.name)

    # move to the processing directory
    pdf_file = pdf_file.rename(
        self.process_dir / self.calculate_name(pdf_file.name)
    )
    (self.output_dir / output_path).mkdir(parents=True, exist_ok=True)
    output_file = self.output_dir / output_path / pdf_file.name

    # Capture output so the failure message below can include it;
    # without capture_output both attributes are always None.
    res = subprocess.run(
        ["ocrmypdf", *self.ocrmypdf_opts, pdf_file, output_file],
        capture_output=True,
        text=True,
    )
    if res.returncode != 0:
        error(
            f"Unable to properly OCR pdf {orig_pdf} into {output_file}: {res.stdout} {res.stderr}"
        )
        return
    pdf_file.unlink()
    info(f"Processed {orig_pdf} into {output_file}")
def event_loop(self):
"""Main event loop; called from the command line."""