from filelock import Timeout, FileLock
import subprocess
from logging import error, info, debug, warning
+import re
+from time import sleep
class ScannerWorkflowEvent(FileSystemEventHandler):
if not self.scanner_workflow:
raise Error("No scanner_workflow passed to ScannerWorkflowEvent")
def on_any_event(self, event: FileSystemEvent):
    """Forward an event for an existing ``.pdf`` file to the workflow.

    Directory events and paths that do not end in ``.pdf`` are ignored.
    The path is passed to ``self.scanner_workflow.process_pdf`` only if it
    still exists on disk when the event is handled.
    """
    is_pdf_file_event = (
        not event.is_directory and event.src_path.endswith(".pdf")
    )
    if not is_pdf_file_event:
        return

    candidate = Path(event.src_path)
    # NOTE(review): presumably this guards against events whose source path
    # is already gone (e.g. deletions or moves) -- confirm against watchdog.
    if not candidate.exists():
        return
    self.scanner_workflow.process_pdf(candidate)
class ScannerWorkflow:
self.process_dir.mkdir(parents=True, exist_ok=True)
self.output_dir.mkdir(parents=True, exist_ok=True)
def calculate_name(self, name: str) -> str:
    """Return *name* with its date part reordered from MMDDYYYY to YYYYMMDD.

    Names of the form ``<scanner>_<MMDDYYYY>_<time>_<counter>.pdf`` are
    rewritten as ``<scanner>_<YYYYMMDD>_<time>_<counter>.pdf``. Any other
    name is returned unchanged.

    The whole name must match the pattern: with a prefix-only match, text
    after ``.pdf`` (for example ``.pdf.bak``) would be silently dropped
    from the rewritten name.

    The pattern is the same one used by ``pdf_file_path``; keep the two in
    sync.
    """
    res = re.fullmatch(
        r"(?P<scanner>[^_]+)_(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{4})_"
        r"(?P<time>\d+)_(?P<counter>\d+)\.pdf",
        str(name),
    )
    if res is None:
        return name
    return (
        f"{res.group('scanner')}_"
        f"{res.group('year')}{res.group('month')}{res.group('day')}_"
        f"{res.group('time')}_{res.group('counter')}.pdf"
    )
+
def pdf_file_path(self, name: str) -> str:
    """Return the relative output sub-directory for a scanned PDF name.

    For names of the form ``<scanner>_<MMDDYYYY>_<time>_<counter>.pdf`` the
    result is ``"<YYYY>/<MM>_<DD>"``. For any other name the result is an
    empty string.

    The whole name must match the pattern, so a name with extra text after
    ``.pdf`` is not treated as a dated scan.

    The pattern is the same one used by ``calculate_name``; keep the two in
    sync.
    """
    res = re.fullmatch(
        r"(?P<scanner>[^_]+)_(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{4})_"
        r"(?P<time>\d+)_(?P<counter>\d+)\.pdf",
        str(name),
    )
    if res is None:
        return ""
    return f"{res.group('year')}/{res.group('month')}_{res.group('day')}"
+
def process_pdf(
    self,
    pdf_file: Union[Path, str],
    *,
    check_attempts: int = 9,
    check_interval: float = 10,
):
    """Process a single PDF.

    Steps:
      1. Run ``qpdf --check`` on the file, retrying up to *check_attempts*
         times with *check_interval* seconds between attempts. If no check
         succeeds, log an error and return without touching the file.
      2. Move the file into ``self.process_dir`` under the name returned by
         ``self.calculate_name``.
      3. Run ``ocrmypdf`` with ``self.ocrmypdf_opts``, writing into
         ``self.output_dir / self.pdf_file_path(<original name>)``.
      4. On success, delete the moved file. On failure, log the captured
         output and leave the moved file in ``self.process_dir``.

    Args:
        pdf_file: Path of the PDF to process.
        check_attempts: Maximum number of ``qpdf --check`` runs.
        check_interval: Seconds to wait between failed checks.
    """
    pdf_file = Path(pdf_file)
    orig_pdf = pdf_file

    # Check that the PDF is good; otherwise wait to see if it becomes good.
    for attempt in range(1, check_attempts + 1):
        check = subprocess.run(["qpdf", "--check", pdf_file])
        if check.returncode == 0:
            break
        # No point sleeping after the final attempt: we give up right away.
        if attempt < check_attempts:
            sleep(check_interval)
    else:
        error(f"PDF was not good, skipping {orig_pdf} for now")
        return

    # Compute the dated output directory from the ORIGINAL name, before the
    # file is renamed on its way into the processing directory.
    output_path = self.pdf_file_path(pdf_file.name)
    pdf_file = pdf_file.rename(
        self.process_dir / self.calculate_name(pdf_file.name)
    )
    target_dir = self.output_dir / output_path
    target_dir.mkdir(parents=True, exist_ok=True)
    output_file = target_dir / pdf_file.name

    # Capture output so the failure message below carries the tool's
    # diagnostics; without capture, stdout/stderr would both be None.
    res = subprocess.run(
        ["ocrmypdf", *self.ocrmypdf_opts, pdf_file, output_file],
        capture_output=True,
        text=True,
        errors="replace",
    )
    if res.returncode != 0:
        error(
            f"Unable to properly OCR pdf {orig_pdf} into {output_file}: {res.stdout} {res.stderr}"
        )
        return
    pdf_file.unlink()
    info(f"Processed {orig_pdf} into {output_file}")
def event_loop(self):
"""Main event loop; called from the command line."""