Check that the PDF is valid before processing it

[scanner_workflow.git] / scanner_workflow.py
diff --git a/scanner_workflow.py b/scanner_workflow.py

index a65decc22b204f295f5e20b530fa066a99e05b9f..64514f20e56a392c067e0a1764aff1c358323a87 100755 (executable)
--- a/scanner_workflow.py
+++ b/scanner_workflow.py
@@ -8,6 +8,8 @@ from typing import Union
  from filelock import Timeout, FileLock
  import subprocess
  from logging import error, info, debug, warning
+import re
+from time import sleep
  
  
  class ScannerWorkflowEvent(FileSystemEventHandler):
@@ -21,12 +23,14 @@ class ScannerWorkflowEvent(FileSystemEventHandler):
          if not self.scanner_workflow:
              raise Error("No scanner_workflow passed to ScannerWorkflowEvent")
  
-    def on_closed(self, event: FileSystemEvent):
+    def on_any_event(self, event: FileSystemEvent):
          if event.is_directory:
              return
          if not event.src_path.endswith(".pdf"):
              return
-        self.scanner_workflow.process_pdf(event.src_path)
+        pdf_file = Path(event.src_path)
+        if pdf_file.exists():
+            self.scanner_workflow.process_pdf(pdf_file)
  
  
  class ScannerWorkflow:
@@ -67,18 +71,63 @@ class ScannerWorkflow:
          self.process_dir.mkdir(parents=True, exist_ok=True)
          self.output_dir.mkdir(parents=True, exist_ok=True)
  
+    def calculate_name(self, name: str):
+        res = re.match(
+            r"(?P<scanner>[^_]+)_(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{4})_"
+            r"(?P<time>\d+)_(?P<counter>\d+)\.pdf",
+            str(name),
+        )
+        if res:
+            name = (
+                f"{res.group('scanner')}_"
+                f"{res.group('year')}{res.group('month')}{res.group('day')}_"
+                f"{res.group('time')}_{res.group('counter')}.pdf"
+            )
+        return name
+
+    def pdf_file_path(self, name: str):
+        res = re.match(
+            r"(?P<scanner>[^_]+)_(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{4})_"
+            r"(?P<time>\d+)_(?P<counter>\d+)\.pdf",
+            str(name),
+        )
+        if res:
+            return f"{res.group('year')}/{res.group('month')}_{res.group('day')}"
+        return ""
+
      def process_pdf(self, pdf_file: Union[Path, str]):
          """Process a single PDF."""
          pdf_file = Path(pdf_file)
+        orig_pdf = pdf_file
+        # check that the pdf is good, otherwise wait to see if it
+        # might become good
+        pdf_good = False
+        for i in range(1, 10):
+            check = subprocess.run(["qpdf", "--check", pdf_file])
+            if check.returncode == 0:
+                pdf_good = True
+                break
+            # sleep for 10 seconds if the PDF was bad
+            sleep(10)
+        if not pdf_good:
+            error(f"PDF was not good, skipping {orig_pdf} for now")
+            return
+
          # move to the processing directory
-        pdf_file = pdf_file.rename(self.process_dir / pdf_file.name)
-        res = subprocess.run(
-            ["ocrmypdf", *self.ocrmypdf_opts, pdf_file, self.output_dir / pdf_file.name]
+        output_path = self.pdf_file_path(pdf_file.name)
+        pdf_file = pdf_file.rename(
+            self.process_dir / self.calculate_name(pdf_file.name)
          )
+        (self.output_dir / output_path).mkdir(parents=True, exist_ok=True)
+        output_file = self.output_dir / output_path / pdf_file.name
+        res = subprocess.run(["ocrmypdf", *self.ocrmypdf_opts, pdf_file, output_file])
          if res.returncode != 0:
-            error(f"Unable to properly OCR pdf: {res.stdout} {res.stderr}")
+            error(
+                f"Unable to properly OCR pdf {orig_pdf} into {output_file}: {res.stdout} {res.stderr}"
+            )
              return
          pdf_file.unlink()
+        info(f"Processed {orig_pdf} into {output_file}")
  
      def event_loop(self):
          """Main event loop; called from the command line."""