git.donarmstrong.com Git - scanner_workflow.git/blobdiff - scanner_workflow.py
just use the year for the path
[scanner_workflow.git] / scanner_workflow.py
index be2b9db2ba7170a00896fcc6751b6d32d9cec8c1..1321b132380c752f0624dd9d71d6d756918b58a3 100755 (executable)
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
 
+from __future__ import annotations
+
 import click
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler, FileSystemEvent
@@ -9,18 +11,19 @@ from filelock import Timeout, FileLock
 import subprocess
 from logging import error, info, debug, warning
 import re
+from time import sleep
 
 
 class ScannerWorkflowEvent(FileSystemEventHandler):
     """Subclass of FileSystemEventHandler to handle OCRing PDFs"""
 
-    scanner_workflow = None
+    scanner_workflow: ScannerWorkflow
 
-    def __init__(self, scanner_workflow=None):
+    def __init__(self, scanner_workflow: ScannerWorkflow):
         super().__init__()
         self.scanner_workflow = scanner_workflow
         if not self.scanner_workflow:
-            raise Error("No scanner_workflow passed to ScannerWorkflowEvent")
+            raise Exception("No scanner_workflow passed to ScannerWorkflowEvent")
 
     def on_any_event(self, event: FileSystemEvent):
         if event.is_directory:
@@ -84,21 +87,57 @@ class ScannerWorkflow:
             )
         return name
 
+    def pdf_file_path(self, name: str):
+        res = re.match(
+            r"(?P<scanner>[^_]+)_(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{4})_"
+            r"(?P<time>\d+)_(?P<counter>\d+)\.pdf",
+            str(name),
+        )
+        if res:
+            return f"{res.group('year')}"
+        return ""
+
     def process_pdf(self, pdf_file: Union[Path, str]):
         """Process a single PDF."""
         pdf_file = Path(pdf_file)
         orig_pdf = pdf_file
+        # check that the pdf is good, otherwise wait to see if it
+        # might become good
+        pdf_good = False
+        for i in range(1, 10):
+            check = subprocess.run(["qpdf", "--check", pdf_file])
+            if check.returncode == 0:
+                pdf_good = True
+                break
+            file_size = pdf_file.stat().st_size
+            # sleep in a loop for 10 seconds if the file size is still
+            # increasing
+            while True:
+                sleep(10)
+                new_size = pdf_file.stat().st_size
+                if new_size > file_size:
+                    file_size = new_size
+                else:
+                    break
+        if not pdf_good:
+            error(f"PDF was not good, skipping {orig_pdf} for now")
+            return
+
         # move to the processing directory
+        output_path = self.pdf_file_path(pdf_file.name)
         pdf_file = pdf_file.rename(
             self.process_dir / self.calculate_name(pdf_file.name)
         )
-        output_file = self.output_dir / pdf_file.name
+        (self.output_dir / output_path).mkdir(parents=True, exist_ok=True)
+        output_file = self.output_dir / output_path / pdf_file.name
         res = subprocess.run(["ocrmypdf", *self.ocrmypdf_opts, pdf_file, output_file])
         if res.returncode != 0:
-            error(f"Unable to properly OCR pdf: {res.stdout} {res.stderr}")
+            error(
+                f"Unable to properly OCR pdf {orig_pdf} into {output_file}: {res.stdout} {res.stderr}"
+            )
             return
         pdf_file.unlink()
-        info("Processed {orig_pdf} into {output_file}")
+        info(f"Processed {orig_pdf} into {output_file}")
 
     def event_loop(self):
         """Main event loop; called from the command line."""