git.donarmstrong.com Git - scanner_workflow.git/commitdiff
output into a path instead of the same directory
author: Don Armstrong <don@donarmstrong.com>
Fri, 24 Feb 2023 00:27:19 +0000 (16:27 -0800)
committer: Don Armstrong <don@donarmstrong.com>
Fri, 24 Feb 2023 00:27:19 +0000 (16:27 -0800)
scanner_workflow.py

index be2b9db2ba7170a00896fcc6751b6d32d9cec8c1..e803f9ee1fa4da4e6f5df62922e0328df6d72011 100755 (executable)
@@ -84,21 +84,33 @@ class ScannerWorkflow:
             )
         return name
 
+    def pdf_file_path(self, name: str):
+        res = re.match(
+            r"(?P<scanner>[^_]+)_(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{4})_"
+            r"(?P<time>\d+)_(?P<counter>\d+)\.pdf",
+            str(name),
+        )
+        if res:
+            return f"{res.group('year')}/{res.group('month')}_{res.group('day')}"
+        return ""
+
     def process_pdf(self, pdf_file: Union[Path, str]):
         """Process a single PDF."""
         pdf_file = Path(pdf_file)
         orig_pdf = pdf_file
         # move to the processing directory
+        output_path = self.pdf_file_path(pdf_file.name)
         pdf_file = pdf_file.rename(
             self.process_dir / self.calculate_name(pdf_file.name)
         )
-        output_file = self.output_dir / pdf_file.name
+        (self.output_dir / output_path).mkdir(parents=True, exist_ok=True)
+        output_file = self.output_dir / output_path / pdf_file.name
         res = subprocess.run(["ocrmypdf", *self.ocrmypdf_opts, pdf_file, output_file])
         if res.returncode != 0:
             error(f"Unable to properly OCR pdf: {res.stdout} {res.stderr}")
             return
         pdf_file.unlink()
-        info("Processed {orig_pdf} into {output_file}")
+        info(f"Processed {orig_pdf} into {output_file}")
 
     def event_loop(self):
         """Main event loop; called from the command line."""