)
return name
def pdf_file_path(self, name: str) -> str:
    """Return the dated sub-directory a scanned PDF should be filed under.

    The name is expected to look like
    ``<scanner>_<MMDDYYYY>_<time>_<counter>.pdf``.

    Args:
        name: File name (not a full path) of the scanned PDF.

    Returns:
        ``"<year>/<month>_<day>"`` built from the date embedded in the
        name, or an empty string when the name does not follow the
        pattern.
    """
    # fullmatch: the whole name has to follow the pattern, so look-alikes
    # with trailing text (e.g. "....pdf.part") are not treated as dated.
    found = re.fullmatch(
        r"(?P<scanner>[^_]+)_(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{4})_"
        r"(?P<time>\d+)_(?P<counter>\d+)\.pdf",
        str(name),
    )
    if found is None:
        return ""
    return f"{found['year']}/{found['month']}_{found['day']}"
+
def process_pdf(self, pdf_file: Union[Path, str]):
    """OCR a single PDF and file the result under the output directory.

    The input is renamed into ``self.process_dir`` (under the name given
    by ``self.calculate_name``) and run through ``ocrmypdf`` with
    ``self.ocrmypdf_opts``. The result is written below
    ``self.output_dir``, in the sub-directory that ``self.pdf_file_path``
    returns for the original file name. On success the copy in the
    processing directory is deleted; on a non-zero exit status an error
    is logged and that copy is left in place.

    Args:
        pdf_file: Path of the PDF to process.
    """
    orig_pdf = Path(pdf_file)
    # The sub-directory is derived from the original file name, before the
    # file is renamed (calculate_name() may return a different name).
    output_path = self.pdf_file_path(orig_pdf.name)
    # move to the processing directory
    pdf_file = orig_pdf.rename(
        self.process_dir / self.calculate_name(orig_pdf.name)
    )
    target_dir = self.output_dir / output_path
    target_dir.mkdir(parents=True, exist_ok=True)
    output_file = target_dir / pdf_file.name
    # Capture the tool's output; without this res.stdout / res.stderr are
    # always None and the error message below carries no diagnostics.
    # errors="replace" keeps undecodable bytes from raising while logging.
    res = subprocess.run(
        ["ocrmypdf", *self.ocrmypdf_opts, pdf_file, output_file],
        capture_output=True,
        text=True,
        errors="replace",
    )
    if res.returncode != 0:
        error(f"Unable to properly OCR pdf: {res.stdout} {res.stderr}")
        return
    pdf_file.unlink()
    info(f"Processed {orig_pdf} into {output_file}")
def event_loop(self):
"""Main event loop; called from the command line."""