import os
import re
import json
from pathlib import Path
from datetime import datetime, timedelta

import pdfplumber
from pdfminer.pdfparser import PDFSyntaxError


def current_date_test(base_dir: str) -> None:
    """Scan all PDFs directly in *base_dir* and print quality problems as JSON.

    Two checks are run against the text of each report's first page:
      * "empty":    the extracted text ends with a page footer like "1 von 2",
                    i.e. nothing but the footer was found on the page.
      * "outdated": the date in the first text line is neither today's nor
                    yesterday's date (both "." and "/" separators accepted).

    Results are printed to stdout as an indented JSON object; a carriage-return
    progress line is shown while scanning.
    """
    today = datetime.now()
    yesterday = today - timedelta(days=1)
    # Reports use either "." or "/" as the date separator — accept both,
    # for today and for yesterday (reports generated overnight).
    current_date = [
        today.strftime("%d.%m.%Y"),
        today.strftime("%d/%m/%Y"),
        yesterday.strftime("%d.%m.%Y"),
        yesterday.strftime("%d/%m/%Y"),
    ]
    errors = {"empty": [], "outdated": []}
    files = list(Path(base_dir).glob("*.pdf"))
    files_count = len(files)
    # Start counting at 1 so the progress display can actually reach N/N.
    for i, f in enumerate(files, 1):
        print(f"({i}/{files_count}) {f.name} ", end="\r")
        with pdfplumber.open(str(f)) as pdf:
            # extract_text() returns None for pages without a text layer —
            # normalize to "" so the string/regex operations below can't crash.
            text = pdf.pages[0].extract_text() or ""
        if re.search(r"\d+ von \d+$", text):
            errors["empty"].append(f.name)
        report_date = re.search(r"\d{2}[\./]\d{2}[\./]\d{4}", text.split("\n")[0])
        if report_date is not None and report_date.group() not in current_date:
            errors["outdated"].append([f.name, report_date.group()])
    print()
    print(json.dumps(errors, indent=2))


def missing_data(base_dir: str) -> list:
    """Recursively quarantine empty/placeholder report PDFs into a "leer" folder.

    Walks *base_dir* depth-first (skipping the "leer" quarantine folder itself),
    then inspects every PDF directly in *base_dir*. A PDF is considered empty
    when it has fewer than 2 pages AND its first-page text matches one of the
    known placeholder signatures:
      * contains "Daten nicht verf" (truncated "Daten nicht verfügbar"), or
      * ends with the column header "unprod.abw.", or
      * is very short (< 650 chars and < 8 lines).
    Unparseable PDFs (PDFSyntaxError) are treated as empty as well.

    Matching files are moved into ``<base_dir>/leer``, replacing any file of
    the same name left over from a previous run.

    Returns the list of file names moved from *base_dir* itself (moves done by
    the recursive calls for subdirectories are not included).
    """
    base = Path(base_dir)
    # Recurse into subdirectories first; "leer" is our own quarantine folder.
    # ("." / ".." are never yielded by iterdir(); kept for parity with the
    # original checks.)
    for entry in base.iterdir():
        if entry.is_dir() and entry.name not in [".", "..", "leer"]:
            missing_data(str(entry))
    print(base_dir)
    leer_dir = base / "leer"
    os.makedirs(leer_dir, exist_ok=True)
    errors = []
    # Materialize the listing up front: we rename files out of this directory
    # inside the loop, and mutating a directory while lazily globbing it can
    # skip entries on some platforms.
    for f in list(base.glob("*.pdf")):
        try:
            with pdfplumber.open(str(f)) as pdf:
                pages = len(pdf.pages)
                # Guard against zero-page PDFs; extract_text() may also
                # return None for pages without a text layer.
                text = (pdf.pages[0].extract_text() or "") if pages > 0 else ""
        except PDFSyntaxError:
            # Corrupt/truncated PDF: treat it like an empty report.
            pages = 0
            text = ""
        if pages < 2 and (
            re.search(r"Daten nicht verf", text)
            or text.endswith("unprod.abw.")
            or (len(text) < 650 and text.count("\n") < 8)
        ):
            errors.append(f.name)
            target = leer_dir / f.name
            # Replace a leftover from a previous run, if any.
            target.unlink(missing_ok=True)
            f.rename(target)
    return errors


if __name__ == "__main__":
    missing_data("C:\\GlobalCube\\ReportOutput")