import os
import pdfplumber
from pdfminer.pdfparser import PDFSyntaxError
import re
import json
from pathlib import Path
from datetime import datetime, timedelta
- def current_date_test(base_dir: str):
- today = datetime.now()
- yesterday = today - timedelta(days=1)
- current_date = [
- today.strftime("%d.%m.%Y"),
- today.strftime("%d/%m/%Y"),
- yesterday.strftime("%d.%m.%Y"),
- yesterday.strftime("%d/%m/%Y"),
- ]
- errors = {"empty": [], "outdated": []}
- files = [f for f in Path(base_dir).glob("*.pdf")]
- files_count = len(files)
- for i, f in enumerate(files):
- print(f"({i}/{files_count}) {f.name} ", end="\r")
- with pdfplumber.open(str(f)) as pdf:
- text = pdf.pages[0].extract_text()
- if re.search(r"\d+ von \d+$", text):
- errors["empty"].append(f.name)
- report_date = re.search(r"\d{2}[\./]\d{2}[\./]\d{4}", text.split("\n")[0])
- if report_date is not None and report_date.group() not in current_date:
- errors["outdated"].append([f.name, report_date.group()])
- print()
- print(json.dumps(errors, indent=2))
- def missing_data(base_dir: str):
- for f in Path(base_dir).iterdir():
- if f.is_dir() and f.name not in [".", "..", "leer"]:
- missing_data(str(f))
- print(base_dir)
- os.makedirs(base_dir + "/leer", exist_ok=True)
- errors = []
- for f in Path(base_dir).glob("*.pdf"):
- try:
- with pdfplumber.open(str(f)) as pdf:
- pages = len(pdf.pages)
- text = pdf.pages[0].extract_text()
- except PDFSyntaxError:
- pages = 0
- text = ""
- # print(text)
- if pages < 2 and any(
- [
- re.search(r"Daten nicht verf", text),
- text.endswith("unprod.abw."),
- (len(text) < 650 and text.count("\n") < 8),
- ]
- ):
- errors.append(f.name)
- target = Path(base_dir + "/leer/" + f.name)
- target.unlink(missing_ok=True)
- f.rename(target)
- return errors
- if __name__ == "__main__":
- missing_data("C:\\GlobalCube\\ReportOutput")