pdf_test.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. import os
  2. import pdfplumber
  3. from pdfminer.pdfparser import PDFSyntaxError
  4. import re
  5. import json
  6. from pathlib import Path
  7. from datetime import datetime, timedelta
  8. def current_date_test(base_dir: str):
  9. today = datetime.now()
  10. yesterday = today - timedelta(days=1)
  11. current_date = [
  12. today.strftime("%d.%m.%Y"),
  13. today.strftime("%d/%m/%Y"),
  14. yesterday.strftime("%d.%m.%Y"),
  15. yesterday.strftime("%d/%m/%Y"),
  16. ]
  17. errors = {"empty": [], "outdated": []}
  18. files = [f for f in Path(base_dir).glob("*.pdf")]
  19. files_count = len(files)
  20. for i, f in enumerate(files):
  21. print(f"({i}/{files_count}) {f.name} ", end="\r")
  22. with pdfplumber.open(str(f)) as pdf:
  23. text = pdf.pages[0].extract_text()
  24. if re.search(r"\d+ von \d+$", text):
  25. errors["empty"].append(f.name)
  26. report_date = re.search(r"\d{2}[\./]\d{2}[\./]\d{4}", text.split("\n")[0])
  27. if report_date is not None and report_date.group() not in current_date:
  28. errors["outdated"].append([f.name, report_date.group()])
  29. print()
  30. print(json.dumps(errors, indent=2))
  31. def missing_data(base_dir: str):
  32. for f in Path(base_dir).iterdir():
  33. if f.is_dir() and f.name not in [".", "..", "leer"]:
  34. missing_data(str(f))
  35. print(base_dir)
  36. os.makedirs(base_dir + "/leer", exist_ok=True)
  37. errors = []
  38. for f in Path(base_dir).glob("*.pdf"):
  39. try:
  40. with pdfplumber.open(str(f)) as pdf:
  41. pages = len(pdf.pages)
  42. text = pdf.pages[0].extract_text()
  43. except PDFSyntaxError:
  44. pages = 0
  45. text = ""
  46. # print(text)
  47. if pages < 2 and any(
  48. [
  49. re.search(r"Daten nicht verf", text),
  50. text.endswith("unprod.abw."),
  51. (len(text) < 650 and text.count("\n") < 8),
  52. ]
  53. ):
  54. errors.append(f.name)
  55. target = Path(base_dir + "/leer/" + f.name)
  56. target.unlink(missing_ok=True)
  57. f.rename(target)
  58. return errors
  59. if __name__ == "__main__":
  60. missing_data("C:\\GlobalCube\\ReportOutput")