# pdf_test.py
  1. import json
  2. import os
  3. import re
  4. from datetime import datetime, timedelta
  5. from pathlib import Path
  6. import pdfplumber
  7. from pdfminer.pdfparser import PDFSyntaxError
  8. from pdfplumber.utils.exceptions import PdfminerException
  9. def current_date_test(base_dir: str):
  10. today = datetime.now()
  11. yesterday = today - timedelta(days=1)
  12. current_date = [
  13. today.strftime("%d.%m.%Y"),
  14. today.strftime("%d/%m/%Y"),
  15. yesterday.strftime("%d.%m.%Y"),
  16. yesterday.strftime("%d/%m/%Y"),
  17. ]
  18. errors = {"empty": [], "outdated": []}
  19. files = [f for f in Path(base_dir).glob("*.pdf")]
  20. files_count = len(files)
  21. for i, f in enumerate(files):
  22. print(f"({i}/{files_count}) {f.name} ", end="\r")
  23. with pdfplumber.open(str(f)) as pdf:
  24. text = pdf.pages[0].extract_text()
  25. if re.search(r"\d+ von \d+$", text):
  26. errors["empty"].append(f.name)
  27. report_date = re.search(r"\d{2}[\./]\d{2}[\./]\d{4}", text.split("\n")[0])
  28. if report_date is not None and report_date.group() not in current_date:
  29. errors["outdated"].append([f.name, report_date.group()])
  30. print()
  31. print(json.dumps(errors, indent=2))
  32. def missing_data(base_dir: str):
  33. for f in Path(base_dir).iterdir():
  34. if f.is_dir() and f.name not in [".", "..", "leer", "_leer"]:
  35. missing_data(str(f))
  36. print(base_dir)
  37. try:
  38. Path(base_dir + "/leer").unlink(missing_ok=True)
  39. except Exception:
  40. pass
  41. os.makedirs(base_dir + "/_leer", exist_ok=True)
  42. errors = []
  43. for f in Path(base_dir).glob("*.pdf"):
  44. try:
  45. with pdfplumber.open(str(f)) as pdf:
  46. pages = len(pdf.pages)
  47. text = pdf.pages[0].extract_text()
  48. except (PDFSyntaxError, PdfminerException):
  49. pages = 0
  50. text = ""
  51. # print(text)
  52. if pages < 2 and any(
  53. [
  54. re.search(r"Daten nicht verf", text),
  55. text.endswith("unprod.abw."),
  56. (len(text) < 650 and text.count("\n") < 8),
  57. ]
  58. ):
  59. errors.append(f.name)
  60. target = Path(base_dir + "/_leer/" + f.name)
  61. target.unlink(missing_ok=True)
  62. f.rename(target)
  63. return errors
  64. if __name__ == "__main__":
  65. missing_data("C:\\GlobalCube\\ReportOutput")