pdf_test.py

import json
import re
from datetime import datetime, timedelta
from pathlib import Path

import pdfplumber

def current_date_test(base_dir: str):
    """Flag PDFs in base_dir whose first page looks empty or carries a stale report date."""
    today = datetime.now()
    yesterday = today - timedelta(days=1)
    # Accept today's and yesterday's date in both formats the reports use.
    current_date = [
        today.strftime("%d.%m.%Y"),
        today.strftime("%d/%m/%Y"),
        yesterday.strftime("%d.%m.%Y"),
        yesterday.strftime("%d/%m/%Y"),
    ]
    errors = {"empty": [], "outdated": []}
    files = list(Path(base_dir).glob("*.pdf"))
    files_count = len(files)
    for i, f in enumerate(files, 1):
        print(f"({i}/{files_count}) {f.name} ", end="\r")
        with pdfplumber.open(str(f)) as pdf:
            # extract_text() returns None for pages without a text layer.
            text = pdf.pages[0].extract_text() or ""
        # A first page whose text ends with nothing but the "N von M"
        # ("N of M") page counter is treated as an empty report.
        if re.search(r"\d+ von \d+$", text):
            errors["empty"].append(f.name)
        # The report date (dd.mm.yyyy or dd/mm/yyyy) sits in the first line.
        report_date = re.search(r"\d{2}[./]\d{2}[./]\d{4}", text.split("\n")[0])
        if report_date is not None and report_date.group() not in current_date:
            errors["outdated"].append([f.name, report_date.group()])
    print()
    print(json.dumps(errors, indent=2))
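
# A minimal illustration (not part of the original script) of the date regex
# above against an assumed first line; the sample string is invented, not
# taken from a real report:
#
#     >>> re.search(r"\d{2}[./]\d{2}[./]\d{4}", "Umsatzbericht 05.03.2024 Seite 1").group()
#     '05.03.2024'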

def missing_data(base_dir: str):
    """Move one-page PDFs that contain no real data into a "leer" (empty) subfolder."""
    target_dir = Path(base_dir) / "leer"
    target_dir.mkdir(parents=True, exist_ok=True)
    errors = []
    for f in Path(base_dir).glob("*.pdf"):
        with pdfplumber.open(str(f)) as pdf:
            pages = len(pdf.pages)
            text = pdf.pages[0].extract_text() or ""
        # A report counts as empty if it has a single page and either states
        # that data is unavailable ("Daten nicht verf[uegbar]"), stops right
        # after the last column header ("unprod.abw."), or is suspiciously short.
        if pages < 2 and any(
            [
                re.search(r"Daten nicht verf", text),
                text.endswith("unprod.abw."),
                (len(text) < 650 and text.count("\n") < 8),
            ]
        ):
            errors.append(f.name)
            # Rename only after the with-block has closed the file handle;
            # Windows refuses to move a file that is still open.
            target = target_dir / f.name
            target.unlink(missing_ok=True)
            f.rename(target)
    return errors
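
# Sketch of the length heuristic in isolation, with an assumed text value
# (invented for illustration, not output from a real PDF):
#
#     >>> sample = "GAPS Bericht\nDaten nicht verfügbar"
#     >>> len(sample) < 650 and sample.count("\n") < 8
#     True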

if __name__ == "__main__":
    # current_date_test('C:\\GAPS_BMW\\Portal\\Publish\\daten\\GAPS_BMW_neu')
    missing_data("C:\\GlobalCube\\ReportOutput\\raw")