pdf_test.py

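"""Sanity checks for generated PDF reports: flag reports whose first page is
empty, lacks data, or carries an outdated report date."""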
import json
import re
from datetime import datetime, timedelta
from pathlib import Path

import pdfplumber


def current_date_test(base_dir: str):
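    """Print, as JSON, the PDFs in base_dir whose first page looks empty or
    whose report date is neither today's nor yesterday's."""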
    today = datetime.now()
    yesterday = today - timedelta(days=1)
    # Accept today's and yesterday's date, in both separator styles
    # that appear in the reports ("." and "/").
    current_date = [
        today.strftime('%d.%m.%Y'),
        today.strftime('%d/%m/%Y'),
        yesterday.strftime('%d.%m.%Y'),
        yesterday.strftime('%d/%m/%Y'),
    ]
    errors = {
        'empty': [],
        'outdated': [],
    }
    files = list(Path(base_dir).glob('*.pdf'))
    files_count = len(files)
    for i, f in enumerate(files, start=1):
        print(f'({i}/{files_count}) {f.name} ', end='\r')
        with pdfplumber.open(str(f)) as pdf:
            # extract_text() may return None for pages without a text layer
            text = pdf.pages[0].extract_text() or ''
        # A first page that ends in a bare page footer ("<n> von <m>" is
        # German for "<n> of <m>") indicates a report without content.
        if re.search(r'\d+ von \d+$', text):
            errors['empty'].append(f.name)
        # The report date is expected in the first line of the page.
        report_date = re.search(r'\d{2}[./]\d{2}[./]\d{4}', text.split('\n')[0])
        if report_date is not None and report_date.group() not in current_date:
            errors['outdated'].append([f.name, report_date.group()])
    print()
    print(json.dumps(errors, indent=2))


def missing_data(base_dir: str) -> list[str]:
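    """Move single-page PDFs in base_dir without usable data into the "empty"
    subdirectory and return their names; for all other files, dump the first
    page's text into the "plain" subdirectory. Both subdirectories are
    expected to exist already."""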
    errors = []
    for f in Path(base_dir).glob('*.pdf'):
        with pdfplumber.open(str(f)) as pdf:
            pages = len(pdf.pages)
            # extract_text() may return None for pages without a text layer
            text = pdf.pages[0].extract_text() or ''
        # Single-page reports that carry a "Daten nicht verf..." notice
        # (German, "data not available") or contain hardly any text are
        # treated as empty and moved aside; the PDF is closed before the
        # rename so the move also works on Windows.
        if pages < 2 and (re.search(r'Daten nicht verf', text)
                          or (len(text) < 650 and text.count('\n') < 8)):
            errors.append(f.name)
            f.rename(Path(base_dir, 'empty', f.name))
            continue
        # Keep the extracted first-page text for later inspection.
        with open(Path(base_dir, 'plain', f.name + '.txt'), 'wb') as fwh:
            fwh.write(text.encode('utf8'))
    return errors


def main():
    # current_date_test('C:\\GAPS_BMW\\Portal\\Publish\\daten\\GAPS_BMW_neu')
    missing_data('C:\\GlobalCube\\ReportOutput')


if __name__ == '__main__':
    main()