pdf_test.py 1023 B

1234567891011121314151617181920212223242526272829
  1. import pdfplumber
  2. import re
  3. import json
  4. from pathlib import Path
  5. from datetime import datetime, timedelta
  6. today = datetime.now()
  7. yesterday = today - timedelta(days=1)
  8. current_date = [today.strftime('%d.%m.%Y'), today.strftime('%d/%m/%Y'), yesterday.strftime('%d.%m.%Y'), yesterday.strftime('%d/%m/%Y')]
  9. errors = {
  10. 'empty': [],
  11. 'outdated': []
  12. }
  13. files = [f for f in Path('C:\\GAPS_BMW\\Portal\\Publish\\daten\\GAPS_BMW_neu').glob('*.pdf')]
  14. files_count = len(files)
  15. for i, f in enumerate(files):
  16. print(f'({i}/{files_count}) {f.name} ', end='\r')
  17. with pdfplumber.open(str(f)) as pdf:
  18. text = pdf.pages[0].extract_text()
  19. if re.search(r'\d+ von \d+$', text):
  20. errors['empty'].append(f.name)
  21. report_date = re.search(r'\d{2}[\./]\d{2}[\./]\d{4}', text.split('\n')[0])
  22. if report_date is not None and report_date.group() not in current_date:
  23. errors['outdated'].append([f.name, report_date.group()])
  24. print()
  25. print(json.dumps(errors, indent=2))