# pdf_test.py — QA checks for generated PDF reports (empty / outdated detection)
  1. import pdfplumber
  2. import re
  3. import json
  4. from pathlib import Path
  5. from datetime import datetime, timedelta
  6. def current_date_test(base_dir: str):
  7. today = datetime.now()
  8. yesterday = today - timedelta(days=1)
  9. current_date = [today.strftime('%d.%m.%Y'), today.strftime('%d/%m/%Y'), yesterday.strftime('%d.%m.%Y'), yesterday.strftime('%d/%m/%Y')]
  10. errors = {
  11. 'empty': [],
  12. 'outdated': []
  13. }
  14. files = [f for f in Path(base_dir).glob('*.pdf')]
  15. files_count = len(files)
  16. for i, f in enumerate(files):
  17. print(f'({i}/{files_count}) {f.name} ', end='\r')
  18. with pdfplumber.open(str(f)) as pdf:
  19. text = pdf.pages[0].extract_text()
  20. if re.search(r'\d+ von \d+$', text):
  21. errors['empty'].append(f.name)
  22. report_date = re.search(r'\d{2}[\./]\d{2}[\./]\d{4}', text.split('\n')[0])
  23. if report_date is not None and report_date.group() not in current_date:
  24. errors['outdated'].append([f.name, report_date.group()])
  25. print()
  26. print(json.dumps(errors, indent=2))
  27. def missing_data(base_dir: str):
  28. errors = []
  29. for f in Path(base_dir).glob('*.pdf'):
  30. with pdfplumber.open(str(f)) as pdf:
  31. pages = len(pdf.pages)
  32. text = pdf.pages[0].extract_text()
  33. # print(text)
  34. if pages < 2 and (re.search(r'Daten nicht verf', text) or text.endswith('unprod.abw.') or (len(text) < 650 and text.count('\n') < 8)):
  35. errors.append(f.name)
  36. f.rename(base_dir + '/../leer/' + f.name)
  37. return errors
  38. def main():
  39. # current_date_test('C:\\GAPS_BMW\\Portal\\Publish\\daten\\GAPS_BMW_neu')
  40. missing_data('C:\\GlobalCube\\ReportOutput\\raw')
  41. if __name__ == '__main__':
  42. main()