# pdf_test.py
  1. import os
  2. import pdfplumber
  3. import re
  4. import json
  5. from pathlib import Path
  6. from datetime import datetime, timedelta
  7. def current_date_test(base_dir: str):
  8. today = datetime.now()
  9. yesterday = today - timedelta(days=1)
  10. current_date = [today.strftime('%d.%m.%Y'), today.strftime('%d/%m/%Y'), yesterday.strftime('%d.%m.%Y'), yesterday.strftime('%d/%m/%Y')]
  11. errors = {
  12. 'empty': [],
  13. 'outdated': []
  14. }
  15. files = [f for f in Path(base_dir).glob('*.pdf')]
  16. files_count = len(files)
  17. for i, f in enumerate(files):
  18. print(f'({i}/{files_count}) {f.name} ', end='\r')
  19. with pdfplumber.open(str(f)) as pdf:
  20. text = pdf.pages[0].extract_text()
  21. if re.search(r'\d+ von \d+$', text):
  22. errors['empty'].append(f.name)
  23. report_date = re.search(r'\d{2}[\./]\d{2}[\./]\d{4}', text.split('\n')[0])
  24. if report_date is not None and report_date.group() not in current_date:
  25. errors['outdated'].append([f.name, report_date.group()])
  26. print()
  27. print(json.dumps(errors, indent=2))
  28. def missing_data(base_dir: str):
  29. os.makedirs(base_dir + '/leer', exist_ok=True)
  30. errors = []
  31. for f in Path(base_dir).glob('*.pdf'):
  32. with pdfplumber.open(str(f)) as pdf:
  33. pages = len(pdf.pages)
  34. text = pdf.pages[0].extract_text()
  35. # print(text)
  36. if pages < 2 and any([
  37. re.search(r'Daten nicht verf', text),
  38. text.endswith('unprod.abw.'),
  39. (len(text) < 650 and text.count('\n') < 8)
  40. ]):
  41. errors.append(f.name)
  42. f.rename(base_dir + '/leer/' + f.name)
  43. return errors
  44. if __name__ == '__main__':
  45. # current_date_test('C:\\GAPS_BMW\\Portal\\Publish\\daten\\GAPS_BMW_neu')
  46. missing_data('C:\\GlobalCube\\ReportOutput\\raw')