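"""Sanity checks for exported PDF reports.

current_date_test() prints a JSON summary of reports whose first page is
empty or whose report date is neither today's nor yesterday's.
missing_data() moves single-page reports that contain no data into a
"leer" ("empty") subfolder and returns their file names.
"""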
import json
import os
import re
from datetime import datetime, timedelta
from pathlib import Path

import pdfplumber


def current_date_test(base_dir: str):
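    """Flag PDFs in base_dir whose first page is empty or carries a stale report date."""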
    today = datetime.now()
    yesterday = today - timedelta(days=1)
    # Accepted date strings: today or yesterday, as DD.MM.YYYY and DD/MM/YYYY.
    current_date = [
        today.strftime('%d.%m.%Y'), today.strftime('%d/%m/%Y'),
        yesterday.strftime('%d.%m.%Y'), yesterday.strftime('%d/%m/%Y'),
    ]
    errors = {
        'empty': [],
        'outdated': []
    }
    files = list(Path(base_dir).glob('*.pdf'))
    files_count = len(files)
    for i, f in enumerate(files, start=1):
        print(f'({i}/{files_count}) {f.name} ', end='\r')
        with pdfplumber.open(str(f)) as pdf:
            # extract_text() may return None for pages without a text layer.
            text = pdf.pages[0].extract_text() or ''
        # Treat a first page whose text ends with its "N von M" ("N of M")
        # page footer as empty.
        if re.search(r'\d+ von \d+$', text):
            errors['empty'].append(f.name)
        # The report date is expected in the first line of the page text.
        report_date = re.search(r'\d{2}[./]\d{2}[./]\d{4}', text.split('\n')[0])
        if report_date is not None and report_date.group() not in current_date:
            errors['outdated'].append([f.name, report_date.group()])
    print()
    print(json.dumps(errors, indent=2))


def missing_data(base_dir: str):
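    """Move single-page PDFs without report data into base_dir/leer and return their names."""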
    os.makedirs(base_dir + '/leer', exist_ok=True)
    errors = []
    for f in Path(base_dir).glob('*.pdf'):
        with pdfplumber.open(str(f)) as pdf:
            pages = len(pdf.pages)
            # extract_text() may return None for pages without a text layer.
            text = pdf.pages[0].extract_text() or ''
        # print(text)  # debug: inspect the extracted first-page text
        # A single-page report counts as empty if it matches "Daten nicht verf..."
        # ("data not available..."), ends with the "unprod.abw." label, or its
        # text is too short to hold any data rows.
        if pages < 2 and any([
            re.search(r'Daten nicht verf', text),
            text.endswith('unprod.abw.'),
            (len(text) < 650 and text.count('\n') < 8),
        ]):
            errors.append(f.name)
            f.rename(Path(base_dir) / 'leer' / f.name)
    return errors


if __name__ == '__main__':
    # current_date_test('C:\\GAPS_BMW\\Portal\\Publish\\daten\\GAPS_BMW_neu')
    missing_data('C:\\GlobalCube\\ReportOutput\\raw')