import json
import re
from datetime import datetime, timedelta
from pathlib import Path

import pdfplumber


def current_date_test(base_dir: str) -> None:
    """Scan the PDFs in *base_dir* and report empty or outdated reports.

    A report is flagged 'empty' when its first page's text ends with a
    page marker like '1 von 1', and 'outdated' when the date found in
    the first text line is neither today's nor yesterday's date.
    The findings are printed as indented JSON; nothing is returned.
    """
    today = datetime.now()
    yesterday = today - timedelta(days=1)
    # Reports may use either '.' or '/' as the date separator.
    valid_dates = [
        day.strftime(fmt)
        for day in (today, yesterday)
        for fmt in ('%d.%m.%Y', '%d/%m/%Y')
    ]
    errors: dict[str, list] = {'empty': [], 'outdated': []}
    files = list(Path(base_dir).glob('*.pdf'))
    files_count = len(files)
    # start=1 so the progress display reads (1/N)..(N/N), not (0/N)..(N-1/N).
    for i, f in enumerate(files, start=1):
        print(f'({i}/{files_count}) {f.name} ', end='\r')
        with pdfplumber.open(str(f)) as pdf:
            # extract_text() may return None for pages without a text layer.
            text = pdf.pages[0].extract_text() or ''
        if re.search(r'\d+ von \d+$', text):
            errors['empty'].append(f.name)
        first_line = text.split('\n')[0]
        report_date = re.search(r'\d{2}[\./]\d{2}[\./]\d{4}', first_line)
        if report_date is not None and report_date.group() not in valid_dates:
            errors['outdated'].append([f.name, report_date.group()])
    print()
    print(json.dumps(errors, indent=2))


def missing_data(base_dir: str) -> list[str]:
    """Sort out PDFs in *base_dir* that contain no usable data.

    Single-page PDFs whose text contains a "Daten nicht verf..." marker,
    or that hold suspiciously little text, are moved to
    ``<base_dir>/empty`` and their names collected in the returned list.
    For every other PDF the first page's text is written to
    ``<base_dir>/plain/<name>.txt`` (UTF-8).
    NOTE(review): assumes the ``empty`` and ``plain`` subdirectories
    already exist — confirm, or create them beforehand.
    """
    base = Path(base_dir)
    errors: list[str] = []
    for f in base.glob('*.pdf'):
        with pdfplumber.open(str(f)) as pdf:
            pages = len(pdf.pages)
            # extract_text() may return None for pages without a text layer.
            text = pdf.pages[0].extract_text() or ''
        # Heuristic: a single page that either states the data is
        # unavailable or carries almost no text means an empty report.
        if pages < 2 and (
            re.search(r'Daten nicht verf', text)
            or (len(text) < 650 and text.count('\n') < 8)
        ):
            errors.append(f.name)
            f.rename(base / 'empty' / f.name)
            continue
        (base / 'plain' / (f.name + '.txt')).write_bytes(text.encode('utf8'))
    return errors


def main() -> None:
    # current_date_test('C:\\GAPS_BMW\\Portal\\Publish\\daten\\GAPS_BMW_neu')
    missing_data('C:\\GlobalCube\\ReportOutput')


if __name__ == '__main__':
    main()