@@ -4,26 +4,67 @@ import json
 from pathlib import Path
 from datetime import datetime, timedelta
 
-today = datetime.now()
-yesterday = today - timedelta(days=1)
-current_date = [today.strftime('%d.%m.%Y'), today.strftime('%d/%m/%Y'), yesterday.strftime('%d.%m.%Y'), yesterday.strftime('%d/%m/%Y')]
-errors = {
-    'empty': [],
-    'outdated': []
-}
-
-files = [f for f in Path('C:\\GAPS_BMW\\Portal\\Publish\\daten\\GAPS_BMW_neu').glob('*.pdf')]
-files_count = len(files)
-
-for i, f in enumerate(files):
-    print(f'({i}/{files_count}) {f.name} ', end='\r')
-    with pdfplumber.open(str(f)) as pdf:
-        text = pdf.pages[0].extract_text()
-    if re.search(r'\d+ von \d+$', text):
-        errors['empty'].append(f.name)
-    report_date = re.search(r'\d{2}[\./]\d{2}[\./]\d{4}', text.split('\n')[0])
-    if report_date is not None and report_date.group() not in current_date:
-        errors['outdated'].append([f.name, report_date.group()])
-
-print()
-print(json.dumps(errors, indent=2))
+
+def current_date_test(base_dir: str):
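+    """Flag PDFs in base_dir whose first page is empty or whose report date
+    is neither today's nor yesterday's (dotted or slashed notation)."""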
+    today = datetime.now()
+    yesterday = today - timedelta(days=1)
+    current_date = [today.strftime('%d.%m.%Y'), today.strftime('%d/%m/%Y'), yesterday.strftime('%d.%m.%Y'), yesterday.strftime('%d/%m/%Y')]
+    errors = {
+        'empty': [],
+        'outdated': []
+    }
+
+    files = list(Path(base_dir).glob('*.pdf'))
+    files_count = len(files)
+
+    for i, f in enumerate(files, 1):
+        print(f'({i}/{files_count}) {f.name} ', end='\r')
+        with pdfplumber.open(str(f)) as pdf:
+            text = pdf.pages[0].extract_text() or ''
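+        # first-page text that ends in the bare page footer ('N von M',
+        # i.e. 'page N of M') means the report carries no content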
+        if re.search(r'\d+ von \d+$', text):
+            errors['empty'].append(f.name)
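+        # the first text line carries the report date; any date other than
+        # today or yesterday marks the report as outdated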
+        report_date = re.search(r'\d{2}[\./]\d{2}[\./]\d{4}', text.split('\n')[0])
+        if report_date is not None and report_date.group() not in current_date:
+            errors['outdated'].append([f.name, report_date.group()])
+
+    print()
+    print(json.dumps(errors, indent=2))
+
+
+def missing_data(base_dir: str) -> list[str]:
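+    """Move PDFs that contain no data into the 'empty' subfolder, dump the
+    first-page text of all other files into 'plain', and return the names
+    of the empty files."""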
+    errors = []
+    for f in Path(base_dir).glob('*.pdf'):
+        with pdfplumber.open(str(f)) as pdf:
+            pages = len(pdf.pages)
+            text = pdf.pages[0].extract_text() or ''
+
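+        # heuristic: a single-page report showing the 'Daten nicht verf...'
+        # marker (presumably 'Daten nicht verfügbar', i.e. data not
+        # available) or containing hardly any text counts as empty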
+        if pages < 2 and (re.search(r'Daten nicht verf', text) or (len(text) < 650 and text.count('\n') < 8)):
+            errors.append(f.name)
+            f.rename(Path(base_dir) / 'empty' / f.name)
+            continue
+
+        with open(Path(base_dir) / 'plain' / (f.name + '.txt'), 'wb') as fwh:
+            fwh.write(text.encode('utf8'))
+    return errors
+
+
+def main():
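+    # the date check is currently disabled; uncomment to run it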
+    # current_date_test('C:\\GAPS_BMW\\Portal\\Publish\\daten\\GAPS_BMW_neu')
+    missing_data('C:\\GlobalCube\\ReportOutput')
+
+
+if __name__ == '__main__':
+    main()