"""Repair semicolon-separated CSV files whose rows were broken or over-split.

A row is considered intact when it contains as many ``;`` separators as the
header line.  Rows with too few separators are merged with the following
physical lines; rows with too many are passed through :func:`cleanup_line`,
which removes separators that sit inside double-quoted fields.  Rows that
still do not match are written to ``<file>.err``.
"""
import os
import re
from datetime import datetime
from pathlib import Path
from typing import TextIO, Tuple

# Cut-off timestamp, fixed when the module is imported: files whose mtime is
# older than 12 hours before that moment are skipped by csv_cleanup_file().
MIN_AGE = datetime.now().timestamp() - 12 * 60 * 60

# Line-break characters stripped from partial rows before they are merged.
_LINE_BREAKS = re.compile(r"[\r\n]+")


def csv_cleanup(dirname: str = "misc/data") -> None:
    """Clean one CSV file, or every ``*.csv`` directly inside a directory.

    Args:
        dirname: Path of a single file or of a directory to scan
            (non-recursive).
    """
    target = Path(dirname)
    if target.is_file():
        csv_cleanup_file(target)
        return
    # Materialise the listing first: processing creates, deletes and renames
    # files in this very directory.  Sorting gives a deterministic order.
    for csv_file in sorted(target.glob("*.csv")):
        csv_cleanup_file(csv_file)


def csv_cleanup_file(csv_file: Path) -> None:
    """Rewrite *csv_file* in place with broken rows repaired.

    Files with an mtime older than ``MIN_AGE`` are left untouched.  The
    repaired content is written to ``<file>.tmp`` and then moved over the
    original, keeping the original modification time.  Rows that cannot be
    repaired are collected (below a copy of the header) in ``<file>.err``;
    that file is removed again when there were no errors.

    Args:
        csv_file: The CSV file to process (read and written as latin-1).
    """
    temp_file = Path(str(csv_file) + ".tmp")
    error_file = Path(str(csv_file) + ".err")
    file_mtime = csv_file.stat().st_mtime
    if file_mtime < MIN_AGE:
        return

    print(csv_file.name)
    # latin-1 maps every byte to a character, so decoding cannot fail.
    with open(csv_file, "r", encoding="latin-1") as frh:
        with open(temp_file, "w", encoding="latin-1") as fwh:
            with open(error_file, "w", encoding="latin-1") as ewh:
                header = frh.readline()
                fwh.write(header)
                ewh.write(header)
                solved_count, error_count = _repair_rows(
                    frh, fwh, ewh, header.count(";")
                )

    if error_count + solved_count > 0:
        # Terminate the line of "*" / "-" progress markers.
        print("")
    os.utime(temp_file, (file_mtime, file_mtime))
    if solved_count > 0:
        print(f"Zeilenverschiebungen behoben: {solved_count}")
    if error_count == 0:
        error_file.unlink()
    else:
        print(f"!! Anzahl Fehler in '{csv_file.name}': {error_count} !!")
    # Path.replace overwrites the target in a single step, so there is no
    # moment at which the CSV file is missing from disk.
    temp_file.replace(csv_file)


def _repair_rows(
    frh: TextIO, fwh: TextIO, ewh: TextIO, sep_count: int
) -> Tuple[int, int]:
    """Copy data rows from *frh* to *fwh*, repairing them where possible.

    Args:
        frh: Input handle, positioned just after the header line.
        fwh: Output handle for intact and repaired rows.
        ewh: Output handle for rows that could not be repaired.
        sep_count: Number of ``;`` separators every row must contain.

    Returns:
        ``(solved_count, error_count)``: rows repaired by merging physical
        lines, and rows written to *ewh*.
    """
    buffer = ""
    buffer_count = 0
    solved_count = 0
    error_count = 0

    for line in frh:
        line = line.replace("\t", "")
        total = buffer_count + line.count(";")

        if total < sep_count:
            # Row is still incomplete: keep it (without its line break) and
            # continue with the next physical line.
            buffer += _LINE_BREAKS.sub("", line)
            buffer_count = buffer.count(";")
            continue

        if total == sep_count:
            if not buffer:
                fwh.write(line)
                continue
            fwh.write(buffer + line)
            print("*", end="")
            solved_count += 1
        else:
            # Too many separators: try dropping those inside quoted fields.
            line2 = cleanup_line(buffer + line)
            if line2.count(";") == sep_count:
                fwh.write(line2)
            else:
                ewh.write(line2)
                print("-", end="")
                error_count += 1
        buffer = ""
        buffer_count = 0

    if buffer:
        # Input ended in the middle of a row; report the fragment instead of
        # dropping it silently.
        ewh.write(buffer + "\n")
        print("-", end="")
        error_count += 1

    return solved_count, error_count


def cleanup_line(line):
    """Remove ``;`` separators that occur inside double-quoted fields.

    The line is split on ``;``.  A piece containing a double quote is joined
    with the following pieces, without re-inserting the separator, until the
    joined text ends with a double quote (a lone ``"`` counts as still open).
    If the line ends before a closing quote is found, the pieces gathered so
    far are kept as the last column.

    Args:
        line: One CSV row, with or without a trailing line break.

    Returns:
        The rebuilt row, always terminated by a single ``"\\n"``.
    """
    res = []
    line_iter = iter(line.strip("\r\n").split(";"))
    for col in line_iter:
        if '"' not in col:
            # no string
            res.append(col)
            continue
        buffer = col
        while buffer == '"' or buffer[-1] != '"':
            following = next(line_iter, None)
            if following is None:
                # Unterminated quote: stop here rather than letting
                # StopIteration escape to the caller.
                break
            buffer += following
        res.append(buffer)
    return ";".join(res) + "\n"