12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- import os
- import re
- from datetime import datetime
- from pathlib import Path
# Twelve hours expressed in seconds.
_MAX_AGE_SECONDS = 12 * 60 * 60

# Cutoff timestamp (seconds since the epoch): csv_cleanup_file() skips any
# file whose modification time lies before this value. It is evaluated once,
# when the module is imported, not on every call.
MIN_AGE = datetime.now().timestamp() - _MAX_AGE_SECONDS
def csv_cleanup(dirname: str = "misc/data"):
    """Run csv_cleanup_file() on one file or on every CSV file in a directory.

    Args:
        dirname: Path to a single file, or to a directory. For a directory,
            every entry matching ``*.csv`` directly inside it is processed
            (no recursion). A path that is not an existing file is treated
            as a directory.
    """
    target = Path(dirname)

    # A single file is cleaned directly, whatever its extension.
    if target.is_file():
        csv_cleanup_file(target)
        return

    for entry in target.glob("*.csv"):
        csv_cleanup_file(entry)
def csv_cleanup_file(csv_file: Path):
    """Repair records of a semicolon-separated file that were split over lines.

    The first line is taken as the header; its number of ``;`` separators is
    the expected separator count of every record. Each following line has its
    tab characters removed and is then handled as follows:

    * Too few separators: the line (without its line break) is appended to a
      buffer and joined with the lines that follow.
    * Exactly the expected count (buffer included): the record is written to
      the output. If the buffer was involved this counts as a repaired record
      and ``*`` is printed.
    * Too many separators: the record is passed through cleanup_line(). If
      the result has the expected count it is written to the output,
      otherwise it goes to the error file, ``-`` is printed and it counts as
      an error.

    A partial record still buffered at the end of the input is written to the
    error file as well, so that it is reported rather than silently lost.

    The cleaned data is written to ``<csv_file>.tmp``, which receives the
    original modification time and then replaces ``csv_file``. Rejected
    records are written to ``<csv_file>.err``; that file is removed again
    when no record was rejected. Files are read and written as latin-1.

    Args:
        csv_file: File to clean in place. Files whose modification time is
            older than the module-level ``MIN_AGE`` cutoff are left untouched.
    """
    temp_file = Path(str(csv_file) + ".tmp")
    error_file = Path(str(csv_file) + ".err")

    file_mtime = csv_file.stat().st_mtime
    if file_mtime < MIN_AGE:
        return

    print(csv_file.name)

    solved_count = 0
    error_count = 0

    with open(csv_file, "r", encoding="latin-1", errors="ignore") as frh, open(
        temp_file, "w", encoding="latin-1"
    ) as fwh, open(error_file, "w", encoding="latin-1") as ewh:
        header = frh.readline()
        fwh.write(header)
        ewh.write(header)
        sep_count = header.count(";")

        buffer = ""
        buffer_count = 0

        # Iterate the handle directly so the file is streamed line by line.
        for line in frh:
            line = line.replace("\t", "")
            total_count = buffer_count + line.count(";")

            if total_count < sep_count:
                # Incomplete record: keep it without its line break and wait
                # for the continuation on the following line(s).
                buffer += re.sub(r"[\r\n]+", "", line)
                buffer_count = total_count
                continue

            if total_count == sep_count:
                fwh.write(buffer + line)
                if buffer != "":
                    # The record had to be joined from several lines.
                    print("*", end="")
                    solved_count += 1
            else:
                # Too many separators: try to drop the ones inside quotes.
                line2 = cleanup_line(buffer + line)
                if line2.count(";") == sep_count:
                    fwh.write(line2)
                else:
                    ewh.write(line2)
                    print("-", end="")
                    error_count += 1

            buffer = ""
            buffer_count = 0

        # An unfinished record at end of file can never be completed; report
        # it instead of dropping it.
        if buffer.strip():
            ewh.write(buffer + "\n")
            print("-", end="")
            error_count += 1

    if error_count + solved_count > 0:
        # Terminate the line of progress markers.
        print("")

    # Must happen after the file is closed, otherwise the final flush would
    # update the modification time again.
    os.utime(temp_file, (file_mtime, file_mtime))

    if solved_count > 0:
        # German: "line shifts fixed"
        print(f"Zeilenverschiebungen behoben: {solved_count}")
    if error_count == 0:
        error_file.unlink()
    else:
        # German: "number of errors in <file>"
        print(f"!! Anzahl Fehler in '{csv_file.name}': {error_count} !!")

    # replace() overwrites the target in one step on every platform, so there
    # is no moment at which the original has been deleted but not yet replaced.
    temp_file.replace(csv_file)
def cleanup_line(line: str) -> str:
    """Remove ``;`` separators that occur inside double-quoted fields.

    The line is split at every ``;``. A piece containing a double quote is
    treated as the start of a quoted field and is concatenated with the
    following pieces, without re-inserting the ``;``, until the result ends
    with a double quote (a piece consisting of a lone ``"`` is always taken
    as an opening quote). Pieces without a double quote are kept unchanged.

    If a quoted field is never closed, the remaining pieces are merged into
    it and the function returns normally.

    Args:
        line: One record, with or without trailing line break.

    Returns:
        The re-joined record, always terminated by a single ``"\\n"``.
    """
    res = []
    line_iter = iter(line.strip("\r\n").split(";"))
    for col in line_iter:
        if '"' not in col:
            # Not a quoted string: take the column as it is.
            res.append(col)
            continue

        buffer = col
        while buffer == '"' or not buffer.endswith('"'):
            # A bare next() would raise StopIteration out of this function
            # when the closing quote is missing; use a default instead.
            following = next(line_iter, None)
            if following is None:
                break
            buffer += following
        res.append(buffer)
    return ";".join(res) + "\n"
|