123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106 |
- import os
- import re
- from datetime import datetime
- from pathlib import Path
- import plac
# Cutoff timestamp in seconds since the epoch, computed once at import time:
# "now" minus 12 hours. csv_cleanup_file skips every file whose modification
# time lies before this cutoff.
MIN_AGE = datetime.now().timestamp() - 12 * 3600
def csv_cleanup(dirname: str = "misc/data"):
    """Run csv_cleanup_file on one CSV file or on every ``*.csv`` in a folder.

    Args:
        dirname: Path to a single file, or to a directory whose direct
            ``*.csv`` children are processed (no recursion).
    """
    target = Path(dirname)
    # A plain file is handled on its own; anything else is globbed lazily.
    candidates = [target] if target.is_file() else target.glob("*.csv")
    for candidate in candidates:
        csv_cleanup_file(candidate)
def csv_cleanup_file(csv_file: Path):
    """Re-join records of a semicolon-separated file that were split over lines.

    The first line is taken as the header; its number of ``;`` separators is
    the expected separator count for every record. Tabs are removed from all
    data lines. Lines with too few separators are accumulated (line breaks
    stripped) until the expected count is reached. Lines with too many
    separators are passed through cleanup_line; if that does not yield the
    expected count either, the line is written to ``<csv_file>.err``.

    The cleaned content is written to ``<csv_file>.tmp``, which receives the
    original modification time and then replaces ``csv_file``. The ``.err``
    file is always created and always contains at least the header.

    Files whose modification time is older than ``MIN_AGE`` are skipped.

    Args:
        csv_file: Path of the CSV file to clean in place (read and written
            as latin-1).
    """
    file_mtime = csv_file.stat().st_mtime
    if file_mtime < MIN_AGE:
        # Last modified before the cutoff: leave the file untouched.
        return

    temp_file = Path(str(csv_file) + ".tmp")
    error_file = Path(str(csv_file) + ".err")
    print(csv_file.name)

    with open(csv_file, "r", encoding="latin-1") as frh, \
            open(temp_file, "w", encoding="latin-1") as fwh, \
            open(error_file, "w", encoding="latin-1") as ewh:
        header = frh.readline()
        fwh.write(header)
        ewh.write(header)
        sep_count = header.count(";")

        buffer = ""  # partial record collected so far, without line breaks
        buffer_count = 0  # number of ";" contained in buffer
        solved_count = 0
        error_count = 0

        # Iterate the handle directly instead of loading all lines at once.
        for line in frh:
            line = line.replace("\t", "")
            total = buffer_count + line.count(";")

            if total == sep_count:
                if buffer:
                    # The buffered fragments plus this line form a full record.
                    fwh.write(buffer + line)
                    print("*", end="")
                    solved_count += 1
                    buffer = ""
                    buffer_count = 0
                else:
                    fwh.write(line)
            elif total < sep_count:
                # Record continues on the next line: keep collecting.
                buffer += re.sub(r"[\r\n]+", "", line)
                buffer_count = total
            else:
                # Too many separators: try to merge columns that were split
                # inside quoted fields.
                line2 = cleanup_line(buffer + line)
                if line2.count(";") == sep_count:
                    fwh.write(line2)
                else:
                    ewh.write(line2)
                    print("-", end="")
                    error_count += 1
                buffer = ""
                buffer_count = 0

        if buffer:
            # The file ended in the middle of a record. Report the fragment
            # in the error file instead of silently discarding it.
            ewh.write(buffer + "\n")
            print("-", end="")
            error_count += 1

        if error_count + solved_count > 0:
            # Terminate the line of "*" / "-" progress markers.
            print("")
        if error_count > 0:
            print(f"!! Anzahl Fehler: {error_count} !!")

    # Keep the original modification time on the cleaned file.
    os.utime(temp_file, (file_mtime, file_mtime))
    # Single replace step: the CSV never disappears from disk, unlike with a
    # separate unlink() followed by rename().
    temp_file.replace(csv_file)
def cleanup_line(line):
    """Merge columns that were split at a ``;`` inside a double-quoted field.

    The line is split at every ``;``. A column that opens a quote without
    closing it starts a pending field; following columns are appended to it
    (the separating ``;`` is dropped, which reduces the separator count)
    until a column ending in ``"`` closes it. No column content is discarded:
    columns with a quote only in the middle are kept, and a field that is
    never closed is emitted as it stands at the end of the line.

    Args:
        line: One record, optionally terminated by ``\\r`` / ``\\n``.

    Returns:
        The rebuilt record, joined with ``;`` and terminated by ``\\n``.
    """
    res = []
    buffer = ""  # pending quoted field that has been opened but not closed
    for col in line.strip("\r\n").split(";"):
        if '"' not in col:
            if buffer:
                # Middle piece of a split quoted field: belongs to the field.
                buffer += col
            else:
                res.append(col)
        elif col == '"':
            # A lone quote either opens a field or closes the pending one.
            if buffer:
                res.append(buffer + col)
                buffer = ""
            else:
                buffer = col
        elif col[0] == '"' and col[-1] == '"':
            # Complete quoted column.
            res.append(col)
        elif col[0] == '"':
            if buffer:
                # Previous field was never closed; keep it rather than
                # overwriting it with the new opening fragment.
                res.append(buffer)
            buffer = col
        elif col[-1] == '"':
            # Closing fragment (with an empty buffer this is just the column).
            res.append(buffer + col)
            buffer = ""
        elif buffer:
            # Quote only in the middle while a field is open: part of it.
            buffer += col
        else:
            # Quote only in the middle of a standalone column: keep as is.
            res.append(col)
    if buffer:
        # Unterminated quoted field at end of line: do not lose its content.
        res.append(buffer)
    return ";".join(res) + "\n"
# Script entry point: csv_cleanup is handed to plac, which runs it as the
# command-line interface (presumably exposing its `dirname` parameter as an
# optional CLI argument — confirm against the plac documentation).
if __name__ == "__main__":
    plac.call(csv_cleanup)
-
|