1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495 |
- import os
- import re
- from datetime import datetime
- from pathlib import Path
- import plac
# Cutoff as a POSIX timestamp: twelve hours before this module was loaded.
# csv_cleanup_file skips every file whose modification time lies before it.
MIN_AGE = datetime.now().timestamp() - 12 * 3600  # 12 h expressed in seconds
def csv_cleanup(dirname: str = "misc/data"):
    """Clean up a single CSV file or every ``*.csv`` file in a directory.

    Args:
        dirname: Path to a file or a directory. If it names an existing
            regular file, only that file is processed; otherwise it is
            treated as a directory and its ``*.csv`` entries are processed
            (non-recursively).
    """
    target = Path(dirname)
    if target.is_file():
        candidates = [target]
    else:
        # Path.glob is lazy, so files are handled one by one while scanning.
        candidates = target.glob("*.csv")

    for candidate in candidates:
        csv_cleanup_file(candidate)
def csv_cleanup_file(csv_file: Path):
    """Repair the rows of one semicolon-separated CSV file in place.

    The header line defines how many ``;`` separators every row must have.
    Rows with too few separators are assumed to be broken by stray line
    breaks and are joined with the following line(s); rows with too many
    are passed to ``cleanup_line``. Rows that still do not match are printed
    with an ``Error:`` prefix and left out of the result.

    The cleaned content is written to ``<csv_file>.tmp`` (latin-1), gets the
    original modification time, and then replaces the original file.
    Files last modified before ``MIN_AGE`` are skipped.

    Args:
        csv_file: Path of the CSV file to clean.
    """
    temp_file = Path(str(csv_file) + ".tmp")
    file_mtime = csv_file.stat().st_mtime
    if file_mtime < MIN_AGE:
        # Not modified since the cutoff: nothing to do.
        return

    print(csv_file.name)
    with open(csv_file, "r", encoding="latin-1") as frh:
        with open(temp_file, "w", encoding="latin-1") as fwh:
            header = frh.readline()
            fwh.write(header)
            sep_count = header.count(";")

            buffer = ""  # joined pieces of a row that is still incomplete
            buffer_count = 0  # number of separators collected in buffer
            # Iterate the handle directly instead of loading all lines first.
            for line in frh:
                line = line.replace("\t", "")
                total = buffer_count + line.count(";")
                if total == sep_count:
                    if buffer:
                        # Progress marker: a row was stitched from several lines.
                        print("*", end="")
                    fwh.write(buffer + line)
                    buffer = ""
                    buffer_count = 0
                elif total < sep_count:
                    # Row is incomplete: drop the line break and keep collecting.
                    buffer += re.sub(r"[\r\n]+", "", line)
                    buffer_count = buffer.count(";")
                else:
                    # Too many separators: try to merge split quoted fields.
                    line2 = cleanup_line(buffer + line)
                    if line2.count(";") == sep_count:
                        fwh.write(line2)
                    else:
                        print(f"Error: {line2}")
                    buffer = ""
                    buffer_count = 0

            if buffer:
                # The file ended in the middle of a row; report it instead of
                # discarding the collected data silently.
                print(f"Error: {buffer}")
    print("")

    # Keep the original modification time on the cleaned file.
    os.utime(temp_file, (file_mtime, file_mtime))
    # Single replace step: the original is never deleted before its
    # replacement is in place (unlike unlink() followed by rename()).
    temp_file.replace(csv_file)
def cleanup_line(line):
    """Merge the pieces of quoted fields that contain stray ``;`` separators.

    The line is split on ``;``. A piece that starts with ``"`` but does not
    also end with one opens a quoted field; all following pieces are joined
    to it (without the separator) until a piece ending in ``"`` closes the
    field. Pieces outside a quoted field are kept unchanged, including
    pieces with a quote only in their interior or only at their end.

    Args:
        line: One CSV row, with or without trailing ``\\r``/``\\n``.

    Returns:
        The repaired row, joined with ``;`` and terminated by ``"\\n"``.
        A quoted field that is never closed is emitted as collected so that
        no input data is lost.
    """
    res = []
    pending = None  # text of the quoted field currently open, if any
    for col in line.strip("\r\n").split(";"):
        if pending is None:
            # A lone '"' is an opening quote, not a complete quoted field.
            is_complete = len(col) > 1 and col.endswith('"')
            if col.startswith('"') and not is_complete:
                pending = col
            else:
                res.append(col)
            continue

        # Inside a quoted field: keep every piece, in order.
        pending += col
        if col.endswith('"'):
            res.append(pending)
            pending = None

    if pending is not None:
        # Unterminated quoted field: keep what was collected.
        res.append(pending)
    return ";".join(res) + "\n"
if __name__ == "__main__":
    # Run as a script: hand csv_cleanup to plac as the command-line entry point.
    plac.call(csv_cleanup)
    # Example of a direct call on a single file instead:
    #   csv_cleanup("C:\\Users\\GAPS\\Desktop\\GuV_8_O21_csv.csv")
|