# csv_cleanup.py
  1. import os
  2. import re
  3. from datetime import datetime
  4. from pathlib import Path
  5. MIN_AGE = datetime.now().timestamp() - 12 * 60 * 60
  6. def csv_cleanup(dirname: str = "misc/data"):
  7. if Path(dirname).is_file():
  8. csv_cleanup_file(Path(dirname))
  9. else:
  10. for csv_file in Path(dirname).glob("*.csv"):
  11. csv_cleanup_file(csv_file)
  12. def csv_cleanup_file(csv_file: Path):
  13. temp_file = Path(str(csv_file) + ".tmp")
  14. error_file = Path(str(csv_file) + ".err")
  15. file_mtime = csv_file.stat().st_mtime
  16. if file_mtime < MIN_AGE:
  17. return
  18. print(csv_file.name)
  19. with open(csv_file, "r", encoding="latin-1", errors="ignore") as frh:
  20. with open(temp_file, "w", encoding="latin-1") as fwh:
  21. with open(error_file, "w", encoding="latin-1") as ewh:
  22. header = frh.readline()
  23. fwh.write(header)
  24. ewh.write(header)
  25. sep_count = header.count(";")
  26. buffer = ""
  27. buffer_count = 0
  28. solved_count = 0
  29. error_count = 0
  30. for line in frh.readlines():
  31. line = line.replace("\t", "")
  32. current_count = line.count(";")
  33. if buffer_count + current_count == sep_count:
  34. if buffer == "":
  35. fwh.write(line)
  36. continue
  37. fwh.write(buffer + line)
  38. print("*", end="")
  39. solved_count += 1
  40. buffer = ""
  41. buffer_count = 0
  42. elif buffer_count + current_count < sep_count:
  43. buffer += re.subn(r"[\r\n]+", "", line)[0]
  44. buffer_count = buffer.count(";")
  45. else:
  46. line2 = cleanup_line(buffer + line)
  47. if line2.count(";") == sep_count:
  48. fwh.write(line2)
  49. else:
  50. ewh.write(line2)
  51. print("-", end="")
  52. error_count += 1
  53. buffer = ""
  54. buffer_count = 0
  55. if error_count + solved_count > 0:
  56. print("")
  57. os.utime(temp_file, (file_mtime, file_mtime))
  58. if solved_count > 0:
  59. print(f"Zeilenverschiebungen behoben: {solved_count}")
  60. if error_count == 0:
  61. error_file.unlink()
  62. else:
  63. print(f"!! Anzahl Fehler in '{csv_file.name}': {error_count} !!")
  64. csv_file.unlink()
  65. temp_file.rename(csv_file)
  66. def cleanup_line(line):
  67. res = []
  68. buffer = ""
  69. line_iter = iter(line.strip("\r\n").split(";"))
  70. for col in line_iter:
  71. if '"' not in col:
  72. # no string
  73. res.append(col)
  74. continue
  75. buffer = col
  76. while buffer == '"' or buffer[-1] != '"':
  77. buffer += next(line_iter)
  78. res.append(buffer)
  79. return ";".join(res) + "\n"