csv_cleanup.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. import os
  2. import re
  3. from datetime import datetime
  4. from pathlib import Path
  5. import plac
  6. MIN_AGE = datetime.now().timestamp() - 12 * 60 * 60
  7. def csv_cleanup(dirname: str = "misc/data"):
  8. if Path(dirname).is_file():
  9. csv_cleanup_file(Path(dirname))
  10. else:
  11. for csv_file in Path(dirname).glob("*.csv"):
  12. csv_cleanup_file(csv_file)
  13. def csv_cleanup_file(csv_file: Path):
  14. temp_file = Path(str(csv_file) + ".tmp")
  15. error_file = Path(str(csv_file) + ".err")
  16. file_mtime = csv_file.stat().st_mtime
  17. if file_mtime < MIN_AGE:
  18. return
  19. print(csv_file.name)
  20. with open(csv_file, "r", encoding="latin-1") as frh:
  21. with open(temp_file, "w", encoding="latin-1") as fwh:
  22. with open(error_file, "w", encoding="latin-1") as ewh:
  23. header = frh.readline()
  24. fwh.write(header)
  25. ewh.write(header)
  26. sep_count = header.count(";")
  27. buffer = ""
  28. buffer_count = 0
  29. solved_count = 0
  30. error_count = 0
  31. for line in frh.readlines():
  32. line = line.replace("\t", "")
  33. current_count = line.count(";")
  34. if buffer_count + current_count == sep_count:
  35. if buffer == "":
  36. fwh.write(line)
  37. continue
  38. fwh.write(buffer + line)
  39. print("*", end="")
  40. solved_count += 1
  41. buffer = ""
  42. buffer_count = 0
  43. elif buffer_count + current_count < sep_count:
  44. buffer += re.subn(r"[\r\n]+", "", line)[0]
  45. buffer_count = buffer.count(";")
  46. else:
  47. line2 = cleanup_line(buffer + line)
  48. if line2.count(";") == sep_count:
  49. fwh.write(line2)
  50. else:
  51. ewh.write(line2)
  52. print("-", end="")
  53. error_count += 1
  54. buffer = ""
  55. buffer_count = 0
  56. if error_count + solved_count > 0:
  57. print("")
  58. if error_count > 0:
  59. print(f"!! Anzahl Fehler: {error_count} !!")
  60. os.utime(temp_file, (file_mtime, file_mtime))
  61. csv_file.unlink()
  62. temp_file.rename(csv_file)
  63. def cleanup_line(line):
  64. res = []
  65. buffer = ""
  66. for col in line.strip("\r\n").split(";"):
  67. if '"' not in col:
  68. res.append(col)
  69. continue
  70. if col == '"':
  71. # special case
  72. if buffer == "":
  73. buffer = col
  74. else:
  75. res.append(buffer + col)
  76. buffer = ""
  77. continue
  78. if col[0] == '"' and col[-1] == '"':
  79. res.append(col)
  80. continue
  81. if col[0] == '"':
  82. buffer = col
  83. continue
  84. if col[-1] == '"':
  85. res.append(buffer + col)
  86. buffer = ""
  87. continue
  88. return ";".join(res) + "\n"
  89. if __name__ == "__main__":
  90. plac.call(csv_cleanup)
  91. # csv_cleanup("C:\\Users\\GAPS\\Desktop\\GuV_8_O21_csv.csv")