csv_cleanup.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
  1. import os
  2. import re
  3. from datetime import datetime
  4. from pathlib import Path
  5. import plac
# Cut-off as a POSIX timestamp: 12 hours before the moment this module is
# loaded. csv_cleanup_file() skips every file whose modification time is
# older than this value. It is computed once, not per call.
MIN_AGE = datetime.now().timestamp() - 12 * 60 * 60
  7. def csv_cleanup(dirname: str = "misc/data"):
  8. if Path(dirname).is_file():
  9. csv_cleanup_file(Path(dirname))
  10. else:
  11. for csv_file in Path(dirname).glob("*.csv"):
  12. csv_cleanup_file(csv_file)
  13. def csv_cleanup_file(csv_file: Path):
  14. temp_file = Path(str(csv_file) + ".tmp")
  15. file_mtime = csv_file.stat().st_mtime
  16. if file_mtime < MIN_AGE:
  17. return
  18. print(csv_file.name)
  19. with open(csv_file, "r", encoding="latin-1") as frh:
  20. with open(temp_file, "w", encoding="latin-1") as fwh:
  21. header = frh.readline()
  22. fwh.write(header)
  23. sep_count = header.count(";")
  24. buffer = ""
  25. buffer_count = 0
  26. for line in frh.readlines():
  27. line = line.replace("\t", "")
  28. current_count = line.count(";")
  29. if buffer_count + current_count == sep_count:
  30. if buffer == "":
  31. fwh.write(line)
  32. continue
  33. fwh.write(buffer + line)
  34. print("*", end="")
  35. buffer = ""
  36. buffer_count = 0
  37. elif buffer_count + current_count < sep_count:
  38. buffer += re.subn(r"[\r\n]+", "", line)[0]
  39. buffer_count = buffer.count(";")
  40. else:
  41. line2 = cleanup_line(buffer + line)
  42. if line2.count(";") == sep_count:
  43. fwh.write(line2)
  44. else:
  45. print(f"Error: {line2}")
  46. buffer = ""
  47. buffer_count = 0
  48. print("")
  49. os.utime(temp_file, (file_mtime, file_mtime))
  50. csv_file.unlink()
  51. temp_file.rename(csv_file)
  52. def cleanup_line(line):
  53. res = []
  54. buffer = ""
  55. for col in line.strip("\r\n").split(";"):
  56. if '"' not in col:
  57. res.append(col)
  58. continue
  59. if col == '"':
  60. # special case
  61. if buffer == "":
  62. buffer = col
  63. else:
  64. res.append(buffer + col)
  65. buffer = ""
  66. continue
  67. if col[0] == '"' and col[-1] == '"':
  68. res.append(col)
  69. continue
  70. if col[0] == '"':
  71. buffer = col
  72. continue
  73. if col[-1] == '"':
  74. res.append(buffer + col)
  75. buffer = ""
  76. continue
  77. return ";".join(res) + "\n"
# Script entry point: plac.call runs csv_cleanup with the arguments given on
# the command line.
if __name__ == "__main__":
    plac.call(csv_cleanup)
    # Debugging aid: call csv_cleanup("<path to one CSV file>") directly
    # instead of going through plac.