csv_column_types.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. import re
  2. import pandas as pd
  3. data_types = {
  4. "int64": "varchar(20)",
  5. "float64": "decimal(18,8)",
  6. }
  7. int_values = ["Menge", "Anzahl"]
  8. decimal_values = ["Betrag"]
  9. def get_column_types(csv_file, header=None):
  10. skip = 1 if header else 0
  11. df = pd.read_csv(
  12. csv_file,
  13. sep=";",
  14. decimal=",",
  15. encoding="latin-1",
  16. nrows=2000,
  17. names=header,
  18. skiprows=skip,
  19. )
  20. col_types = dict([(c, get_type(df[c])) for c in df.columns])
  21. return col_types
  22. def is_datetime(value: str):
  23. return re.match(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}", value)
  24. def is_date(value: str):
  25. return re.match(r"\d{4}-\d{2}-\d{2}", value)
  26. def get_type(df_col: pd.Series) -> str:
  27. if str(df_col.dtype) == "float64":
  28. if all(df_col.isna()):
  29. return "varchar(255)"
  30. return "decimal(18,8)"
  31. if str(df_col.dtype) == "int64":
  32. for entry in decimal_values:
  33. if entry in df_col.name:
  34. return "decimal(18,8)"
  35. for entry in int_values:
  36. if entry in df_col.name:
  37. return "int"
  38. return "varchar(20)"
  39. if all([is_datetime(str(value)) for value in df_col]):
  40. return "datetime"
  41. if all([is_date(str(value)) for value in df_col]):
  42. return "date"
  43. max_len = max([len(str(value)) for value in df_col])
  44. if max_len < 15:
  45. return "varchar(20)"
  46. if max_len < 35:
  47. return "varchar(50)"
  48. if max_len < 85:
  49. return "varchar(100)"
  50. return "varchar(255)"
  51. if __name__ == "__main__":
  52. get_column_types("C:\\GlobalCube_Entwicklung\\Export\\belege_eds_stk_ohne_service_aw.csv")