GlobalCube
/
gctools


			
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
							import re

import pandas as pd

data_types = {
    "int64": "varchar(20)",
    "float64": "decimal(18,8)",
}

int_values = ["Menge", "Anzahl"]
decimal_values = ["Betrag"]


def get_column_types(csv_file, header=None):
    skip = 1 if header else 0
    df = pd.read_csv(
        csv_file,
        sep=";",
        decimal=",",
        encoding="latin-1",
        nrows=2000,
        names=header,
        skiprows=skip,
    )

    col_types = dict([(c, get_type(df[c])) for c in df.columns])
    return col_types


def is_datetime(value: str):
    return re.match(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}", value)


def is_date(value: str):
    return re.match(r"\d{4}-\d{2}-\d{2}", value)


def get_type(df_col: pd.Series) -> str:
    if str(df_col.dtype) == "float64":
        if all(df_col.isna()):
            return "varchar(255)"
        return "decimal(18,8)"

    if str(df_col.dtype) == "int64":
        for entry in decimal_values:
            if entry in df_col.name:
                return "decimal(18,8)"
        for entry in int_values:
            if entry in df_col.name:
                return "int"
        return "varchar(20)"

    if all([is_datetime(str(value)) for value in df_col]):
        return "datetime"

    if all([is_date(str(value)) for value in df_col]):
        return "date"

    max_len = max([len(str(value)) for value in df_col])
    if max_len < 15:
        return "varchar(20)"
    if max_len < 35:
        return "varchar(50)"
    if max_len < 85:
        return "varchar(100)"
    return "varchar(255)"


if __name__ == "__main__":
    get_column_types("C:\\GlobalCube_Entwicklung\\Export\\belege_eds_stk_ohne_service_aw.csv")