From e0be0d8610cb85dd433e745f2df9afea57624865 Mon Sep 17 00:00:00 2001 From: Bastien Boutonnet Date: Sun, 11 Oct 2020 13:40:03 +0200 Subject: [PATCH] fix: :bug: reintroduce df casting, ignore ints (#221) --- sheetwork/core/adapters/impl.py | 6 +++--- sheetwork/core/utils.py | 34 +++++++++++++++++++++++++++++++++ tests/mockers.py | 3 ++- tests/test_utils.py | 12 +++++++++++- 4 files changed, 50 insertions(+), 5 deletions(-) diff --git a/sheetwork/core/adapters/impl.py b/sheetwork/core/adapters/impl.py index 1122ed1d..17a6d9c0 100644 --- a/sheetwork/core/adapters/impl.py +++ b/sheetwork/core/adapters/impl.py @@ -4,7 +4,7 @@ import pandas # ! temporarily deactivatiing df casting in pandas related to #205 & #204 -# from core.utils import cast_pandas_dtypes +from sheetwork.core.utils import cast_pandas_dtypes from sheetwork.core.adapters.base.impl import BaseSQLAdapter from sheetwork.core.adapters.connection import SnowflakeConnection from sheetwork.core.config.config import ConfigLoader @@ -37,8 +37,8 @@ def close_connection(self) -> None: def upload(self, df: pandas.DataFrame, override_schema: str = str()) -> None: # cast columns - # ! temporarily deactivatiing df casting in pandas related to #205 & #204 - # df = cast_pandas_dtypes(df, overwrite_dict=self.config.sheet_columns) + # !: note integer conversion doesn't actually happen it is left as a str see #204, #205 + df = cast_pandas_dtypes(df, overwrite_dict=self.config.sheet_columns) dtypes_dict = self.sqlalchemy_dtypes(self.config.sheet_columns) # potenfially override target schema from config. diff --git a/sheetwork/core/utils.py b/sheetwork/core/utils.py index 4ad2a247..9ceb2d1a 100644 --- a/sheetwork/core/utils.py +++ b/sheetwork/core/utils.py @@ -12,6 +12,7 @@ ColumnNotFoundInDataFrame, DuplicatedColumnsInSheet, NearestFileNotFound, + UnsupportedDataTypeError, ) from sheetwork.core.logger import GLOBAL_LOGGER as logger from sheetwork.core.ui.printer import yellow @@ -127,3 +128,36 @@ def check_and_compare_version(external_version: Optional[str] = str()) -> bool: except URLError: return False + + +def cast_pandas_dtypes(df: pandas.DataFrame, overwrite_dict: dict = dict()) -> pandas.DataFrame: + overwrite_dict = overwrite_dict.copy() + dtypes_map = dict( + varchar="object", + # this is intentional in case of nulls. currently pandas doesn't play well with converting mixed types + # see https://github.com/bastienboutonnet/sheetwork/issues/204 for more details + int="object", + numeric="float64", + boolean="bool", + timestamp_ntz="datetime64[ns]", + date="datetime64[ns]", # this intentional pandas doesn't really have just dates. + ) + + # Check for type support + unsupported_dtypes = set(overwrite_dict.values()).difference(dtypes_map.keys()) + if unsupported_dtypes: + raise UnsupportedDataTypeError(f"{unsupported_dtypes} are currently not supported") + + # check overwrite col is in df + invalid_columns = set(overwrite_dict.keys()).difference(set(df.columns.tolist())) + if invalid_columns: + raise ColumnNotFoundInDataFrame(f"{invalid_columns} not in DataFrame. Check spelling?") + + # recode dict in pandas terms + for col, data_type in overwrite_dict.items(): + overwrite_dict.update({col: dtypes_map[data_type]}) + + # cast + df = df.astype(overwrite_dict) + logger.debug(f"Head of cast dataframe:\n {df.head()}") + return df diff --git a/tests/mockers.py b/tests/mockers.py index d4f5f279..42eb022a 100644 --- a/tests/mockers.py +++ b/tests/mockers.py @@ -58,7 +58,8 @@ } CAST_DF = { - "col_int": {0: 1, 1: 2, 2: 32}, + # this non conversion to int is intentional until we have a better fix see #205, #204 + "col_int": {0: "1", 1: "2", 2: "32"}, "col_varchar": {0: "foo", 1: "bar", 2: "fizz"}, "created_date": { 0: Timestamp("2019-01-01 00:00:00"), diff --git a/tests/test_utils.py b/tests/test_utils.py index 43eb16d4..baa2a1e5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,4 @@ -from .mockers import TO_CAST_DF, generate_test_df +from .mockers import CAST_DF, TO_CAST_DF, generate_test_df CASTING_DICT = { "col_int": "int", @@ -36,3 +36,13 @@ def test_check_and_compare_version(mocker): dummy_version = "0.0.0" needs_update = check_and_compare_version(dummy_version) assert needs_update is True + + +def test_cast_pandas_dtypes(): + from sheetwork.core.utils import cast_pandas_dtypes + + to_cast = generate_test_df(TO_CAST_DF) + cast_df = cast_pandas_dtypes(to_cast, CASTING_DICT) + expected_cast = generate_test_df(CAST_DF) + + assert cast_df.to_dict() == expected_cast.to_dict()