Skip to content

Commit

Permalink
fix: 🐛 reintroduce df casting, ignore ints (#221)
Browse files Browse the repository at this point in the history
  • Loading branch information
bastienboutonnet authored Oct 11, 2020
1 parent 8c6568a commit e0be0d8
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 5 deletions.
6 changes: 3 additions & 3 deletions sheetwork/core/adapters/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas

# ! temporarily deactivatiing df casting in pandas related to #205 & #204
# from core.utils import cast_pandas_dtypes
from sheetwork.core.utils import cast_pandas_dtypes
from sheetwork.core.adapters.base.impl import BaseSQLAdapter
from sheetwork.core.adapters.connection import SnowflakeConnection
from sheetwork.core.config.config import ConfigLoader
Expand Down Expand Up @@ -37,8 +37,8 @@ def close_connection(self) -> None:

def upload(self, df: pandas.DataFrame, override_schema: str = str()) -> None:
# cast columns
# ! temporarily deactivatiing df casting in pandas related to #205 & #204
# df = cast_pandas_dtypes(df, overwrite_dict=self.config.sheet_columns)
# !: note that integer conversion doesn't actually happen; ints are left as str, see #204, #205
df = cast_pandas_dtypes(df, overwrite_dict=self.config.sheet_columns)
dtypes_dict = self.sqlalchemy_dtypes(self.config.sheet_columns)

# potentially override target schema from config.
Expand Down
34 changes: 34 additions & 0 deletions sheetwork/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
ColumnNotFoundInDataFrame,
DuplicatedColumnsInSheet,
NearestFileNotFound,
UnsupportedDataTypeError,
)
from sheetwork.core.logger import GLOBAL_LOGGER as logger
from sheetwork.core.ui.printer import yellow
Expand Down Expand Up @@ -127,3 +128,36 @@ def check_and_compare_version(external_version: Optional[str] = str()) -> bool:

except URLError:
return False


def cast_pandas_dtypes(
    df: pandas.DataFrame, overwrite_dict: Optional[dict] = None
) -> pandas.DataFrame:
    """Cast DataFrame columns to the pandas dtypes matching the requested database types.

    Args:
        df: DataFrame whose columns should be cast.
        overwrite_dict: Mapping of column name to database type name. Supported type
            names are ``varchar``, ``int``, ``numeric``, ``boolean``, ``timestamp_ntz``
            and ``date``. The mapping is never mutated. When omitted or empty, no
            column dtype is changed.

    Returns:
        The result of ``df.astype`` applied with the translated pandas dtypes.

    Raises:
        UnsupportedDataTypeError: If a requested type name has no pandas equivalent in
            the internal mapping.
        ColumnNotFoundInDataFrame: If a column named in ``overwrite_dict`` is not a
            column of ``df``.
    """
    # A ``None`` sentinel replaces the former shared ``dict()`` default; working on a
    # private copy keeps the caller's mapping untouched.
    requested_types = dict(overwrite_dict) if overwrite_dict else {}

    dtypes_map = {
        "varchar": "object",
        # This is intentional in case of nulls: pandas currently doesn't play well with
        # converting mixed types, so ints are left untouched as objects.
        # See https://github.com/bastienboutonnet/sheetwork/issues/204 for more details.
        "int": "object",
        "numeric": "float64",
        # NOTE(review): astype("bool") on string values appears to follow Python
        # truthiness (any non-empty string, including "False", becomes True) -- confirm
        # that upstream values are real booleans.
        "boolean": "bool",
        "timestamp_ntz": "datetime64[ns]",
        # This is intentional: pandas doesn't really have a date-only dtype.
        "date": "datetime64[ns]",
    }

    # Check for type support.
    unsupported_dtypes = set(requested_types.values()).difference(dtypes_map)
    if unsupported_dtypes:
        raise UnsupportedDataTypeError(f"{unsupported_dtypes} are currently not supported")

    # Check that every column to overwrite exists in the DataFrame.
    invalid_columns = set(requested_types).difference(df.columns.tolist())
    if invalid_columns:
        raise ColumnNotFoundInDataFrame(f"{invalid_columns} not in DataFrame. Check spelling?")

    # Translate the database type names into pandas dtypes. A new mapping is built
    # instead of rewriting the dict that is being iterated.
    pandas_dtypes = {col: dtypes_map[data_type] for col, data_type in requested_types.items()}

    # Cast.
    df = df.astype(pandas_dtypes)
    logger.debug(f"Head of cast dataframe:\n {df.head()}")
    return df
3 changes: 2 additions & 1 deletion tests/mockers.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@
}

CAST_DF = {
"col_int": {0: 1, 1: 2, 2: 32},
# this non conversion to int is intentional until we have a better fix see #205, #204
"col_int": {0: "1", 1: "2", 2: "32"},
"col_varchar": {0: "foo", 1: "bar", 2: "fizz"},
"created_date": {
0: Timestamp("2019-01-01 00:00:00"),
Expand Down
12 changes: 11 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .mockers import TO_CAST_DF, generate_test_df
from .mockers import CAST_DF, TO_CAST_DF, generate_test_df

CASTING_DICT = {
"col_int": "int",
Expand Down Expand Up @@ -36,3 +36,13 @@ def test_check_and_compare_version(mocker):
dummy_version = "0.0.0"
needs_update = check_and_compare_version(dummy_version)
assert needs_update is True


def test_cast_pandas_dtypes():
    """Check that casting the raw fixture frame yields the expected cast fixture frame."""
    from sheetwork.core.utils import cast_pandas_dtypes

    # Build the input frame and cast it first, then build the expected frame.
    raw_frame = generate_test_df(TO_CAST_DF)
    actual = cast_pandas_dtypes(raw_frame, CASTING_DICT)
    expected = generate_test_df(CAST_DF)

    actual_as_dict = actual.to_dict()
    expected_as_dict = expected.to_dict()
    assert actual_as_dict == expected_as_dict

0 comments on commit e0be0d8

Please sign in to comment.