Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: 🐛 reintroduce df casting, ignore ints #221

Merged
merged 2 commits into from
Oct 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions sheetwork/core/adapters/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas

# ! temporarily deactivating df casting in pandas related to #205 & #204
# from core.utils import cast_pandas_dtypes
from sheetwork.core.utils import cast_pandas_dtypes
from sheetwork.core.adapters.base.impl import BaseSQLAdapter
from sheetwork.core.adapters.connection import SnowflakeConnection
from sheetwork.core.config.config import ConfigLoader
Expand Down Expand Up @@ -37,8 +37,8 @@ def close_connection(self) -> None:

def upload(self, df: pandas.DataFrame, override_schema: str = str()) -> None:
# cast columns
# ! temporarily deactivating df casting in pandas related to #205 & #204
# df = cast_pandas_dtypes(df, overwrite_dict=self.config.sheet_columns)
# !: note integer conversion doesn't actually happen; ints are left as str, see #204, #205
df = cast_pandas_dtypes(df, overwrite_dict=self.config.sheet_columns)
dtypes_dict = self.sqlalchemy_dtypes(self.config.sheet_columns)

# potentially override target schema from config.
Expand Down
34 changes: 34 additions & 0 deletions sheetwork/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
ColumnNotFoundInDataFrame,
DuplicatedColumnsInSheet,
NearestFileNotFound,
UnsupportedDataTypeError,
)
from sheetwork.core.logger import GLOBAL_LOGGER as logger
from sheetwork.core.ui.printer import yellow
Expand Down Expand Up @@ -127,3 +128,36 @@ def check_and_compare_version(external_version: Optional[str] = str()) -> bool:

except URLError:
return False


def cast_pandas_dtypes(
    df: pandas.DataFrame, overwrite_dict: Optional[dict] = None
) -> pandas.DataFrame:
    """Cast DataFrame columns to the pandas dtypes matching their declared database types.

    Args:
        df: DataFrame whose columns should be cast.
        overwrite_dict: Mapping of column name to a database type name. Supported type
            names are ``varchar``, ``int``, ``numeric``, ``boolean``, ``timestamp_ntz``
            and ``date``. The mapping is never mutated. ``None`` or an empty mapping
            requests no casts.

    Returns:
        The DataFrame produced by ``df.astype`` with the requested casts applied.

    Raises:
        UnsupportedDataTypeError: If a requested type name is not supported.
        ColumnNotFoundInDataFrame: If a requested column is not present in ``df``.
    """
    # Work on a private copy so the caller's mapping is left untouched; a ``None``
    # default avoids sharing one mutable dict object across calls.
    requested = dict(overwrite_dict) if overwrite_dict else {}

    dtypes_map = dict(
        varchar="object",
        # "object" is intentional for ints: with nulls present pandas does not play well
        # with converting mixed types, so integer values are left as they are.
        # see https://github.com/bastienboutonnet/sheetwork/issues/204 for more details
        int="object",
        numeric="float64",
        boolean="bool",
        timestamp_ntz="datetime64[ns]",
        date="datetime64[ns]",  # intentional: pandas doesn't really have a date-only dtype.
    )

    # Check for type support
    unsupported_dtypes = set(requested.values()).difference(dtypes_map)
    if unsupported_dtypes:
        raise UnsupportedDataTypeError(f"{unsupported_dtypes} are currently not supported")

    # Check that every column to overwrite is in df
    invalid_columns = set(requested).difference(df.columns)
    if invalid_columns:
        raise ColumnNotFoundInDataFrame(f"{invalid_columns} not in DataFrame. Check spelling?")

    # Recode the requested types in pandas terms
    pandas_dtypes = {col: dtypes_map[data_type] for col, data_type in requested.items()}

    # cast
    df = df.astype(pandas_dtypes)
    logger.debug(f"Head of cast dataframe:\n {df.head()}")
    return df
3 changes: 2 additions & 1 deletion tests/mockers.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@
}

CAST_DF = {
"col_int": {0: 1, 1: 2, 2: 32},
# this non-conversion to int is intentional until we have a better fix, see #205, #204
"col_int": {0: "1", 1: "2", 2: "32"},
"col_varchar": {0: "foo", 1: "bar", 2: "fizz"},
"created_date": {
0: Timestamp("2019-01-01 00:00:00"),
Expand Down
12 changes: 11 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .mockers import TO_CAST_DF, generate_test_df
from .mockers import CAST_DF, TO_CAST_DF, generate_test_df

CASTING_DICT = {
"col_int": "int",
Expand Down Expand Up @@ -36,3 +36,13 @@ def test_check_and_compare_version(mocker):
dummy_version = "0.0.0"
needs_update = check_and_compare_version(dummy_version)
assert needs_update is True


def test_cast_pandas_dtypes():
    """Casting the TO_CAST_DF fixture with CASTING_DICT must match the CAST_DF fixture."""
    from sheetwork.core.utils import cast_pandas_dtypes

    # Build the input frame and cast it before building the expected frame.
    actual = cast_pandas_dtypes(generate_test_df(TO_CAST_DF), CASTING_DICT)
    expected = generate_test_df(CAST_DF)

    # Compare as plain dicts, i.e. on cell values rather than on frame metadata.
    assert actual.to_dict() == expected.to_dict()