
Commit

fix: 🐛 disable df casting, drop '' cols from dupe check (#206)
bastienboutonnet authored Sep 4, 2020
1 parent 1114b95 commit 9b572ce
Showing 6 changed files with 28 additions and 54 deletions.
3 changes: 1 addition & 2 deletions core/adapters/impl.py
@@ -8,7 +8,6 @@
from core.exceptions import DatabaseError, TableDoesNotExist
from core.logger import GLOBAL_LOGGER as logger
from core.ui.printer import green, timed_message
from core.utils import cast_pandas_dtypes

if TYPE_CHECKING:
from core.adapters.connection import Connection
@@ -46,7 +45,7 @@ def sqlalchemy_dtypes(dtypes_dict) -> dict:

def upload(self, df: pandas.DataFrame, override_schema: str = str()):
# cast columns
df = cast_pandas_dtypes(df, overwrite_dict=self.config.sheet_columns)
# ! temporarily deactivating df casting in pandas related to #205 & #204
dtypes_dict = self.sqlalchemy_dtypes(self.config.sheet_columns)

# potentially override target schema from config.
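With this change the pandas-side cast (previously done by the cast_pandas_dtypes helper, removed from core/utils.py below) is skipped entirely, and the upload path relies only on the SQLAlchemy dtype mapping built from the sheet column config. A minimal sketch of that flow, with assumed names (upload_sketch, engine) standing in for the project's actual adapter API:

import pandas
import sqlalchemy

def upload_sketch(
    df: pandas.DataFrame,
    table: str,
    engine: sqlalchemy.engine.Engine,
    sqlalchemy_dtypes: dict,
) -> None:
    # No df.astype() beforehand: column types are enforced at write time through
    # the dtype mapping handed to to_sql (casting disabled pending #205 and #204).
    df.to_sql(table, con=engine, dtype=sqlalchemy_dtypes, index=False, if_exists="replace")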
47 changes: 8 additions & 39 deletions core/utils.py
@@ -1,19 +1,14 @@
import collections
from pathlib import Path
from typing import Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
from urllib.error import URLError

import luddite
import pandas
from packaging.version import parse as semver_parse

from core._version import __version__
from core.exceptions import (
ColumnNotFoundInDataFrame,
DuplicatedColumnsInSheet,
NearestFileNotFound,
UnsupportedDataTypeError,
)
from core.exceptions import ColumnNotFoundInDataFrame, DuplicatedColumnsInSheet, NearestFileNotFound
from core.logger import GLOBAL_LOGGER as logger
from core.ui.printer import yellow

@@ -70,37 +65,6 @@ def find_nearest_dir_and_file(
)


def cast_pandas_dtypes(df: pandas.DataFrame, overwrite_dict: dict = dict()) -> pandas.DataFrame:
overwrite_dict = overwrite_dict.copy()
dtypes_map = dict(
varchar="object",
int="int64",
numeric="float64",
boolean="bool",
timestamp_ntz="datetime64[ns]",
date="datetime64[ns]",  # this is intentional; pandas doesn't really have just dates.
)

# Check for type support
unsupported_dtypes = set(overwrite_dict.values()).difference(dtypes_map.keys())
if unsupported_dtypes:
raise UnsupportedDataTypeError(f"{unsupported_dtypes} are currently not supported")

# check overwrite col is in df
invalid_columns = set(overwrite_dict.keys()).difference(set(df.columns.tolist()))
if invalid_columns:
raise ColumnNotFoundInDataFrame(f"{invalid_columns} not in DataFrame. Check spelling?")

# recode dict in pandas terms
for col, data_type in overwrite_dict.items():
overwrite_dict.update({col: dtypes_map[data_type]})

# cast
df = df.astype(overwrite_dict)
logger.debug(f"Head of cast dataframe:\n {df.head()}")
return df


def check_columns_in_df(
df: pandas.DataFrame,
columns: Union[list, str],
@@ -127,7 +91,12 @@ def check_columns_in_df(

def check_dupe_cols(columns: list, suppress_warning: bool = False) -> Optional[list]:
"""checks dupes in a list"""
dupes = [item for item, count in collections.Counter(columns).items() if count > 1]
columns_without_empty_strings: List[str] = list(filter(None, columns))
dupes = [
item
for item, count in collections.Counter(columns_without_empty_strings).items()
if count > 1
]
if dupes and not suppress_warning:
raise DuplicatedColumnsInSheet(
f"Duplicate column names found in Google Sheet: {dupes}. Aborting. Fix your sheet."
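The duplicate check now ignores empty-string headers (the blank cells an unnamed sheet column can produce), so only genuinely repeated header names raise DuplicatedColumnsInSheet. A small self-contained illustration of the new behaviour, using a hypothetical check_dupes helper that mirrors the logic above:

import collections
from typing import List

def check_dupes(columns: list) -> List[str]:
    # Blank headers are dropped before counting, so they can no longer be flagged as duplicates.
    non_empty: List[str] = list(filter(None, columns))
    return [item for item, count in collections.Counter(non_empty).items() if count > 1]

print(check_dupes(["date", "", "", "amount"]))       # [] -- blanks are ignored
print(check_dupes(["date", "date", "", "amount"]))   # ['date'] -- real duplicates still caught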
2 changes: 1 addition & 1 deletion pip-requirements.txt
@@ -2,7 +2,7 @@ requests<2.23.0
gspread==3.3.0
sqlalchemy==1.3.16
cerberus==1.3.2
pandas==1.0.4
pandas==1.1.1
pyyaml==5.3.1
snowflake-sqlalchemy==1.2.3
oauth2client==4.1.3
2 changes: 1 addition & 1 deletion setup.py
@@ -28,7 +28,7 @@
"gspread==3.3.0",
"sqlalchemy==1.3.16",
"cerberus==1.3.2",
"pandas==1.0.4",
"pandas==1.1.1",
"pyyaml==5.3.1",
"snowflake-sqlalchemy==1.2.3",
"oauth2client==4.1.3",
16 changes: 16 additions & 0 deletions tests/sheets.yml
@@ -86,3 +86,19 @@ sheets:
- sheet_name: sheet_with_no_schema
sheet_key: sample
target_table: sample

- sheet_name: quality_assessment_scale
sheet_key: 1YyiUlZ4BNSADdO3LEz59gikEkmA3VmrktLOppgf-zqU
target_schema: ref_tables
target_table: quality_assessment_scale
columns:
- name: Needs_Improvement
datatype: int
- name: Acceptable
datatype: int
- name: Excellent
datatype: int
- name: N_A
datatype: int
- name: Fail
datatype: int
12 changes: 1 addition & 11 deletions tests/test_utils.py
@@ -1,4 +1,4 @@
from .mockers import CAST_DF, TO_CAST_DF, generate_test_df
from .mockers import TO_CAST_DF, generate_test_df

CASTING_DICT = {
"col_int": "int",
@@ -7,16 +7,6 @@
}


def test_cast_pandas_dtypes():
from core.utils import cast_pandas_dtypes

to_cast = generate_test_df(TO_CAST_DF)
cast_df = cast_pandas_dtypes(to_cast, CASTING_DICT)
expected_cast = generate_test_df(CAST_DF)

assert cast_df.to_dict() == expected_cast.to_dict()


def test_check_columns_in_df():
from core.utils import check_columns_in_df

