From 9b572ce00f9e3ec37a8b6654de6bbd2601093826 Mon Sep 17 00:00:00 2001 From: Bastien Boutonnet Date: Fri, 4 Sep 2020 09:46:33 +0200 Subject: [PATCH] fix: :bug: disable df casting, drop `''` cols from dupe check (#206) --- core/adapters/impl.py | 3 +-- core/utils.py | 47 ++++++++----------------------------------- pip-requirements.txt | 2 +- setup.py | 2 +- tests/sheets.yml | 16 +++++++++++++++ tests/test_utils.py | 12 +---------- 6 files changed, 28 insertions(+), 54 deletions(-) diff --git a/core/adapters/impl.py b/core/adapters/impl.py index bb14b141..a51a1a28 100644 --- a/core/adapters/impl.py +++ b/core/adapters/impl.py @@ -8,7 +8,6 @@ from core.exceptions import DatabaseError, TableDoesNotExist from core.logger import GLOBAL_LOGGER as logger from core.ui.printer import green, timed_message -from core.utils import cast_pandas_dtypes if TYPE_CHECKING: from core.adapters.connection import Connection @@ -46,7 +45,7 @@ def sqlalchemy_dtypes(dtypes_dict) -> dict: def upload(self, df: pandas.DataFrame, override_schema: str = str()): # cast columns - df = cast_pandas_dtypes(df, overwrite_dict=self.config.sheet_columns) + # ! temporarily deactivating df casting in pandas related to #205 & #204 dtypes_dict = self.sqlalchemy_dtypes(self.config.sheet_columns) # potenfially override target schema from config. 
diff --git a/core/utils.py b/core/utils.py index 73665d3f..6ea6d077 100644 --- a/core/utils.py +++ b/core/utils.py @@ -1,6 +1,6 @@ import collections from pathlib import Path -from typing import Optional, Tuple, Union +from typing import List, Optional, Tuple, Union from urllib.error import URLError import luddite @@ -8,12 +8,7 @@ from packaging.version import parse as semver_parse from core._version import __version__ -from core.exceptions import ( - ColumnNotFoundInDataFrame, - DuplicatedColumnsInSheet, - NearestFileNotFound, - UnsupportedDataTypeError, -) +from core.exceptions import ColumnNotFoundInDataFrame, DuplicatedColumnsInSheet, NearestFileNotFound from core.logger import GLOBAL_LOGGER as logger from core.ui.printer import yellow @@ -70,37 +65,6 @@ def find_nearest_dir_and_file( ) -def cast_pandas_dtypes(df: pandas.DataFrame, overwrite_dict: dict = dict()) -> pandas.DataFrame: - overwrite_dict = overwrite_dict.copy() - dtypes_map = dict( - varchar="object", - int="int64", - numeric="float64", - boolean="bool", - timestamp_ntz="datetime64[ns]", - date="datetime64[ns]", # this intentional pandas doesn't really have just dates. - ) - - # Check for type support - unsupported_dtypes = set(overwrite_dict.values()).difference(dtypes_map.keys()) - if unsupported_dtypes: - raise UnsupportedDataTypeError(f"{unsupported_dtypes} are currently not supported") - - # check overwrite col is in df - invalid_columns = set(overwrite_dict.keys()).difference(set(df.columns.tolist())) - if invalid_columns: - raise ColumnNotFoundInDataFrame(f"{invalid_columns} not in DataFrame. 
Check spelling?") - - # recode dict in pandas terms - for col, data_type in overwrite_dict.items(): - overwrite_dict.update({col: dtypes_map[data_type]}) - - # cast - df = df.astype(overwrite_dict) - logger.debug(f"Head of cast dataframe:\n {df.head()}") - return df - - def check_columns_in_df( df: pandas.DataFrame, columns: Union[list, str], @@ -127,7 +91,12 @@ def check_columns_in_df( def check_dupe_cols(columns: list, suppress_warning: bool = False) -> Optional[list]: """checks dupes in a list""" - dupes = [item for item, count in collections.Counter(columns).items() if count > 1] + columns_without_empty_strings: List[str] = list(filter(None, columns)) + dupes = [ + item + for item, count in collections.Counter(columns_without_empty_strings).items() + if count > 1 + ] if dupes and not suppress_warning: raise DuplicatedColumnsInSheet( f"Duplicate column names found in Google Sheet: {dupes}. Aborting. Fix your sheet." diff --git a/pip-requirements.txt b/pip-requirements.txt index 2c624341..523eef37 100644 --- a/pip-requirements.txt +++ b/pip-requirements.txt @@ -2,7 +2,7 @@ requests<2.23.0 gspread==3.3.0 sqlalchemy==1.3.16 cerberus==1.3.2 -pandas==1.0.4 +pandas==1.1.1 pyyaml==5.3.1 snowflake-sqlalchemy==1.2.3 oauth2client==4.1.3 diff --git a/setup.py b/setup.py index 13882ac2..16950be6 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ "gspread==3.3.0", "sqlalchemy==1.3.16", "cerberus==1.3.2", - "pandas==1.0.4", + "pandas==1.1.1", "pyyaml==5.3.1", "snowflake-sqlalchemy==1.2.3", "oauth2client==4.1.3", diff --git a/tests/sheets.yml b/tests/sheets.yml index 9ea05168..f0740c94 100644 --- a/tests/sheets.yml +++ b/tests/sheets.yml @@ -86,3 +86,19 @@ sheets: - sheet_name: sheet_with_no_schema sheet_key: sample target_table: sample + + - sheet_name: quality_assessment_scale + sheet_key: 1YyiUlZ4BNSADdO3LEz59gikEkmA3VmrktLOppgf-zqU + target_schema: ref_tables + target_table: quality_assessment_scale + columns: + - name: Needs_Improvement + datatype: int + - name: 
Acceptable + datatype: int + - name: Excellent + datatype: int + - name: N_A + datatype: int + - name: Fail + datatype: int diff --git a/tests/test_utils.py b/tests/test_utils.py index 5ded9193..5058a8aa 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,4 @@ -from .mockers import CAST_DF, TO_CAST_DF, generate_test_df +from .mockers import TO_CAST_DF, generate_test_df CASTING_DICT = { "col_int": "int", @@ -7,16 +7,6 @@ } -def test_cast_pandas_dtypes(): - from core.utils import cast_pandas_dtypes - - to_cast = generate_test_df(TO_CAST_DF) - cast_df = cast_pandas_dtypes(to_cast, CASTING_DICT) - expected_cast = generate_test_df(CAST_DF) - - assert cast_df.to_dict() == expected_cast.to_dict() - - def test_check_columns_in_df(): from core.utils import check_columns_in_df