release 0.7.2 (#631)

* Strategies should not rely on pandas dtype aliases (#620)
* add test for strategy with pandas.DatetimeTZDtype using a datetime.tzinfo
* avoid coercing with string alias in strategies
* support timedelta in data synthesis strats (#621)
* fix multiindex error reporting (#622)
* Pin pylint (#629)
* bump pre-commit pylint version
* pin pylint
* remove setuptools pins
* setup.py setuptools
* add back setuptools dep
* update ci build
* update build
* update nox build
* update nox build
* exclude np.float128 type registration in MacM1 (#624)
* exclude np.float128 type registration in MacM1
* replace windows/mac m1 checks with float128 check
* fix numpy_pandas_coercible bug dealing with single element (#626)
* fix numpy_pandas_coercible bug dealing with single element
* add test
* remove empty case
* update pylint (#630)
* unpin pylint, remove setuptools constraint
* bump cache
* install simpleeval in noxfile
* re-pin pylint
* fix lint
* nox uses setuptools < 58.0.0

Co-authored-by: Jean-Francois Zinque <[email protected]>
unionai-oss · Sep 22, 2021 · f4dbaca · f4dbaca
1 parent f0ddcbf
commit f4dbaca
Show file tree

Hide file tree

Showing 21 changed files with 221 additions and 101 deletions.
diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
@@ -17,7 +17,7 @@ env:
   DEFAULT_PYTHON: 3.8
   CI: "true"
   # Increase this value to reset cache if environment.yml has not changed
-  CACHE_VERSION: 4
+  CACHE_VERSION: 5
 
 jobs:
 

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -41,7 +41,7 @@ repos:
         args: ["--line-length=79"]
 
   - repo: https://github.com/pycqa/pylint
-    rev: v2.10.2
+    rev: v2.11.1
     hooks:
       - id: pylint
         args: ["--disable=import-error"]

diff --git a/environment.yml b/environment.yml
@@ -26,13 +26,12 @@ dependencies:
   - isort >= 5.7.0
   - codecov
   - mypy >= 0.902 # mypy no longer bundle stubs for third-party libraries
-  - pylint >= 2.7.2
+  - pylint = v2.11.1
   - pytest
   - pytest-cov
   - pytest-xdist
   - pytest-asyncio
   - xdoctest
-  - setuptools < 58.0.0
   - nox = 2020.12.31 # pinning due to UnicodeDecodeError, see https://github.com/pandera-dev/pandera/pull/504/checks?check_run_id=2841360122
   - importlib_metadata # required if python < 3.8
 

diff --git a/noxfile.py b/noxfile.py
@@ -187,6 +187,10 @@ def install_extras(
     if extra == "core":
         specs.append(REQUIRES["all"]["hypothesis"])
 
+    # this is a temporary measure to install setuptools due to this issue:
+    # https://github.com/pandera-dev/pandera/pull/602#issuecomment-915622823
+    session.install("setuptools < 58.0.0")
+
     # CI installs conda dependencies, so only run this for local runs
     if (
         isinstance(session.virtualenv, nox.virtualenv.CondaEnv)

diff --git a/pandera/engines/numpy_engine.py b/pandera/engines/numpy_engine.py
@@ -5,19 +5,17 @@
 import dataclasses
 import datetime
 import inspect
-import platform
 import warnings
 from typing import Any, Dict, List, Union
 
 import numpy as np
 
 from .. import dtypes, errors
 from ..dtypes import immutable
+from ..system import FLOAT_128_AVAILABLE
 from . import engine, utils
 from .type_aliases import PandasObject
 
-WINDOWS_PLATFORM = platform.system() == "Windows"
-
 
 @immutable(init=True)
 class DataType(dtypes.DataType):
@@ -226,13 +224,16 @@ class UInt8(UInt16):
 _float_equivalents = _build_number_equivalents(
     builtin_name="float",
     pandera_name="Float",
-    sizes=[64, 32, 16] if WINDOWS_PLATFORM else [128, 64, 32, 16],
+    sizes=[128, 64, 32, 16] if FLOAT_128_AVAILABLE else [64, 32, 16],
 )
 
 
-if not WINDOWS_PLATFORM:
-    # not supported in windows
+if FLOAT_128_AVAILABLE:
+    # not supported in windows:
     # https://github.com/winpython/winpython/issues/613
+    #
+    # or Mac M1:
+    # https://github.com/pandera-dev/pandera/issues/623
     @Engine.register_dtype(equivalents=_float_equivalents[128])
     @immutable
     class Float128(DataType, dtypes.Float128):
@@ -276,11 +277,11 @@ class Float16(Float32):
 _complex_equivalents = _build_number_equivalents(
     builtin_name="complex",
     pandera_name="Complex",
-    sizes=[128, 64] if WINDOWS_PLATFORM else [256, 128, 64],
+    sizes=[256, 128, 64] if FLOAT_128_AVAILABLE else [128, 64],
 )
 
 
-if not WINDOWS_PLATFORM:
+if FLOAT_128_AVAILABLE:
     # not supported in windows
     # https://github.com/winpython/winpython/issues/613
     @Engine.register_dtype(equivalents=_complex_equivalents[256])

diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py
@@ -10,7 +10,6 @@
 import dataclasses
 import datetime
 import inspect
-import platform
 import warnings
 from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional, Union
@@ -21,6 +20,7 @@
 
 from .. import dtypes, errors
 from ..dtypes import immutable
+from ..system import FLOAT_128_AVAILABLE
 from . import engine, numpy_engine, utils
 from .type_aliases import PandasDataType, PandasExtensionType, PandasObject
 
@@ -39,9 +39,6 @@ def pandas_version():
     from typing_extensions import Literal  # type: ignore
 
 
-WINDOWS_PLATFORM = platform.system() == "Windows"
-
-
 def is_extension_dtype(pd_dtype: PandasDataType) -> bool:
     """Check if a value is a pandas extension type or instance of one."""
     return isinstance(pd_dtype, PandasExtensionType) or (
@@ -343,7 +340,7 @@ class UINT8(UINT16):
 _register_numpy_numbers(
     builtin_name="float",
     pandera_name="Float",
-    sizes=[64, 32, 16] if WINDOWS_PLATFORM else [128, 64, 32, 16],
+    sizes=[128, 64, 32, 16] if FLOAT_128_AVAILABLE else [64, 32, 16],
 )
 
 # ###############################################################################
@@ -353,7 +350,7 @@ class UINT8(UINT16):
 _register_numpy_numbers(
     builtin_name="complex",
     pandera_name="Complex",
-    sizes=[128, 64] if WINDOWS_PLATFORM else [256, 128, 64],
+    sizes=[256, 128, 64] if FLOAT_128_AVAILABLE else [128, 64],
 )
 
 # ###############################################################################

diff --git a/pandera/engines/utils.py b/pandera/engines/utils.py
@@ -30,7 +30,7 @@ def _coercible(series):
         except (ValueError, TypeError):
             return False
 
-    search_list = _bisect(series)
+    search_list = [series] if series.size == 1 else _bisect(series)
     failure_index = []
     while search_list:
         candidates = []

diff --git a/pandera/errors.py b/pandera/errors.py
@@ -171,9 +171,14 @@ def _parse_schema_errors(schema_errors: List[Dict[str, Any]]):
                     schema_context=err.schema.__class__.__name__,
                     check=check_identifier,
                     check_number=err.check_index,
-                    # explicitly wrap `column` in a list of the column key is
-                    # a tuple in the case of MultiIndex column names.
-                    column=[column] if isinstance(column, tuple) else column,
+                    # if the column key is a tuple (for MultiIndex column
+                    # names), explicitly wrap `column` in a list of the
+                    # same length as the number of failure cases.
+                    column=(
+                        [column] * err.failure_cases.shape[0]
+                        if isinstance(column, tuple)
+                        else column
+                    ),
                 )
                 check_failure_cases.append(failure_cases[column_order])
 

diff --git a/pandera/hypotheses.py b/pandera/hypotheses.py
@@ -210,8 +210,8 @@ def _relationships(self, relationship: Union[str, Callable]):
             relationship = self.RELATIONSHIPS[relationship]
         elif not callable(relationship):
             raise ValueError(
-                "expected relationship to be str or callable, found %s"
-                % type(relationship)
+                "expected relationship to be str or callable, found "
+                f"{type(relationship)}"
             )
         return relationship
 

diff --git a/pandera/io.py b/pandera/io.py
@@ -270,7 +270,7 @@ def from_yaml(yaml_schema):
     :returns: dataframe schema.
     """
     try:
-        with Path(yaml_schema).open("r") as f:
+        with Path(yaml_schema).open("r", encoding="utf-8") as f:
             serialized_schema = yaml.safe_load(f)
     except (TypeError, OSError):
         serialized_schema = yaml.safe_load(yaml_schema)
@@ -290,7 +290,7 @@ def _write_yaml(obj, stream):
         return yaml.safe_dump(obj, stream=stream, sort_keys=False)
 
     try:
-        with Path(stream).open("w") as f:
+        with Path(stream).open("w", encoding="utf-8") as f:
             _write_yaml(statistics, f)
     except (TypeError, OSError):
         return _write_yaml(statistics, stream)
@@ -437,7 +437,7 @@ def to_script(dataframe_schema, path_or_buf=None):
     if path_or_buf is None:
         return formatted_script
 
-    with Path(path_or_buf).open("w") as f:
+    with Path(path_or_buf).open("w", encoding="utf-8") as f:
         f.write(formatted_script)
 
 

diff --git a/pandera/schemas.py b/pandera/schemas.py
@@ -13,7 +13,7 @@
 import numpy as np
 import pandas as pd
 
-from . import constants, errors
+from . import errors
 from . import strategies as st
 from .checks import Check
 from .deprecations import deprecate_pandas_dtype
@@ -278,9 +278,9 @@ def dtypes(self) -> Dict[str, DataType]:
         ]
         if regex_columns:
             warnings.warn(
-                "Schema has columns specified as regex column names: %s "
-                "Use the `get_dtypes` to get the datatypes for these "
-                "columns." % regex_columns,
+                "Schema has columns specified as regex column names: "
+                f"{regex_columns}. Use the `get_dtypes` to get the datatypes "
+                "for these columns.",
                 UserWarning,
             )
         return {n: c.dtype for n, c in self.columns.items() if not c.regex}
@@ -460,11 +460,10 @@ def validate(
 
         if self._is_inferred:
             warnings.warn(
-                "This %s is an inferred schema that hasn't been "
+                f"This {type(self)} is an inferred schema that hasn't been "
                 "modified. It's recommended that you refine the schema "
                 "by calling `add_columns`, `remove_columns`, or "
-                "`update_columns` before using it to validate data."
-                % type(self),
+                "`update_columns` before using it to validate data.",
                 UserWarning,
             )
 
@@ -1744,10 +1743,9 @@ def validate(
 
         if self._is_inferred:
             warnings.warn(
-                "This %s is an inferred schema that hasn't been "
+                f"This {type(self)} is an inferred schema that hasn't been "
                 "modified. It's recommended that you refine the schema "
-                "by calling `set_checks` before using it to validate data."
-                % type(self),
+                "by calling `set_checks` before using it to validate data.",
                 UserWarning,
             )
 
@@ -1771,10 +1769,9 @@ def validate(
         )
 
         if self.name is not None and series.name != self._name:
-            msg = "Expected %s to have name '%s', found '%s'" % (
-                type(self),
-                self._name,
-                series.name,
+            msg = (
+                f"Expected {type(self)} to have name '{self._name}', found "
+                f"'{series.name}'"
             )
             error_handler.collect_error(
                 "wrong_field_name",
@@ -1790,9 +1787,10 @@ def validate(
         if not self._nullable:
             nulls = series.isna()
             if sum(nulls) > 0:
-                msg = "non-nullable series '%s' contains null values: %s" % (
-                    series.name,
-                    series[nulls].head(constants.N_FAILURE_CASES).to_dict(),
+                failed = series[nulls]
+                msg = (
+                    f"non-nullable series '{series.name}' contains null "
+                    f"values:\n{failed}"
                 )
                 error_handler.collect_error(
                     "series_contains_nulls",
@@ -1811,11 +1809,10 @@ def validate(
         if self._unique:
             duplicates = series.duplicated()
             if any(duplicates):
-                msg = "series '%s' contains duplicate values: %s" % (
-                    series.name,
-                    series[duplicates]
-                    .head(constants.N_FAILURE_CASES)
-                    .to_dict(),
+                failed = series[duplicates]
+                msg = (
+                    f"series '{series.name}' contains duplicate values:\n"
+                    f"{series[duplicates]}"
                 )
                 error_handler.collect_error(
                     "series_contains_duplicates",

diff --git a/pandera/strategies.py b/pandera/strategies.py
@@ -245,10 +245,6 @@ def _to_datetime(value) -> pd.DatetimeTZDtype:
         return st.builds(dtype.type, strategy, res)
 
 
-def _to_unix_timestamp(value: Any) -> int:
-    return pd.Timestamp(value).value
-
-
 def numpy_time_dtypes(
     dtype: Union[np.dtype, pd.DatetimeTZDtype], min_value=None, max_value=None
 ):
@@ -259,12 +255,14 @@ def numpy_time_dtypes(
     :param max_value: maximum value of the datatype to create
     :returns: ``hypothesis`` strategy
     """
-    min_value = (
-        MIN_DT_VALUE if min_value is None else _to_unix_timestamp(min_value)
-    )
-    max_value = (
-        MAX_DT_VALUE if max_value is None else _to_unix_timestamp(max_value)
-    )
+
+    def _to_unix(value: Any) -> int:
+        if dtype.type is np.timedelta64:
+            return pd.Timedelta(value).value
+        return pd.Timestamp(value).value
+
+    min_value = MIN_DT_VALUE if min_value is None else _to_unix(min_value)
+    max_value = MAX_DT_VALUE if max_value is None else _to_unix(max_value)
     return _datetime_strategy(dtype, st.integers(min_value, max_value))
 
 
@@ -831,7 +829,7 @@ def series_strategy(
         )
         .filter(lambda x: x.shape[0] > 0)
         .map(lambda x: x.rename(name))
-        .map(lambda x: x.astype(str(pandera_dtype)))
+        .map(lambda x: x.astype(pandera_dtype.type))
     )
     if nullable:
         strategy = null_field_masks(strategy)
@@ -918,7 +916,7 @@ def index_strategy(
         min_size=0 if size is None else size,
         max_size=size,
         unique=unique,
-    ).map(lambda x: x.astype(str(pandera_dtype)))
+    ).map(lambda x: x.astype(pandera_dtype.type))
     if name is not None:
         strategy = strategy.map(lambda index: index.rename(name))
     if nullable:
@@ -1068,12 +1066,11 @@ def _dataframe_strategy(draw):
         # override the column datatype with dataframe-level datatype if
         # specified
         col_dtypes = {
-            col_name: str(col.dtype)
+            col_name: col.dtype.type
             if pandera_dtype is None
-            else str(pandera_dtype)
+            else pandera_dtype.type
             for col_name, col in expanded_columns.items()
         }
-
         nullable_columns = {
             col_name: col.nullable
             for col_name, col in expanded_columns.items()
@@ -1132,7 +1129,7 @@ def multiindex_strategy(
     :param pandera_dtype: :class:`pandera.dtypes.DataType` instance.
     :param strategy: an optional hypothesis strategy. If specified, the
         pandas dtype strategy will be chained onto this strategy.
-    :param indexes: a list of :class:`~pandera.schema_components.Inded`
+    :param indexes: a list of :class:`~pandera.schema_components.Index`
         objects.
     :param size: number of elements in the Series.
     :returns: ``hypothesis`` strategy.
@@ -1145,7 +1142,7 @@ def multiindex_strategy(
         )
     indexes = [] if indexes is None else indexes
     index_dtypes = {
-        index.name if index.name is not None else i: str(index.dtype)
+        index.name if index.name is not None else i: index.dtype.type
         for i, index in enumerate(indexes)
     }
     nullable_index = {

diff --git a/pandera/system.py b/pandera/system.py
@@ -0,0 +1,7 @@
+"""Global variables relating to OS."""
+
+import numpy as np
+
+# Windows and Mac M1 don't support floats of this precision:
+# https://github.com/pandera-dev/pandera/issues/623
+FLOAT_128_AVAILABLE = hasattr(np, "float128")