Skip to content

Commit

Permalink
release 0.7.2 (#631)
Browse files Browse the repository at this point in the history
* Strategies should not rely on pandas dtype aliases (#620)

* add test for strategy with pandas.DatetimeTZDtype using a datetime.tzinfo

* avoid coercing with string alias in strategies

* support timedelta in data synthesis strats (#621)

* fix multiindex error reporting (#622)

* Pin pylint (#629)

* bump pre-commit pylint version

* pin pylint

* remove setuptools pins

* setup.py setuptools

* add back setuptools dep

* update ci build

* update build

* update nox build

* update nox build

* exclude np.float128 type registration in MacM1 (#624)

* exclude np.float128 type registration in MacM1

* replace windows/mac m1 checks with float128 check

* fix numpy_pandas_coercible bug dealing with single element (#626)

* fix numpy_pandas_coercible bug dealing with single element

* add test

* remove empty case

* update pylint (#630)

* unpin pylint, remove setuptools constraint

* bump cache

* install simpleeval in noxfile

* re-pin pylint

* fix lint

* nox uses setuptools < 58.0.0

Co-authored-by: Jean-Francois Zinque <[email protected]>
  • Loading branch information
cosmicBboy and Jean-Francois Zinque authored Sep 22, 2021
1 parent f0ddcbf commit f4dbaca
Show file tree
Hide file tree
Showing 21 changed files with 221 additions and 101 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ env:
DEFAULT_PYTHON: 3.8
CI: "true"
# Increase this value to reset cache if environment.yml has not changed
CACHE_VERSION: 4
CACHE_VERSION: 5

jobs:

Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ repos:
args: ["--line-length=79"]

- repo: https://github.com/pycqa/pylint
rev: v2.10.2
rev: v2.11.1
hooks:
- id: pylint
args: ["--disable=import-error"]
Expand Down
3 changes: 1 addition & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,12 @@ dependencies:
- isort >= 5.7.0
- codecov
- mypy >= 0.902 # mypy no longer bundle stubs for third-party libraries
- pylint >= 2.7.2
- pylint = v2.11.1
- pytest
- pytest-cov
- pytest-xdist
- pytest-asyncio
- xdoctest
- setuptools < 58.0.0
- nox = 2020.12.31 # pinning due to UnicodeDecodeError, see https://github.com/pandera-dev/pandera/pull/504/checks?check_run_id=2841360122
- importlib_metadata # required if python < 3.8

Expand Down
4 changes: 4 additions & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@ def install_extras(
if extra == "core":
specs.append(REQUIRES["all"]["hypothesis"])

# this is a temporary measure to install setuptools due to this issue:
# https://github.com/pandera-dev/pandera/pull/602#issuecomment-915622823
session.install("setuptools < 58.0.0")

# CI installs conda dependencies, so only run this for local runs
if (
isinstance(session.virtualenv, nox.virtualenv.CondaEnv)
Expand Down
17 changes: 9 additions & 8 deletions pandera/engines/numpy_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,17 @@
import dataclasses
import datetime
import inspect
import platform
import warnings
from typing import Any, Dict, List, Union

import numpy as np

from .. import dtypes, errors
from ..dtypes import immutable
from ..system import FLOAT_128_AVAILABLE
from . import engine, utils
from .type_aliases import PandasObject

WINDOWS_PLATFORM = platform.system() == "Windows"


@immutable(init=True)
class DataType(dtypes.DataType):
Expand Down Expand Up @@ -226,13 +224,16 @@ class UInt8(UInt16):
_float_equivalents = _build_number_equivalents(
builtin_name="float",
pandera_name="Float",
sizes=[64, 32, 16] if WINDOWS_PLATFORM else [128, 64, 32, 16],
sizes=[128, 64, 32, 16] if FLOAT_128_AVAILABLE else [64, 32, 16],
)


if not WINDOWS_PLATFORM:
# not supported in windows
if FLOAT_128_AVAILABLE:
# not supported in windows:
# https://github.com/winpython/winpython/issues/613
#
# or Mac M1:
# https://github.com/pandera-dev/pandera/issues/623
@Engine.register_dtype(equivalents=_float_equivalents[128])
@immutable
class Float128(DataType, dtypes.Float128):
Expand Down Expand Up @@ -276,11 +277,11 @@ class Float16(Float32):
_complex_equivalents = _build_number_equivalents(
builtin_name="complex",
pandera_name="Complex",
sizes=[128, 64] if WINDOWS_PLATFORM else [256, 128, 64],
sizes=[256, 128, 64] if FLOAT_128_AVAILABLE else [128, 64],
)


if not WINDOWS_PLATFORM:
if FLOAT_128_AVAILABLE:
# not supported in windows:
# https://github.com/winpython/winpython/issues/613
#
# or Mac M1:
# https://github.com/pandera-dev/pandera/issues/623
@Engine.register_dtype(equivalents=_complex_equivalents[256])
Expand Down
9 changes: 3 additions & 6 deletions pandera/engines/pandas_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import dataclasses
import datetime
import inspect
import platform
import warnings
from enum import Enum
from typing import Any, Dict, Iterable, List, Optional, Union
Expand All @@ -21,6 +20,7 @@

from .. import dtypes, errors
from ..dtypes import immutable
from ..system import FLOAT_128_AVAILABLE
from . import engine, numpy_engine, utils
from .type_aliases import PandasDataType, PandasExtensionType, PandasObject

Expand All @@ -39,9 +39,6 @@ def pandas_version():
from typing_extensions import Literal # type: ignore


WINDOWS_PLATFORM = platform.system() == "Windows"


def is_extension_dtype(pd_dtype: PandasDataType) -> bool:
"""Check if a value is a pandas extension type or instance of one."""
return isinstance(pd_dtype, PandasExtensionType) or (
Expand Down Expand Up @@ -343,7 +340,7 @@ class UINT8(UINT16):
_register_numpy_numbers(
builtin_name="float",
pandera_name="Float",
sizes=[64, 32, 16] if WINDOWS_PLATFORM else [128, 64, 32, 16],
sizes=[128, 64, 32, 16] if FLOAT_128_AVAILABLE else [64, 32, 16],
)

# ###############################################################################
Expand All @@ -353,7 +350,7 @@ class UINT8(UINT16):
_register_numpy_numbers(
builtin_name="complex",
pandera_name="Complex",
sizes=[128, 64] if WINDOWS_PLATFORM else [256, 128, 64],
sizes=[256, 128, 64] if FLOAT_128_AVAILABLE else [128, 64],
)

# ###############################################################################
Expand Down
2 changes: 1 addition & 1 deletion pandera/engines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def _coercible(series):
except (ValueError, TypeError):
return False

search_list = _bisect(series)
search_list = [series] if series.size == 1 else _bisect(series)
failure_index = []
while search_list:
candidates = []
Expand Down
11 changes: 8 additions & 3 deletions pandera/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,14 @@ def _parse_schema_errors(schema_errors: List[Dict[str, Any]]):
schema_context=err.schema.__class__.__name__,
check=check_identifier,
check_number=err.check_index,
# explicitly wrap `column` in a list of the column key is
# a tuple in the case of MultiIndex column names.
column=[column] if isinstance(column, tuple) else column,
# if the column key is a tuple (for MultiIndex column
# names), explicitly wrap `column` in a list of the
# same length as the number of failure cases.
column=(
[column] * err.failure_cases.shape[0]
if isinstance(column, tuple)
else column
),
)
check_failure_cases.append(failure_cases[column_order])

Expand Down
4 changes: 2 additions & 2 deletions pandera/hypotheses.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,8 @@ def _relationships(self, relationship: Union[str, Callable]):
relationship = self.RELATIONSHIPS[relationship]
elif not callable(relationship):
raise ValueError(
"expected relationship to be str or callable, found %s"
% type(relationship)
"expected relationship to be str or callable, found "
f"{type(relationship)}"
)
return relationship

Expand Down
6 changes: 3 additions & 3 deletions pandera/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ def from_yaml(yaml_schema):
:returns: dataframe schema.
"""
try:
with Path(yaml_schema).open("r") as f:
with Path(yaml_schema).open("r", encoding="utf-8") as f:
serialized_schema = yaml.safe_load(f)
except (TypeError, OSError):
serialized_schema = yaml.safe_load(yaml_schema)
Expand All @@ -290,7 +290,7 @@ def _write_yaml(obj, stream):
return yaml.safe_dump(obj, stream=stream, sort_keys=False)

try:
with Path(stream).open("w") as f:
with Path(stream).open("w", encoding="utf-8") as f:
_write_yaml(statistics, f)
except (TypeError, OSError):
return _write_yaml(statistics, stream)
Expand Down Expand Up @@ -437,7 +437,7 @@ def to_script(dataframe_schema, path_or_buf=None):
if path_or_buf is None:
return formatted_script

with Path(path_or_buf).open("w") as f:
with Path(path_or_buf).open("w", encoding="utf-8") as f:
f.write(formatted_script)


Expand Down
41 changes: 19 additions & 22 deletions pandera/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import numpy as np
import pandas as pd

from . import constants, errors
from . import errors
from . import strategies as st
from .checks import Check
from .deprecations import deprecate_pandas_dtype
Expand Down Expand Up @@ -278,9 +278,9 @@ def dtypes(self) -> Dict[str, DataType]:
]
if regex_columns:
warnings.warn(
"Schema has columns specified as regex column names: %s "
"Use the `get_dtypes` to get the datatypes for these "
"columns." % regex_columns,
"Schema has columns specified as regex column names: "
f"{regex_columns}. Use the `get_dtypes` to get the datatypes "
"for these columns.",
UserWarning,
)
return {n: c.dtype for n, c in self.columns.items() if not c.regex}
Expand Down Expand Up @@ -460,11 +460,10 @@ def validate(

if self._is_inferred:
warnings.warn(
"This %s is an inferred schema that hasn't been "
f"This {type(self)} is an inferred schema that hasn't been "
"modified. It's recommended that you refine the schema "
"by calling `add_columns`, `remove_columns`, or "
"`update_columns` before using it to validate data."
% type(self),
"`update_columns` before using it to validate data.",
UserWarning,
)

Expand Down Expand Up @@ -1744,10 +1743,9 @@ def validate(

if self._is_inferred:
warnings.warn(
"This %s is an inferred schema that hasn't been "
f"This {type(self)} is an inferred schema that hasn't been "
"modified. It's recommended that you refine the schema "
"by calling `set_checks` before using it to validate data."
% type(self),
"by calling `set_checks` before using it to validate data.",
UserWarning,
)

Expand All @@ -1771,10 +1769,9 @@ def validate(
)

if self.name is not None and series.name != self._name:
msg = "Expected %s to have name '%s', found '%s'" % (
type(self),
self._name,
series.name,
msg = (
f"Expected {type(self)} to have name '{self._name}', found "
f"'{series.name}'"
)
error_handler.collect_error(
"wrong_field_name",
Expand All @@ -1790,9 +1787,10 @@ def validate(
if not self._nullable:
nulls = series.isna()
if sum(nulls) > 0:
msg = "non-nullable series '%s' contains null values: %s" % (
series.name,
series[nulls].head(constants.N_FAILURE_CASES).to_dict(),
failed = series[nulls]
msg = (
f"non-nullable series '{series.name}' contains null "
f"values:\n{failed}"
)
error_handler.collect_error(
"series_contains_nulls",
Expand All @@ -1811,11 +1809,10 @@ def validate(
if self._unique:
duplicates = series.duplicated()
if any(duplicates):
msg = "series '%s' contains duplicate values: %s" % (
series.name,
series[duplicates]
.head(constants.N_FAILURE_CASES)
.to_dict(),
failed = series[duplicates]
msg = (
f"series '{series.name}' contains duplicate values:\n"
f"{series[duplicates]}"
)
error_handler.collect_error(
"series_contains_duplicates",
Expand Down
31 changes: 14 additions & 17 deletions pandera/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,10 +245,6 @@ def _to_datetime(value) -> pd.DatetimeTZDtype:
return st.builds(dtype.type, strategy, res)


def _to_unix_timestamp(value: Any) -> int:
return pd.Timestamp(value).value


def numpy_time_dtypes(
dtype: Union[np.dtype, pd.DatetimeTZDtype], min_value=None, max_value=None
):
Expand All @@ -259,12 +255,14 @@ def numpy_time_dtypes(
:param max_value: maximum value of the datatype to create
:returns: ``hypothesis`` strategy
"""
min_value = (
MIN_DT_VALUE if min_value is None else _to_unix_timestamp(min_value)
)
max_value = (
MAX_DT_VALUE if max_value is None else _to_unix_timestamp(max_value)
)

def _to_unix(value: Any) -> int:
if dtype.type is np.timedelta64:
return pd.Timedelta(value).value
return pd.Timestamp(value).value

min_value = MIN_DT_VALUE if min_value is None else _to_unix(min_value)
max_value = MAX_DT_VALUE if max_value is None else _to_unix(max_value)
return _datetime_strategy(dtype, st.integers(min_value, max_value))


Expand Down Expand Up @@ -831,7 +829,7 @@ def series_strategy(
)
.filter(lambda x: x.shape[0] > 0)
.map(lambda x: x.rename(name))
.map(lambda x: x.astype(str(pandera_dtype)))
.map(lambda x: x.astype(pandera_dtype.type))
)
if nullable:
strategy = null_field_masks(strategy)
Expand Down Expand Up @@ -918,7 +916,7 @@ def index_strategy(
min_size=0 if size is None else size,
max_size=size,
unique=unique,
).map(lambda x: x.astype(str(pandera_dtype)))
).map(lambda x: x.astype(pandera_dtype.type))
if name is not None:
strategy = strategy.map(lambda index: index.rename(name))
if nullable:
Expand Down Expand Up @@ -1068,12 +1066,11 @@ def _dataframe_strategy(draw):
# override the column datatype with dataframe-level datatype if
# specified
col_dtypes = {
col_name: str(col.dtype)
col_name: col.dtype.type
if pandera_dtype is None
else str(pandera_dtype)
else pandera_dtype.type
for col_name, col in expanded_columns.items()
}

nullable_columns = {
col_name: col.nullable
for col_name, col in expanded_columns.items()
Expand Down Expand Up @@ -1132,7 +1129,7 @@ def multiindex_strategy(
:param pandera_dtype: :class:`pandera.dtypes.DataType` instance.
:param strategy: an optional hypothesis strategy. If specified, the
pandas dtype strategy will be chained onto this strategy.
:param indexes: a list of :class:`~pandera.schema_components.Inded`
:param indexes: a list of :class:`~pandera.schema_components.Index`
objects.
:param size: number of elements in the Series.
:returns: ``hypothesis`` strategy.
Expand All @@ -1145,7 +1142,7 @@ def multiindex_strategy(
)
indexes = [] if indexes is None else indexes
index_dtypes = {
index.name if index.name is not None else i: str(index.dtype)
index.name if index.name is not None else i: index.dtype.type
for i, index in enumerate(indexes)
}
nullable_index = {
Expand Down
7 changes: 7 additions & 0 deletions pandera/system.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Global variables relating to OS."""

import numpy as np

# True when the installed numpy exposes ``np.float128``. The attribute is
# probed directly rather than inferring support from the platform name.
# Windows and Mac M1 don't support floats of this precision:
# https://github.com/pandera-dev/pandera/issues/623
FLOAT_128_AVAILABLE = hasattr(np, "float128")
Loading

0 comments on commit f4dbaca

Please sign in to comment.