Skip to content

Commit

Permalink
Enhancement: Add support for timezone-flexible DateTime (#1352) (#1902)
Browse files Browse the repository at this point in the history
Enhancement: Add support for timezone-flexible DateTime (#1352)

Signed-off-by: Max Raphael <[email protected]>
  • Loading branch information
max-raphael authored Feb 12, 2025
1 parent 754e66d commit 32b08fd
Show file tree
Hide file tree
Showing 2 changed files with 280 additions and 9 deletions.
110 changes: 110 additions & 0 deletions pandera/engines/pandas_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,19 @@ class DateTime(_BaseDateTime, dtypes.Timestamp):
tz: Optional[datetime.tzinfo] = None
"""The timezone."""

time_zone_agnostic: bool = False
"""
A flag indicating whether the datetime data should be handled flexibly with respect to timezones.
- If set to `True` and `coerce` is `False`, the function will accept datetimes with any timezone(s)
but not timezone-naive datetimes. If passed, the `tz` argument will be ignored, as this use
case is handled by setting `time_zone_agnostic=False`.
- If set to `True` and `coerce` is `True`, a `tz` must also be specified. The function will then
accept datetimes with any timezone(s) and convert them to the specified tz, as well as
timezone-naive datetimes, and localize them to the specified tz.
"""

to_datetime_kwargs: Dict[str, Any] = dataclasses.field(
default_factory=dict, compare=False, repr=False
)
Expand Down Expand Up @@ -936,14 +949,111 @@ def from_parametrized_dtype(cls, pd_dtype: pd.DatetimeTZDtype):
return cls(unit=pd_dtype.unit, tz=pd_dtype.tz) # type: ignore

def coerce(self, data_container: PandasObject) -> PandasObject:
    """Coerce ``data_container`` to this dtype's pandas type.

    When ``time_zone_agnostic`` is set, the data is first passed through
    ``_prepare_coerce_time_zone_agnostic``; that step may rebind
    ``self.type``, so the target dtype is only read afterwards.

    :param data_container: pandas object to coerce.
    :returns: the result of ``self._coerce`` for the (possibly prepared) data.
    """
    prepared = data_container
    if self.time_zone_agnostic:
        prepared = self._prepare_coerce_time_zone_agnostic(
            data_container=data_container
        )
    # Read ``self.type`` only now: the preparation step above may have
    # replaced it.
    target_dtype = self.type
    return self._coerce(prepared, pandas_dtype=target_dtype)

def _prepare_coerce_time_zone_agnostic(
    self, data_container: PandasObject
) -> PandasObject:
    """Normalize ``data_container`` to ``self.tz`` ahead of dtype coercion.

    Timezone-aware values are converted to ``self.tz`` and timezone-naive
    values are localized to it. ``self.type`` is rebound to the matching
    ``pd.DatetimeTZDtype`` so that the caller coerces to a timezone-aware
    dtype.

    :param data_container: pandas object holding the datetimes to coerce.
    :returns: ``data_container`` itself when it already has a
        ``DatetimeTZDtype``; otherwise a rebuilt container of the same type
        whose values are all expressed in ``self.tz``. A Series keeps its
        index and name, an Index keeps its name.
    :raises errors.ParserError: if ``self.tz`` is not set, or if the data is
        neither ``DatetimeTZDtype`` nor made up solely of datetime objects.
    """
    if not self.tz:
        raise errors.ParserError(
            "Cannot coerce datetimes when 'time_zone_agnostic=True' and 'tz' is not specified. "
            "When using 'time_zone_agnostic' and 'coerce', you must specify a timezone using 'tz' parameter.",
            failure_cases=utils.numpy_pandas_coerce_failure_cases(
                data_container, self
            ),
        )

    # NOTE(review): ``.dtype`` exists on Series/Index but not on DataFrame;
    # presumably only Series/Index reach this method -- confirm.
    single_timezone = isinstance(data_container.dtype, pd.DatetimeTZDtype)
    if not single_timezone and not all(
        isinstance(x, datetime.datetime) for x in data_container
    ):
        raise errors.ParserError(
            "When time_zone_agnostic=True, data must either be:\n"
            "1. A Series with DatetimeTZDtype (timezone-aware datetime series), or\n"
            "2. A Series of datetime objects\n"
            f"Got data with dtype: {data_container.dtype}",
            failure_cases=utils.numpy_pandas_coerce_failure_cases(
                data_container, self
            ),
        )

    tz = self.tz
    # Object and numpy datetime64 dtypes expose no ``unit`` attribute, so
    # fall back to "ns" (the DatetimeTZDtype default) rather than raising
    # AttributeError when ``self.unit`` is unset.
    unit = self.unit or getattr(data_container.dtype, "unit", None) or "ns"

    if not single_timezone:
        # Mixed timezones and/or naive values: bring every element to ``tz``.
        def _to_target_tz(value):
            ts = pd.Timestamp(value)
            if ts.tzinfo is not None:
                return ts.tz_convert(tz)
            return ts.tz_localize(tz)

        converted = [_to_target_tz(value) for value in data_container]
        container_type = type(data_container)
        if isinstance(data_container, pd.Series):
            # Keep the original labels; rebuilding from a bare list would
            # silently replace them with a fresh RangeIndex.
            data_container = container_type(
                converted,
                index=data_container.index,
                name=data_container.name,
            )
        elif isinstance(data_container, pd.Index):
            data_container = container_type(
                converted, name=data_container.name
            )
        else:
            data_container = container_type(converted)

    # ``object.__setattr__`` assigns on the instance directly (presumably
    # because the dtype dataclass is frozen -- confirm).
    object.__setattr__(self, "type", pd.DatetimeTZDtype(unit, tz))
    return data_container

def coerce_value(self, value: Any) -> Any:
    """Coerce a value to the specified datetime type."""
    # Pick the conversion function for this value, then apply it with the
    # user-supplied ``to_datetime_kwargs``.
    to_datetime_fn = self._get_to_datetime_fn(value)
    return to_datetime_fn(value, **self.to_datetime_kwargs)

def check(
    self,
    pandera_dtype: dtypes.DataType,
    data_container: Optional[PandasObject] = None,
) -> Union[bool, Iterable[bool]]:
    """Check ``pandera_dtype`` (and optionally the data) against this dtype.

    When ``time_zone_agnostic`` is set, ``_prepare_check_time_zone_agnostic``
    runs first and may rebind this instance's ``tz``/``type``; the actual
    comparison is then delegated to the parent class.

    :param pandera_dtype: dtype to compare against.
    :param data_container: optional pandas object being validated.
    :returns: whatever the parent ``check`` returns.
    """
    if self.time_zone_agnostic:
        self._prepare_check_time_zone_agnostic(
            pandera_dtype,
            data_container,
        )
    outcome = super().check(pandera_dtype, data_container)
    return outcome

def _prepare_check_time_zone_agnostic(
    self,
    pandera_dtype: dtypes.DataType,
    data_container: Optional[PandasObject],
) -> None:
    """Adapt this dtype's ``tz``/``type`` to the data before it is checked.

    * If ``pandera_dtype`` is a timezone-aware ``DateTime``, its tz is
      adopted and ``self.type`` becomes the matching ``pd.DatetimeTZDtype``.
    * Otherwise, if every element of ``data_container`` is a timezone-aware
      datetime, ``self.type`` becomes the numpy object dtype.

    :param pandera_dtype: dtype of the data being checked.
    :param data_container: the data being checked; may be ``None``.
    :raises errors.ParserError: if neither case applies, including when
        ``data_container`` is ``None`` and ``pandera_dtype`` carries no tz.
    """
    # If there is a single timezone, define the type as a timezone-aware DatetimeTZDtype
    if (
        isinstance(pandera_dtype, DateTime)
        and pandera_dtype.tz is not None
    ):
        type_ = pd.DatetimeTZDtype(self.unit, pandera_dtype.tz)
        object.__setattr__(self, "tz", pandera_dtype.tz)
        object.__setattr__(self, "type", type_)
        return

    # If the data has a mix of timezones, pandas defines the dtype as 'object'.
    # Guard against ``None`` first: iterating it would raise TypeError and
    # bypass the ParserError below, which already caters for missing data.
    if data_container is not None and all(
        isinstance(x, datetime.datetime) and x.tzinfo is not None
        for x in data_container
    ):
        object.__setattr__(self, "type", np.dtype("O"))
        return

    raise errors.ParserError(
        "When time_zone_agnostic=True, data must either be:\n"
        "1. A Series with DatetimeTZDtype (timezone-aware datetime series), or\n"
        "2. A Series of timezone-aware datetime objects\n"
        f"Got data with dtype: {data_container.dtype if data_container is not None else 'None'}",
        failure_cases=(
            utils.numpy_pandas_coerce_failure_cases(data_container, self)
            if data_container is not None
            else None
        ),
    )

def __str__(self) -> str:
if self.type == np.dtype("datetime64[ns]"):
return "datetime64[ns]"
Expand Down
179 changes: 170 additions & 9 deletions tests/core/test_pandas_engine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Test pandas engine."""

from datetime import date
from typing import Any, Set
import datetime as dt
from typing import Tuple, List, Optional, Any, Set

import hypothesis
import hypothesis.extra.pandas as pd_st
Expand All @@ -13,8 +13,9 @@
import pytz
from hypothesis import given

from pandera import Field, DataFrameModel, errors
from pandera.engines import pandas_engine
from pandera.errors import ParserError
from pandera.errors import ParserError, SchemaError

UNSUPPORTED_DTYPE_CLS: Set[Any] = set()

Expand Down Expand Up @@ -202,6 +203,165 @@ def test_pandas_datetimetz_dtype(timezone_aware, data, timezone):
assert coerced_data.dt.tz == timezone


def generate_test_cases_time_zone_agnostic() -> List[
    Tuple[
        List[dt.datetime],
        Optional[dt.tzinfo],
        bool,
        List[dt.datetime],
        bool,
    ]
]:
    """
    Build the parameter sets for the time_zone_agnostic test.

    Each sample list of datetimes is combined with both coerce flags and
    with every candidate ``tz`` (none, a named zone, a fixed offset). Two
    extra cases containing a non-datetime value are appended at the end.

    Returns:
        List of tuples:
        - List of input datetimes
        - tz for DateTime constructor
        - coerce flag for Field constructor
        - expected output datetimes (None when an exception is expected)
        - raises flag (True if an exception is expected, False otherwise)
    """

    def in_new_york(hour):
        return pytz.timezone("America/New_York").localize(
            dt.datetime(2023, 3, 1, hour)
        )

    def in_los_angeles(hour):
        return pytz.timezone("America/Los_Angeles").localize(
            dt.datetime(2023, 3, 1, hour)
        )

    def naive(hour):
        return dt.datetime(2023, 3, 1, hour)

    def expected_for(values, target_tz, coerce_flag):
        if not coerce_flag:
            # Without coercion the tz argument is ignored and the data is
            # returned untouched.
            return values
        # Naive datetimes get localized; aware ones get converted. A valid
        # coerce case always has a tz, so no None handling is needed here.
        localized = [
            target_tz.localize(value)
            for value in values
            if value.tzinfo is None
        ]
        converted = [
            value.astimezone(target_tz)
            for value in values
            if value.tzinfo is not None
        ]
        return localized + converted

    samples = [
        # multi tz and tz naive
        [in_new_york(4), in_los_angeles(5), naive(5)],
        # multi tz
        [in_new_york(4), in_los_angeles(5)],
        # tz naive
        [naive(4), naive(5)],
        # single tz
        [in_new_york(4), in_new_york(5)],
    ]
    candidate_timezones = (
        None,
        pytz.timezone("America/Chicago"),
        pytz.FixedOffset(120),  # 120 minutes = 2 hours offset
    )

    cases = []
    for sample in samples:
        contains_naive = any(value.tzinfo is None for value in sample)
        for coerce in (True, False):
            for tz in candidate_timezones:
                # An error is expected when:
                # * coerce is True but tz is not set
                # * coerce is False but there is a timezone-naive datetime
                if coerce:
                    raises = tz is None
                else:
                    raises = contains_naive

                if raises:
                    expected = None  # nothing to compare when an exception is raised
                else:
                    expected = expected_for(sample, tz, coerce)

                cases.append((sample, tz, coerce, expected, raises))

    # final test cases with an improper (non-datetime) value
    bad_values = [in_new_york(4), "hello world"]
    cases.extend(  # type: ignore
        [
            (bad_values, None, True, None, True),
            (bad_values, None, False, None, True),
        ]
    )

    return cases  # type: ignore


@pytest.mark.parametrize(
    "examples, tz, coerce, expected_output, raises",
    generate_test_cases_time_zone_agnostic(),
)
def test_dt_time_zone_agnostic(examples, tz, coerce, expected_output, raises):
    """Validate time_zone_agnostic DateTime columns end to end."""

    # The schema is exercised through a DataFrameModel instead of calling the
    # dtype's coerce/check directly, because with time_zone_agnostic the
    # dtype is set dynamically based on the input data.
    class SimpleSchema(DataFrameModel):
        # pylint: disable=unexpected-keyword-arg,no-value-for-parameter
        datetime_column: pandas_engine.DateTime(
            time_zone_agnostic=True, tz=tz
        ) = Field(coerce=coerce)

    frame = pd.DataFrame({"datetime_column": examples})

    if not raises:
        actual = SimpleSchema.validate(frame)["datetime_column"].tolist()
        # Order is irrelevant: expected values list naive entries first.
        assert sorted(actual) == sorted(expected_output)
        return

    with pytest.raises((SchemaError, errors.ParserError)):
        SimpleSchema.validate(frame)


@hypothesis.settings(max_examples=1000)
@pytest.mark.parametrize("to_df", [True, False])
@given(
Expand All @@ -225,7 +385,7 @@ def test_pandas_date_coerce_dtype(to_df, data):
)

assert (
coerced_data.applymap(lambda x: isinstance(x, date))
coerced_data.applymap(lambda x: isinstance(x, dt.date))
| coerced_data.isna()
).all(axis=None)
return
Expand All @@ -234,7 +394,8 @@ def test_pandas_date_coerce_dtype(to_df, data):
coerced_data.isna().all() and coerced_data.dtype == "datetime64[ns]"
)
assert (
coerced_data.map(lambda x: isinstance(x, date)) | coerced_data.isna()
coerced_data.map(lambda x: isinstance(x, dt.date))
| coerced_data.isna()
).all()


Expand All @@ -246,8 +407,8 @@ def test_pandas_date_coerce_dtype(to_df, data):
pyarrow.struct([("foo", pyarrow.int64()), ("bar", pyarrow.string())]),
),
(pd.Series([None, pd.NA, np.nan]), pyarrow.null),
(pd.Series([None, date(1970, 1, 1)]), pyarrow.date32),
(pd.Series([None, date(1970, 1, 1)]), pyarrow.date64),
(pd.Series([None, dt.date(1970, 1, 1)]), pyarrow.date32),
(pd.Series([None, dt.date(1970, 1, 1)]), pyarrow.date64),
(pd.Series([1, 2]), pyarrow.duration("ns")),
(pd.Series([1, 1e3, 1e6, 1e9, None]), pyarrow.time32("ms")),
(pd.Series([1, 1e3, 1e6, 1e9, None]), pyarrow.time64("ns")),
Expand Down Expand Up @@ -292,8 +453,8 @@ def test_pandas_arrow_dtype(data, dtype):
pyarrow.struct([("foo", pyarrow.string()), ("bar", pyarrow.int64())]),
),
(pd.Series(["a", "1"]), pyarrow.null),
(pd.Series(["a", date(1970, 1, 1), "1970-01-01"]), pyarrow.date32),
(pd.Series(["a", date(1970, 1, 1), "1970-01-01"]), pyarrow.date64),
(pd.Series(["a", dt.date(1970, 1, 1), "1970-01-01"]), pyarrow.date32),
(pd.Series(["a", dt.date(1970, 1, 1), "1970-01-01"]), pyarrow.date64),
(pd.Series(["a"]), pyarrow.duration("ns")),
(pd.Series(["a", "b"]), pyarrow.time32("ms")),
(pd.Series(["a", "b"]), pyarrow.time64("ns")),
Expand Down

0 comments on commit 32b08fd

Please sign in to comment.