POC: consistent NaN treatment for pyarrow dtypes #61732

Draft · wants to merge 14 commits into base: main

5 changes: 5 additions & 0 deletions pandas/_config/__init__.py
@@ -33,3 +33,8 @@
def using_string_dtype() -> bool:
_mode_options = _global_config["future"]
return _mode_options["infer_string"]


def using_pyarrow_strict_nans() -> bool:
_mode_options = _global_config["mode"]
return _mode_options["pyarrow_strict_nans"]
1 change: 1 addition & 0 deletions pandas/_libs/missing.pyi
@@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ...
def checknull(val: object) -> bool: ...
def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ...
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
18 changes: 18 additions & 0 deletions pandas/_libs/missing.pyx
@@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj):
return checknull_with_nat(obj) or obj is C_NA


@cython.wraparound(False)
@cython.boundscheck(False)
def is_pdna_or_none(values: ndarray) -> ndarray:
cdef:
ndarray[uint8_t] result
Py_ssize_t i, N
object val

N = len(values)
result = np.zeros(N, dtype=np.uint8)

for i in range(N):
val = values[i]
if val is None or val is C_NA:
result[i] = True
return result.view(bool)


@cython.wraparound(False)
@cython.boundscheck(False)
def is_numeric_na(values: ndarray) -> ndarray:
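Note: unlike checknull/isna, the new helper flags only None and pd.NA as missing, deliberately letting np.nan through so a real NaN can survive as a float value under strict-NaN handling. A rough pure-Python sketch of the compiled loop above (the helper name is ours, for illustration only):

# illustration only, not part of the diff
import numpy as np
import pandas as pd

def is_pdna_or_none_sketch(values: np.ndarray) -> np.ndarray:
    # True only for None and pd.NA; np.nan is deliberately not flagged
    return np.array([v is None or v is pd.NA for v in values], dtype=bool)

arr = np.array([1.5, None, pd.NA, float("nan")], dtype=object)
is_pdna_or_none_sketch(arr)  # [False,  True,  True, False]
pd.isna(arr)                 # [False,  True,  True,  True] -- NaN included
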
2 changes: 1 addition & 1 deletion pandas/_libs/parsers.pyx
@@ -1453,7 +1453,7 @@ def _maybe_upcast(
if isinstance(arr, IntegerArray) and arr.isna().all():
# use null instead of int64 in pyarrow
arr = arr.to_numpy(na_value=None)
arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
arr = ArrowExtensionArray(pa.array(arr))

return arr

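Illustrative effect of dropping from_pandas=True here: the all-NA IntegerArray is first converted to an object ndarray of None, which pyarrow already infers as nulls on its own, so from_pandas is redundant.

# illustration only, not part of the diff
import pandas as pd
import pyarrow as pa

int_arr = pd.array([None, None], dtype="Int64")
np_arr = int_arr.to_numpy(na_value=None)  # object array [None, None]
pa.array(np_arr)                          # 2 nulls, inferred null type
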
15 changes: 13 additions & 2 deletions pandas/core/arrays/_utils.py
@@ -7,7 +7,10 @@

import numpy as np

from pandas._config import using_pyarrow_strict_nans

from pandas._libs import lib
from pandas._libs.missing import NA
from pandas.errors import LossySetitemError

from pandas.core.dtypes.cast import np_can_hold_element
@@ -21,7 +24,11 @@


def to_numpy_dtype_inference(
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
arr: ArrayLike,
dtype: npt.DTypeLike | None,
na_value,
hasna: bool,
is_pyarrow: bool = True,
) -> tuple[npt.DTypeLike, Any]:
if dtype is None and is_numeric_dtype(arr.dtype):
dtype_given = False
@@ -34,7 +41,11 @@ def to_numpy_dtype_inference(
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
if na_value is lib.no_default:
na_value = np.nan
if is_pyarrow and using_pyarrow_strict_nans():
na_value = NA
dtype = np.dtype(object)
else:
na_value = np.nan
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
elif dtype is not None:
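A sketch of the behavior this branch is aiming for (assuming the option is enabled): a pyarrow-backed float array can no longer default to np.nan as the NA fill, since under strict-NaN semantics NaN is a real value, so to_numpy falls back to object dtype filled with pd.NA.

# illustration of intended behavior on this branch
import pandas as pd

ser = pd.Series([1.0, None], dtype="float64[pyarrow]")
with pd.option_context("mode.pyarrow_strict_nans", True):
    ser.to_numpy()  # array([1.0, <NA>], dtype=object), not array([1., nan])
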
63 changes: 49 additions & 14 deletions pandas/core/arrays/arrow/array.py
@@ -15,7 +15,10 @@

import numpy as np

from pandas._config import using_pyarrow_strict_nans

from pandas._libs import lib
from pandas._libs.missing import is_pdna_or_none
from pandas._libs.tslibs import (
Timedelta,
Timestamp,
@@ -323,6 +326,11 @@ def _from_sequence_of_strings(
"""
Construct a new ExtensionArray from a sequence of strings.
"""
mask = isna(strings)

if isinstance(strings, cls):
strings = strings._pa_array

pa_type = to_pyarrow_type(dtype)
if (
pa_type is None
@@ -341,17 +349,21 @@
from pandas.core.tools.datetimes import to_datetime

scalars = to_datetime(strings, errors="raise").date

scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type)

elif pa.types.is_duration(pa_type):
from pandas.core.tools.timedeltas import to_timedelta

scalars = to_timedelta(strings, errors="raise")

if pa_type.unit != "ns":
# GH51175: test_from_sequence_of_strings_pa_array
# attempt to parse as int64 reflecting pyarrow's
# duration to string casting behavior
mask = isna(scalars)
if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
strings = pa.array(strings, type=pa.string(), from_pandas=True)
strings = pa.array(strings, type=pa.string(), mask=mask)
strings = pc.if_else(mask, None, strings)
try:
scalars = strings.cast(pa.int64())
@@ -372,7 +384,7 @@ def _from_sequence_of_strings(
if isinstance(strings, (pa.Array, pa.ChunkedArray)):
scalars = strings
else:
scalars = pa.array(strings, type=pa.string(), from_pandas=True)
scalars = pa.array(strings, type=pa.string(), mask=mask)
scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
scalars = scalars.cast(pa.bool_())
Expand All @@ -384,6 +396,11 @@ def _from_sequence_of_strings(
from pandas.core.tools.numeric import to_numeric

scalars = to_numeric(strings, errors="raise")
if isinstance(strings, (pa.Array, pa.ChunkedArray)):
scalars = strings.cast(pa_type)
elif mask is not None:
scalars = pa.array(scalars, mask=mask, type=pa_type)

else:
raise NotImplementedError(
f"Converting strings to {pa_type} is not implemented."
@@ -426,7 +443,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
"""
if isinstance(value, pa.Scalar):
pa_scalar = value
elif isna(value):
elif isna(value) and not lib.is_float(value):
pa_scalar = pa.scalar(None, type=pa_type)
else:
# Workaround https://github.com/apache/arrow/issues/37291
@@ -443,7 +460,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
value = value.as_unit(pa_type.unit)
value = value._value

pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)
pa_scalar = pa.scalar(value, type=pa_type)

if pa_type is not None and pa_scalar.type != pa_type:
pa_scalar = pa_scalar.cast(pa_type)
@@ -475,6 +492,13 @@ def _box_pa_array(
if copy:
value = value.copy()
pa_array = value.__arrow_array__()

elif hasattr(value, "__arrow_array__"):
# e.g. StringArray
if copy:
value = value.copy()
pa_array = value.__arrow_array__()

else:
if (
isinstance(value, np.ndarray)
@@ -528,19 +552,25 @@ def _box_pa_array(
pa_array = pa.array(dta._ndarray, type=pa_type, mask=dta_mask)
return pa_array

mask = None
if getattr(value, "dtype", None) is None or value.dtype.kind not in "iumMf":
arr_value = np.asarray(value, dtype=object)
# similar to isna(value) but exclude NaN, NaT, nat-like, nan-like
mask = is_pdna_or_none(arr_value)

try:
pa_array = pa.array(value, type=pa_type, from_pandas=True)
pa_array = pa.array(value, type=pa_type, mask=mask)
except (pa.ArrowInvalid, pa.ArrowTypeError):
# GH50430: let pyarrow infer type, then cast
pa_array = pa.array(value, from_pandas=True)
pa_array = pa.array(value, mask=mask)

if pa_type is None and pa.types.is_duration(pa_array.type):
# Workaround https://github.com/apache/arrow/issues/37291
from pandas.core.tools.timedeltas import to_timedelta

value = to_timedelta(value)
value = value.to_numpy()
pa_array = pa.array(value, type=pa_type, from_pandas=True)
pa_array = pa.array(value, type=pa_type)

if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0:
# GH52843: upstream bug for duration types when originally
@@ -1187,7 +1217,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
if not len(values):
return np.zeros(len(self), dtype=bool)

result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True))
result = pc.is_in(self._pa_array, value_set=pa.array(values))
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)
@@ -1468,7 +1498,11 @@ def to_numpy(
pa.types.is_floating(pa_type)
and (
na_value is np.nan
or (original_na_value is lib.no_default and is_float_dtype(dtype))
or (
original_na_value is lib.no_default
and is_float_dtype(dtype)
and not using_pyarrow_strict_nans()
)
)
):
result = data._pa_array.to_numpy()
@@ -1994,7 +2028,7 @@ def __setitem__(self, key, value) -> None:
raise ValueError("Length of indexer and values mismatch")
chunks = [
*self._pa_array[:key].chunks,
pa.array([value], type=self._pa_array.type, from_pandas=True),
pa.array([value], type=self._pa_array.type),
*self._pa_array[key + 1 :].chunks,
]
data = pa.chunked_array(chunks).combine_chunks()
@@ -2048,7 +2082,7 @@ def _rank_calc(
pa_type = pa.float64()
else:
pa_type = pa.uint64()
result = pa.array(ranked, type=pa_type, from_pandas=True)
result = pa.array(ranked, type=pa_type)
return result

data = self._pa_array.combine_chunks()
@@ -2300,7 +2334,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
right, right_type = _to_numpy_and_type(right)
pa_type = left_type or right_type
result = np.where(cond, left, right)
return pa.array(result, type=pa_type, from_pandas=True)
return pa.array(result, type=pa_type)

@classmethod
def _replace_with_mask(
@@ -2341,9 +2375,10 @@ def _replace_with_mask(
replacements = np.array(replacements, dtype=object)
elif isinstance(replacements, pa.Scalar):
replacements = replacements.as_py()

result = np.array(values, dtype=object)
result[mask] = replacements
return pa.array(result, type=values.type, from_pandas=True)
return pa.array(result, type=values.type)

# ------------------------------------------------------------------
# GroupBy Methods
@@ -2422,7 +2457,7 @@ def _groupby_op(
return type(self)(pa_result)
else:
# DatetimeArray, TimedeltaArray
pa_result = pa.array(result, from_pandas=True)
pa_result = pa.array(result)
return type(self)(pa_result)

def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
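The recurring from_pandas=True removals in this file follow a single pattern: from_pandas folds NaN together with None/NA into nulls, whereas an explicit mask built by is_pdna_or_none marks only None/pd.NA, letting NaN through as a real float. A minimal illustration:

# illustration only, not part of the diff
import numpy as np
import pyarrow as pa

vals = np.array([1.0, np.nan, None], dtype=object)

pa.array(vals, from_pandas=True)
# old path: [1, null, null] -- NaN and None both become null

mask = np.array([False, False, True])
pa.array(vals, type=pa.float64(), mask=mask)
# new path: [1, nan, null] -- only None is null; NaN survives
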
8 changes: 8 additions & 0 deletions pandas/core/arrays/base.py
@@ -2539,6 +2539,14 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
if result is not NotImplemented:
return result

# TODO: putting this here is hacky as heck
if self.dtype == "float64[pyarrow]":
# e.g. test_log_arrow_backed_missing_value
new_inputs = [
x if x is not self else x.to_numpy(na_value=np.nan) for x in inputs
]
return getattr(ufunc, method)(*new_inputs, **kwargs)

return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs)

def map(self, mapper, na_action: Literal["ignore"] | None = None):
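A rough illustration of what this workaround enables (per the referenced test, test_log_arrow_backed_missing_value): numpy ufuncs on a float64[pyarrow] array detour through a NaN-filled ndarray rather than hitting the strict-NaN conversion path.

# illustration of intended behavior on this branch
import numpy as np
import pandas as pd

ser = pd.Series([1.0, np.e, None], dtype="float64[pyarrow]")
np.log(ser)  # the pyarrow input is converted via to_numpy(na_value=np.nan)
             # before the ufunc runs, so the missing slot propagates as NaN
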
4 changes: 3 additions & 1 deletion pandas/core/arrays/masked.py
@@ -484,7 +484,9 @@ def to_numpy(
array([ True, False, False])
"""
hasna = self._hasna
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna)
dtype, na_value = to_numpy_dtype_inference(
self, dtype, na_value, hasna, is_pyarrow=False
)
if dtype is None:
dtype = object

8 changes: 7 additions & 1 deletion pandas/core/arrays/string_.py
@@ -481,6 +481,12 @@ def _str_map_str_or_object(
if self.dtype.storage == "pyarrow":
import pyarrow as pa

# TODO: shouldn't this already be caught by the passed mask?
# it isn't in test_extract_expand_capture_groups_index
# mask = mask | np.array(
# [x is libmissing.NA for x in result], dtype=bool
# )

result = pa.array(
result, mask=mask, type=pa.large_string(), from_pandas=True
)
@@ -733,7 +739,7 @@ def __arrow_array__(self, type=None):

values = self._ndarray.copy()
values[self.isna()] = None
return pa.array(values, type=type, from_pandas=True)
return pa.array(values, type=type)

def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override]
arr = self._ndarray
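Same pattern as in array.py: NA positions are nulled out explicitly (values[self.isna()] = None) before handing the ndarray to pyarrow, so from_pandas=True is no longer needed and a stray real NaN would not be silently converted to null. For example, going through the __arrow_array__ protocol:

# illustration only, not part of the diff
import pandas as pd
import pyarrow as pa

s = pd.array(["a", None], dtype="string[python]")
pa.array(s)  # invokes __arrow_array__ above -> ["a", null]
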
9 changes: 9 additions & 0 deletions pandas/core/config_init.py
@@ -427,6 +427,15 @@ def is_terminal() -> bool:
validator=is_one_of_factory([True, False, "warn"]),
)

with cf.config_prefix("mode"):
cf.register_option(
"pyarrow_strict_nans",
True,
# TODO: Change this to False before merging
"Whether to make ArrowDtype arrays consistently treat NaN as distinct from NA",
validator=is_one_of_factory([True, False]),
)


# user warnings
chained_assignment = """
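Usage of the new option (note the POC default of True registered above is slated to flip to False before merging):

# illustration only, not part of the diff
import pandas as pd

pd.get_option("mode.pyarrow_strict_nans")  # True on this branch

with pd.option_context("mode.pyarrow_strict_nans", False):
    ...  # opt back into the legacy behavior that conflates NaN and NA
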
2 changes: 1 addition & 1 deletion pandas/core/generic.py
@@ -9873,7 +9873,7 @@ def where(
def where(
self,
cond,
other=np.nan,
other=lib.no_default,
*,
inplace: bool = False,
axis: Axis | None = None,
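Presumably the motivation for this default change: with other=np.nan, slots masked out of a pyarrow-backed object would be filled with a literal NaN, which strict-NaN mode treats as a real float rather than as missing; lib.no_default lets where() choose the dtype-appropriate missing value instead. A hedged sketch of the intended result:

# illustration of intended behavior on this branch
import pandas as pd

ser = pd.Series([1.0, 2.0], dtype="float64[pyarrow]")
ser.where(ser > 1.5)  # masked slot becomes <NA> rather than nan
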
14 changes: 14 additions & 0 deletions pandas/io/json/_json.py
@@ -994,6 +994,13 @@ def _read_ujson(self) -> DataFrame | Series:
else:
obj = self._get_object_parser(self.data)
if self.dtype_backend is not lib.no_default:
if self.dtype_backend == "pyarrow":
# The construction above takes "null" to NaN, which we want to
# convert to NA. But .convert_dtypes to pyarrow doesn't allow
# that, so we do a 2-step conversion through numpy-nullable.
obj = obj.convert_dtypes(
infer_objects=False, dtype_backend="numpy_nullable"
)
return obj.convert_dtypes(
infer_objects=False, dtype_backend=self.dtype_backend
)
@@ -1071,6 +1078,13 @@ def __next__(self) -> DataFrame | Series:
raise ex

if self.dtype_backend is not lib.no_default:
if self.dtype_backend == "pyarrow":
# The construction above takes "null" to NaN, which we want to
# convert to NA. But .convert_dtypes to pyarrow doesn't allow
# that, so we do a 2-step conversion through numpy-nullable.
obj = obj.convert_dtypes(
infer_objects=False, dtype_backend="numpy_nullable"
)
return obj.convert_dtypes(
infer_objects=False, dtype_backend=self.dtype_backend
)
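Illustrative two-step conversion mirroring both hunks above: ujson parses JSON null to NaN, the numpy-nullable pass maps that NaN to pd.NA, and only then is the result converted to pyarrow so it arrives as a true null.

# illustration only, not part of the diff
import pandas as pd

obj = pd.DataFrame({"a": [1.0, float("nan")]})  # "null" came in as NaN
step1 = obj.convert_dtypes(infer_objects=False, dtype_backend="numpy_nullable")
# step1["a"]: Float64, with pd.NA in place of the NaN
step2 = step1.convert_dtypes(infer_objects=False, dtype_backend="pyarrow")
# step2["a"]: double[pyarrow] with a proper null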