def has_multiple_internal_dtypes(d: list[Any]) -> bool:
    """
    Check whether the items of ``d`` are of more than one Python type.

    Parameters
    ----------
    d : list
        Values to inspect.

    Returns
    -------
    bool
        ``False`` when ``d`` is empty or every item has exactly the same
        type as the first item, ``True`` otherwise.

    Notes
    -----
    Types are compared by identity, so an instance of a subclass counts as
    a different type (for example ``True`` next to ``1``).
    """
    if not d:
        return False

    # Look the reference type up once instead of on every iteration, and
    # compare with ``is not``: type objects are singletons, so identity is
    # the idiomatic exact-type check.
    first_type = type(d[0])
    return any(type(item) is not first_type for item in d[1:])
def test_describe_multiple_dtypes(self):
    """
    GH61707: describing a decimal extension-array Series must give an
    object-dtype result, because the integer count sits next to Decimal
    statistics.
    """
    from decimal import Decimal

    from pandas.tests.extension.decimal import to_decimal

    ser = Series(to_decimal([1, 2.5, 3]), dtype="decimal")
    result = ser.describe(percentiles=[])

    # Statistic label -> expected value, in the order describe() reports them.
    stats = {
        "count": 3,
        "mean": Decimal("2.166666666666666666666666667"),
        "std": Decimal("0.8498365855987974716713706849"),
        "min": Decimal("1"),
        "max": Decimal("3"),
    }
    expected = Series(
        list(stats.values()),
        index=list(stats),
        dtype="object",
    )

    tm.assert_series_equal(result, expected)