diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 1e7d66dfeb142..2282f38d8b4ce 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -77,7 +77,7 @@ By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it .. ipython:: python raw_cat = pd.Categorical( - ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=False + [None, "b", "c", None], categories=["b", "c", "d"], ordered=False ) s = pd.Series(raw_cat) s @@ -145,7 +145,7 @@ of :class:`~pandas.api.types.CategoricalDtype`. from pandas.api.types import CategoricalDtype - s = pd.Series(["a", "b", "c", "a"]) + s = pd.Series([None, "b", "c", None]) cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True) s_cat = s.astype(cat_type) s_cat diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 52038ad4b66c1..2efff35fa2c57 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -499,11 +499,14 @@ When using ``dtype=CategoricalDtype``, "unexpected" values outside of ``dtype.categories`` are treated as missing values. .. ipython:: python + :okwarning: dtype = CategoricalDtype(["a", "b", "d"]) # No 'c' pd.read_csv(StringIO(data), dtype={"col1": dtype}).col1 -This matches the behavior of :meth:`Categorical.set_categories`. +This matches the behavior of :meth:`Categorical.set_categories`. This behavior is +deprecated. In a future version, the presence of non-NA values that are not +among the specified categories will raise. .. note:: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d519400834ee1..287742546dba3 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -611,6 +611,7 @@ Other Deprecations - Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`) - Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`) - Deprecated ``pd.core.internals.api.maybe_infer_ndim`` (:issue:`40226`) +- Deprecated allowing constructing or casting to :class:`Categorical` with non-NA values that are not present in specified ``dtype.categories`` (:issue:`40996`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`) - Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`) - Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 78928713166f4..444be23ca9c7d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -11,6 +11,7 @@ cast, overload, ) +import warnings import numpy as np @@ -23,6 +24,7 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas.compat.numpy import function as nv +from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -478,7 +480,11 @@ def __init__( elif isinstance(values.dtype, CategoricalDtype): old_codes = extract_array(values)._codes codes = recode_for_categories( - old_codes, values.dtype.categories, dtype.categories, copy=copy + old_codes, + values.dtype.categories, + dtype.categories, + copy=copy, + warn=True, ) else: @@ -530,7 +536,12 @@ def _from_sequence( def _cast_pointwise_result(self, values) -> ArrayLike: res = super()._cast_pointwise_result(values) - cat = type(self)._from_sequence(res, dtype=self.dtype) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Constructing a Categorical with a dtype and values containing", + ) + cat = type(self)._from_sequence(res, dtype=self.dtype) if (cat.isna() == isna(res)).all(): # i.e. the conversion was non-lossy return cat @@ -567,6 +578,15 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: dtype = self.dtype.update_dtype(dtype) self = self.copy() if copy else self result = self._set_dtype(dtype, copy=False) + wrong = result.isna() & ~self.isna() + if wrong.any(): + warnings.warn( + "Constructing a Categorical with a dtype and values containing " + "non-null entries not in that dtype's categories is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) @@ -661,14 +681,16 @@ def _from_inferred_categories( if known_categories: # Recode from observation order to dtype.categories order. categories = dtype.categories - codes = recode_for_categories(inferred_codes, cats, categories, copy=False) + codes = recode_for_categories( + inferred_codes, cats, categories, copy=False, warn=True + ) elif not cats.is_monotonic_increasing: # Sort categories and recode for unknown categories. unsorted = cats.copy() categories = cats.sort_values() codes = recode_for_categories( - inferred_codes, unsorted, categories, copy=False + inferred_codes, unsorted, categories, copy=False, warn=True ) dtype = CategoricalDtype(categories, ordered=False) else: @@ -789,7 +811,7 @@ def categories(self) -> Index: >>> ser.cat.categories Index(['a', 'b', 'c'], dtype='str') - >>> raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"]) + >>> raw_cat = pd.Categorical([None, "b", "c", None], categories=["b", "c", "d"]) >>> ser = pd.Series(raw_cat) >>> ser.cat.categories Index(['b', 'c', 'd'], dtype='str') @@ -1097,7 +1119,7 @@ def set_categories( For :class:`pandas.Series`: >>> raw_cat = pd.Categorical( - ... ["a", "b", "c", "A"], categories=["a", "b", "c"], ordered=True + ... ["a", "b", "c", None], categories=["a", "b", "c"], ordered=True ... ) >>> ser = pd.Series(raw_cat) >>> ser @@ -1119,7 +1141,7 @@ def set_categories( For :class:`pandas.CategoricalIndex`: >>> ci = pd.CategoricalIndex( - ... ["a", "b", "c", "A"], categories=["a", "b", "c"], ordered=True + ... ["a", "b", "c", None], categories=["a", "b", "c"], ordered=True ... ) >>> ci CategoricalIndex(['a', 'b', 'c', nan], categories=['a', 'b', 'c'], @@ -1147,7 +1169,7 @@ def set_categories( codes = cat._codes else: codes = recode_for_categories( - cat.codes, cat.categories, new_dtype.categories, copy=False + cat.codes, cat.categories, new_dtype.categories, copy=False, warn=False ) NDArrayBacked.__init__(cat, codes, new_dtype) return cat @@ -2960,7 +2982,7 @@ def codes(self) -> Series: Examples -------- - >>> raw_cate = pd.Categorical(["a", "b", "c", "a"], categories=["a", "b"]) + >>> raw_cate = pd.Categorical(["a", "b", None, "a"], categories=["a", "b"]) >>> ser = pd.Series(raw_cate) >>> ser.cat.codes 0 0 @@ -2995,11 +3017,25 @@ def _get_codes_for_values( If `values` is known to be a Categorical, use recode_for_categories instead. """ codes = categories.get_indexer_for(values) + wrong = (codes == -1) & ~isna(values) + if wrong.any(): + warnings.warn( + "Constructing a Categorical with a dtype and values containing " + "non-null entries not in that dtype's categories is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return coerce_indexer_dtype(codes, categories) def recode_for_categories( - codes: np.ndarray, old_categories, new_categories, *, copy: bool + codes: np.ndarray, + old_categories, + new_categories, + *, + copy: bool = True, + warn: bool = False, ) -> np.ndarray: """ Convert a set of codes for to a new set of categories @@ -3010,6 +3046,8 @@ def recode_for_categories( old_categories, new_categories : Index copy: bool, default True Whether to copy if the codes are unchanged. + warn : bool, default False + Whether to warn on silent-NA mapping. Returns ------- @@ -3034,9 +3072,18 @@ def recode_for_categories( return codes.copy() return codes - indexer = coerce_indexer_dtype( - new_categories.get_indexer_for(old_categories), new_categories - ) + codes_in_old_cats = new_categories.get_indexer_for(old_categories) + if warn: + wrong = codes_in_old_cats == -1 + if wrong.any(): + warnings.warn( + "Constructing a Categorical with a dtype and values containing " + "non-null entries not in that dtype's categories is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + indexer = coerce_indexer_dtype(codes_in_old_cats, new_categories) new_codes = take_nd(indexer, codes, fill_value=-1) return new_codes diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index eb5c7739e5132..8cde865b46738 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -203,7 +203,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): Examples -------- >>> t = pd.CategoricalDtype(categories=["b", "a"], ordered=True) - >>> pd.Series(["a", "b", "a", "c"], dtype=t) + >>> pd.Series(["a", "b", "a", None], dtype=t) 0 a 1 b 2 a diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index eab221e4df2a9..efe5b1f4f7b42 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -718,7 +718,7 @@ def groups(self) -> dict[Hashable, Index]: return self.groupings[0].groups result_index, ids = self.result_index_and_ids values = result_index._values - categories = Categorical(ids, categories=range(len(result_index))) + categories = Categorical.from_codes(ids, categories=range(len(result_index))) result = { # mypy is not aware that group has to be an integer values[group]: self.axis.take(axis_ilocs) # type: ignore[call-overload] diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 92081d7c71236..bf4dd5a649ffe 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -22,7 +22,6 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, - isna, ) from pandas.core.arrays.categorical import ( @@ -258,6 +257,12 @@ def _is_dtype_compat(self, other: Index) -> Categorical: else: values = other + codes = self.categories.get_indexer(values) + if ((codes == -1) & ~values.isna()).any(): + # GH#37667 see test_equals_non_category + raise TypeError( + "categories must match existing categories when appending" + ) cat = Categorical(other, dtype=self.dtype) other = CategoricalIndex(cat) if not other.isin(values).all(): @@ -266,12 +271,6 @@ def _is_dtype_compat(self, other: Index) -> Categorical: ) cat = other._values - if not ((cat == values) | (isna(cat) & isna(values))).all(): - # GH#37667 see test_equals_non_category - raise TypeError( - "categories must match existing categories when appending" - ) - return cat def equals(self, other: object) -> bool: diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index efccb129ac6ef..d2d56e5232528 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -289,8 +289,16 @@ def test_set_categories(self): ], ) def test_set_categories_many(self, values, categories, new_categories, ordered): - c = Categorical(values, categories) - expected = Categorical(values, new_categories, ordered) + msg = "Constructing a Categorical with a dtype and values containing" + + warn1 = FutureWarning if set(values).difference(categories) else None + with tm.assert_produces_warning(warn1, match=msg): + c = Categorical(values, categories) + + warn2 = FutureWarning if set(values).difference(new_categories) else None + with tm.assert_produces_warning(warn2, match=msg): + expected = Categorical(values, new_categories, ordered) + result = c.set_categories(new_categories, ordered=ordered) tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index 7ed4da69f5a99..3bc72e475d012 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -121,8 +121,11 @@ def test_astype_category(self, dtype_ordered, ordered): # non-standard categories dtype = CategoricalDtype(list("adc"), dtype_ordered) - result = cat.astype(dtype) - expected = Categorical(data, dtype=dtype) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = cat.astype(dtype) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = Categorical(data, dtype=dtype) tm.assert_categorical_equal(result, expected) if dtype_ordered is False: diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index cf2de894cc0c0..b80514455db67 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -228,14 +228,15 @@ def test_constructor(self): # two arrays # - when the first is an integer dtype and the second is not # - when the resulting codes are all -1/NaN - with tm.assert_produces_warning(None): + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"]) - with tm.assert_produces_warning(None): + with tm.assert_produces_warning(FutureWarning, match=msg): Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5]) # the next one are from the old docs - with tm.assert_produces_warning(None): + with tm.assert_produces_warning(FutureWarning, match=msg): Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) cat = Categorical([1, 2], categories=[1, 2, 3]) @@ -247,12 +248,16 @@ def test_constructor_with_existing_categories(self): # GH25318: constructing with pd.Series used to bogusly skip recoding # categories c0 = Categorical(["a", "b", "c", "a"]) - c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"]) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"]) - c2 = Categorical(c0, categories=c1.categories) + with tm.assert_produces_warning(FutureWarning, match=msg): + c2 = Categorical(c0, categories=c1.categories) tm.assert_categorical_equal(c1, c2) - c3 = Categorical(Series(c0), categories=c1.categories) + with tm.assert_produces_warning(FutureWarning, match=msg): + c3 = Categorical(Series(c0), categories=c1.categories) tm.assert_categorical_equal(c1, c3) def test_constructor_not_sequence(self): @@ -430,10 +435,13 @@ def test_constructor_dtype_and_others_raises(self): @pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]]) def test_constructor_str_category(self, categories, ordered): - result = Categorical( - ["a", "b"], categories=categories, ordered=ordered, dtype="category" - ) - expected = Categorical(["a", "b"], categories=categories, ordered=ordered) + warn = FutureWarning if categories == ["a", "c"] else None + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(warn, match=msg): + result = Categorical( + ["a", "b"], categories=categories, ordered=ordered, dtype="category" + ) + expected = Categorical(["a", "b"], categories=categories, ordered=ordered) tm.assert_categorical_equal(result, expected) def test_constructor_str_unknown(self): @@ -450,10 +458,12 @@ def test_constructor_np_strs(self): def test_constructor_from_categorical_with_dtype(self): dtype = CategoricalDtype(["a", "b", "c"], ordered=True) values = Categorical(["a", "b", "d"]) - result = Categorical(values, dtype=dtype) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Categorical(values, dtype=dtype) # We use dtype.categories, not values.categories expected = Categorical( - ["a", "b", "d"], categories=["a", "b", "c"], ordered=True + ["a", "b", None], categories=["a", "b", "c"], ordered=True ) tm.assert_categorical_equal(result, expected) @@ -470,16 +480,19 @@ def test_constructor_from_categorical_with_unknown_dtype(self): def test_constructor_from_categorical_string(self): values = Categorical(["a", "b", "d"]) # use categories, ordered - result = Categorical( - values, categories=["a", "b", "c"], ordered=True, dtype="category" - ) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Categorical( + values, categories=["a", "b", "c"], ordered=True, dtype="category" + ) expected = Categorical( - ["a", "b", "d"], categories=["a", "b", "c"], ordered=True + ["a", "b", None], categories=["a", "b", "c"], ordered=True ) tm.assert_categorical_equal(result, expected) # No string - result = Categorical(values, categories=["a", "b", "c"], ordered=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Categorical(values, categories=["a", "b", "c"], ordered=True) tm.assert_categorical_equal(result, expected) def test_constructor_with_categorical_categories(self): @@ -661,9 +674,13 @@ def test_from_inferred_categories_dtype(self): cats = ["a", "b", "d"] codes = np.array([0, 1, 0, 2], dtype="i8") dtype = CategoricalDtype(["c", "b", "a"], ordered=True) - result = Categorical._from_inferred_categories(cats, codes, dtype) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical( - ["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True + ["a", "b", "a", None], categories=["c", "b", "a"], ordered=True ) tm.assert_categorical_equal(result, expected) @@ -671,7 +688,11 @@ def test_from_inferred_categories_coerces(self): cats = ["1", "2", "bad"] codes = np.array([0, 0, 1, 2], dtype="i8") dtype = CategoricalDtype([1, 2]) - result = Categorical._from_inferred_categories(cats, codes, dtype) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical([1, 1, 2, np.nan]) tm.assert_categorical_equal(result, expected) @@ -722,7 +743,9 @@ def test_interval(self): # extra values = pd.interval_range(8, 11, periods=3) - cat = Categorical(values, categories=idx) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + cat = Categorical(values, categories=idx) expected_codes = np.array([8, 9, -1], dtype="int8") tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index daacf4c69a8a9..585e964e5c8b2 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -59,38 +59,45 @@ def test_set_dtype_new_categories(self): tm.assert_index_equal(result.dtype.categories, Index(list("abcd"))) @pytest.mark.parametrize( - "values, categories, new_categories", + "values, categories, new_categories, warn", [ # No NaNs, same cats, same order - (["a", "b", "a"], ["a", "b"], ["a", "b"]), + (["a", "b", "a"], ["a", "b"], ["a", "b"], None), # No NaNs, same cats, different order - (["a", "b", "a"], ["a", "b"], ["b", "a"]), + (["a", "b", "a"], ["a", "b"], ["b", "a"], None), # Same, unsorted - (["b", "a", "a"], ["a", "b"], ["a", "b"]), + (["b", "a", "a"], ["a", "b"], ["a", "b"], None), # No NaNs, same cats, different order - (["b", "a", "a"], ["a", "b"], ["b", "a"]), + (["b", "a", "a"], ["a", "b"], ["b", "a"], None), # NaNs - (["a", "b", "c"], ["a", "b"], ["a", "b"]), - (["a", "b", "c"], ["a", "b"], ["b", "a"]), - (["b", "a", "c"], ["a", "b"], ["a", "b"]), - (["b", "a", "c"], ["a", "b"], ["b", "a"]), + (["a", "b", "c"], ["a", "b"], ["a", "b"], None), + (["a", "b", "c"], ["a", "b"], ["b", "a"], None), + (["b", "a", "c"], ["a", "b"], ["a", "b"], None), + (["b", "a", "c"], ["a", "b"], ["b", "a"], None), # Introduce NaNs - (["a", "b", "c"], ["a", "b"], ["a"]), - (["a", "b", "c"], ["a", "b"], ["b"]), - (["b", "a", "c"], ["a", "b"], ["a"]), - (["b", "a", "c"], ["a", "b"], ["b"]), + (["a", "b", "c"], ["a", "b"], ["a"], FutureWarning), + (["a", "b", "c"], ["a", "b"], ["b"], FutureWarning), + (["b", "a", "c"], ["a", "b"], ["a"], FutureWarning), + (["b", "a", "c"], ["a", "b"], ["b"], FutureWarning), # No overlap - (["a", "b", "c"], ["a", "b"], ["d", "e"]), + (["a", "b", "c"], ["a", "b"], ["d", "e"], FutureWarning), ], ) - def test_set_dtype_many(self, values, categories, new_categories, ordered): - c = Categorical(values, categories) - expected = Categorical(values, new_categories, ordered) + def test_set_dtype_many(self, values, categories, new_categories, warn, ordered): + msg = "Constructing a Categorical with a dtype and values containing" + warn1 = FutureWarning if set(values).difference(categories) else None + with tm.assert_produces_warning(warn1, match=msg): + c = Categorical(values, categories) + warn2 = FutureWarning if set(values).difference(new_categories) else None + with tm.assert_produces_warning(warn2, match=msg): + expected = Categorical(values, new_categories, ordered) result = c._set_dtype(expected.dtype, copy=True) tm.assert_categorical_equal(result, expected) def test_set_dtype_no_overlap(self): - c = Categorical(["a", "b", "c"], ["d", "e"]) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + c = Categorical(["a", "b", "c"], ["d", "e"]) result = c._set_dtype(CategoricalDtype(["a", "b"]), copy=True) expected = Categorical([None, None, None], categories=["a", "b"]) tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 33c55b2090bd6..994e7481d4132 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -223,7 +223,7 @@ def test_categories_assignments_wrong_length_raises(self, new_categories): @pytest.mark.parametrize("dtype", [None, "category", "key"]) def test_get_indexer_non_unique(self, idx_values, key_values, key_class, dtype): # GH 21448 - key = key_class(key_values, categories=range(1, 5)) + key = key_class(key_values, categories=range(1, 6)) if dtype == "key": dtype = key.dtype diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 541b271098152..9e4136f05de6a 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -29,8 +29,9 @@ def test_na_flags_int_categories(self): categories = list(range(10)) labels = np.random.default_rng(2).integers(0, 10, 20) labels[::5] = -1 - - cat = Categorical(labels, categories) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + cat = Categorical(labels, categories) repr(cat) tm.assert_numpy_array_equal(isna(cat), labels == -1) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index c9d3f83ce9237..eebfe39e89784 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1,4 +1,5 @@ import re +import warnings import weakref import numpy as np @@ -121,7 +122,9 @@ def test_constructor_invalid(self): dtype1 = CategoricalDtype(["a", "b"], ordered=True) dtype2 = CategoricalDtype(["x", "y"], ordered=False) - c = Categorical([0, 1], dtype=dtype1) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + c = Categorical([0, 1], dtype=dtype1) @pytest.mark.parametrize( "values, categories, ordered, dtype, expected", diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b83a09e7f2e18..76c8f094fe389 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1676,7 +1676,7 @@ def test_categorical(self): result = lib.infer_dtype(Series(arr), skipna=True) assert result == "categorical" - arr = Categorical(list("abc"), categories=["cegfab"], ordered=True) + arr = Categorical([None, None, None], categories=["cegfab"], ordered=True) result = lib.infer_dtype(arr, skipna=True) assert result == "categorical" diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 59cc0eab2f62e..22cf8d7e1c22f 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -304,6 +304,9 @@ def test_astype_duplicate_col_series_arg(self): ], ids=repr, ) + @pytest.mark.filterwarnings( + "ignore:Constructing a Categorical with a dtype and values:FutureWarning" + ) def test_astype_categorical(self, dtype): # GH#18099 d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")} diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 33510abac6ab6..cc1b3fd48c244 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -255,8 +255,8 @@ def test_unstack_fill_frame_categorical(self): result = data.unstack() expected = DataFrame( { - "a": pd.Categorical(list("axa"), categories=list("abc")), - "b": pd.Categorical(list("bcx"), categories=list("abc")), + "a": pd.Categorical(["a", None, "a"], categories=list("abc")), + "b": pd.Categorical(["b", "c", None], categories=list("abc")), }, index=list("xyz"), ) diff --git a/pandas/tests/indexes/categorical/test_astype.py b/pandas/tests/indexes/categorical/test_astype.py index a17627b7515b2..0fa1e66ed8691 100644 --- a/pandas/tests/indexes/categorical/test_astype.py +++ b/pandas/tests/indexes/categorical/test_astype.py @@ -63,8 +63,11 @@ def test_astype_category(self, name, dtype_ordered, index_ordered): # non-standard categories dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered) - result = index.astype(dtype) - expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = index.astype(dtype) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype) tm.assert_index_equal(result, expected) if dtype_ordered is False: diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 58e4649cb331b..a9eaf0f88f385 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -124,11 +124,11 @@ def test_has_duplicates(self): assert idx.is_unique is False assert idx.has_duplicates is True - idx = CategoricalIndex([0, 1], categories=[2, 3], name="foo") + idx = CategoricalIndex([None, None], categories=[2, 3], name="foo") assert idx.is_unique is False assert idx.has_duplicates is True - idx = CategoricalIndex([0, 1, 2, 3], categories=[1, 2, 3], name="foo") + idx = CategoricalIndex([None, 1, 2, 3], categories=[1, 2, 3], name="foo") assert idx.is_unique is True assert idx.has_duplicates is False @@ -145,7 +145,7 @@ def test_has_duplicates(self): }, ), ( - [1, 1, 1], + [None, None, None], list("abc"), { "first": np.array([False, True, True]), @@ -154,7 +154,7 @@ def test_has_duplicates(self): }, ), ( - [2, "a", "b"], + [None, "a", "b"], list("abc"), { "first": np.zeros(shape=(3), dtype=np.bool_), @@ -193,8 +193,11 @@ def test_drop_duplicates(self, data, categories, expected): def test_unique(self, data, categories, expected_data, ordered): dtype = CategoricalDtype(categories, ordered=ordered) - idx = CategoricalIndex(data, dtype=dtype) - expected = CategoricalIndex(expected_data, dtype=dtype) + msg = "Constructing a Categorical with a dtype and values containing" + warn = None if expected_data == [1] else FutureWarning + with tm.assert_produces_warning(warn, match=msg): + idx = CategoricalIndex(data, dtype=dtype) + expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) def test_repr_roundtrip(self): diff --git a/pandas/tests/indexes/categorical/test_constructors.py b/pandas/tests/indexes/categorical/test_constructors.py index f0c5307fc5c64..5c1476dc368fb 100644 --- a/pandas/tests/indexes/categorical/test_constructors.py +++ b/pandas/tests/indexes/categorical/test_constructors.py @@ -66,24 +66,29 @@ def test_construction(self): ) assert not result.ordered - result = CategoricalIndex(ci, categories=list("ab")) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = CategoricalIndex(ci, categories=list("ab")) tm.assert_index_equal(result.categories, Index(list("ab"))) tm.assert_numpy_array_equal( result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") ) assert not result.ordered - result = CategoricalIndex(ci, categories=list("ab"), ordered=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = CategoricalIndex(ci, categories=list("ab"), ordered=True) tm.assert_index_equal(result.categories, Index(list("ab"))) tm.assert_numpy_array_equal( result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") ) assert result.ordered - result = CategoricalIndex(ci, categories=list("ab"), ordered=True) - expected = CategoricalIndex( - ci, categories=list("ab"), ordered=True, dtype="category" - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = CategoricalIndex(ci, categories=list("ab"), ordered=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = CategoricalIndex( + ci, categories=list("ab"), ordered=True, dtype="category" + ) tm.assert_index_equal(result, expected, exact=True) # turn me to an Index diff --git a/pandas/tests/indexes/categorical/test_equals.py b/pandas/tests/indexes/categorical/test_equals.py index a8353f301a3c3..6d398a3def19d 100644 --- a/pandas/tests/indexes/categorical/test_equals.py +++ b/pandas/tests/indexes/categorical/test_equals.py @@ -68,7 +68,7 @@ def test_equals_categorical_unordered(self): # https://github.com/pandas-dev/pandas/issues/16603 a = CategoricalIndex(["A"], categories=["A", "B"]) b = CategoricalIndex(["A"], categories=["B", "A"]) - c = CategoricalIndex(["C"], categories=["B", "A"]) + c = CategoricalIndex([None], categories=["B", "A"]) assert a.equals(b) assert not a.equals(c) assert not b.equals(c) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index dde5f38074efb..a28ea505a2af7 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -47,8 +47,13 @@ def test_astype_category(self, index): # non-default params categories = index.dropna().unique().values[:-1] dtype = CategoricalDtype(categories=categories, ordered=True) - result = index.astype(dtype) - expected = CategoricalIndex(index.values, categories=categories, ordered=True) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = index.astype(dtype) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = CategoricalIndex( + index.values, categories=categories, ordered=True + ) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 3ba19b2a4b254..c2d1b0ff3f024 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -719,8 +719,11 @@ def test_astype_category(self, copy, name, ordered, simple_index): # non-standard categories dtype = CategoricalDtype(idx.unique().tolist()[:-1], ordered) - result = idx.astype(dtype, copy=copy) - expected = CategoricalIndex(idx, name=name, dtype=dtype) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = idx.astype(dtype, copy=copy) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = CategoricalIndex(idx, name=name, dtype=dtype) tm.assert_index_equal(result, expected, exact=True) if ordered is False: diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index bf746a9eaa976..a61a4f6ea651e 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -62,6 +62,9 @@ def test_categorical_pyarrow(): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:Constructing a Categorical with a dtype and values containing:FutureWarning" +) def test_empty_categorical_pyarrow(): # https://github.com/pandas-dev/pandas/issues/53077 pa = pytest.importorskip("pyarrow", "11.0.0") diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index 15cbac54ff8d9..323cb58fac063 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -329,7 +329,9 @@ def test_categorical_unexpected_categories(all_parsers): dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} data = "b\nd\na\nc\nd" # Unexpected c - expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) + expected = DataFrame({"b": Categorical(["d", "a", None, "d"], dtype=dtype["b"])}) - result = parser.read_csv(StringIO(data), dtype=dtype) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): + result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4fe3a97cb2386..8d9045c383f9e 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -798,7 +798,7 @@ def test_categorical(self, pa): ), # test for ordered flag "c": pd.Categorical( - ["a", "b", "c", "a", "c", "b"], + [None, "b", "c", None, "c", "b"], categories=["b", "c", "d"], ordered=True, ), diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index 8e6a14e6bfb8f..ac621901f60fb 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -141,9 +141,11 @@ def test_categorical_index_preserver(self): tm.assert_frame_equal(result, expected) # wrong categories -> uses concat_compat, which casts to object - df3 = DataFrame( - {"A": a, "B": Categorical(b, categories=list("abe"))} - ).set_index("B") + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + df3 = DataFrame( + {"A": a, "B": Categorical(b, categories=list("abe"))} + ).set_index("B") result = pd.concat([df2, df3]) expected = pd.concat( [ diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 4a7e204ee4161..e187803455b4f 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -624,8 +624,11 @@ def test_astype_categorical_to_categorical( # different categories dtype = CategoricalDtype(list("adc"), dtype_ordered) - result = ser.astype(dtype) - expected = Series(s_data, name=name, dtype=dtype) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.astype(dtype) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = Series(s_data, name=name, dtype=dtype) tm.assert_series_equal(result, expected) if dtype_ordered is False: diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 6d991235958af..d81c1f820d856 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -392,7 +392,9 @@ def test_constructor_map(self): tm.assert_series_equal(result, exp) def test_constructor_categorical(self): - cat = Categorical([0, 1, 2, 0, 1, 2], ["a", "b", "c"]) + msg = "Constructing a Categorical with a dtype and values containing" + with tm.assert_produces_warning(FutureWarning, match=msg): + cat = Categorical([0, 1, 2, 0, 1, 2], ["a", "b", "c"]) res = Series(cat) tm.assert_categorical_equal(res.values, cat) @@ -536,7 +538,7 @@ def test_categorical_sideeffects_free(self): tm.assert_numpy_array_equal(s.__array__(), exp_s2) def test_unordered_compare_equal(self): - left = Series(["a", "b", "c"], dtype=CategoricalDtype(["a", "b"])) + left = Series(["a", "b", None], dtype=CategoricalDtype(["a", "b"])) right = Series(Categorical(["a", "b", np.nan], categories=["a", "b"])) tm.assert_series_equal(left, right)