From 7ba9c71d4d5a576b8496fded07b706110649454b Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Wed, 16 Apr 2025 10:25:08 -0400 Subject: [PATCH] ENH(string dtype): fallback for HDF5 with UTF-8 surrogates (#60993) --- pandas/io/pytables.py | 114 ++++++++++++++++++------- pandas/tests/io/pytables/test_store.py | 22 ++--- 2 files changed, 96 insertions(+), 40 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 65f95dab7b42f..c520b8d606ad7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -40,6 +40,7 @@ ) from pandas._libs.lib import is_string_array from pandas._libs.tslibs import timezones +from pandas.compat import HAS_PYARROW from pandas.compat._optional import import_optional_dependency from pandas.compat.pickle_compat import patch_pickle from pandas.errors import ( @@ -391,6 +392,13 @@ def read_hdf( DataFrame.to_hdf : Write a HDF file from a DataFrame. HDFStore : Low-level access to HDF files. + Notes + ----- + When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true, + and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding + to UTF-8, the resulting dtype will be + ``pd.StringDtype(storage="python", na_value=np.nan)``. + Examples -------- >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP @@ -2182,6 +2190,20 @@ def convert( # making an Index instance could throw a number of different errors try: new_pd_index = factory(values, **kwargs) + except UnicodeEncodeError as err: + if ( + errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + new_pd_index = factory( + values, + dtype=StringDtype(storage="python", na_value=np.nan), + **kwargs, + ) + else: + raise except ValueError: # if the output freq is different that what we recorded, # it should be None (see also 'doc example part 2') @@ -3097,12 +3119,29 @@ def read_index_node( **kwargs, ) else: - index = factory( - _unconvert_index( - data, kind, encoding=self.encoding, errors=self.errors - ), - **kwargs, - ) + try: + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + **kwargs, + ) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + dtype=StringDtype(storage="python", na_value=np.nan), + **kwargs, + ) + else: + raise index.name = name @@ -3236,13 +3275,24 @@ def read( self.validate_read(columns, where) index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) - result = Series(values, index=index, name=self.name, copy=False) - if ( - using_string_dtype() - and isinstance(values, np.ndarray) - and is_string_array(values, skipna=True) - ): - result = result.astype(StringDtype(na_value=np.nan)) + try: + result = Series(values, index=index, name=self.name, copy=False) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + result = Series( + values, + index=index, + name=self.name, + copy=False, + dtype=StringDtype(storage="python", na_value=np.nan), + ) + else: + raise return result def write(self, obj, **kwargs) -> None: @@ -4704,7 +4754,24 @@ def read( values = values.reshape((1, values.shape[0])) if isinstance(values, np.ndarray): - df = DataFrame(values.T, columns=cols_, index=index_, copy=False) + try: + df = DataFrame(values.T, columns=cols_, index=index_, copy=False) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + df = DataFrame( + values.T, + columns=cols_, + index=index_, + copy=False, + dtype=StringDtype(storage="python", na_value=np.nan), + ) + else: + raise elif isinstance(values, Index): df = DataFrame(values, columns=cols_, index=index_) else: @@ -4714,23 +4781,10 @@ def read( assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) # If str / string dtype is stored in meta, use that. - converted = False for column in cols_: dtype = getattr(self.table.attrs, f"{column}_meta", None) if dtype in ["str", "string"]: df[column] = df[column].astype(dtype) - converted = True - # Otherwise try inference. - if ( - not converted - and using_string_dtype() - and isinstance(values, np.ndarray) - and is_string_array( - values, - skipna=True, - ) - ): - df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) if len(frames) == 1: @@ -5194,7 +5248,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd # encode if needed if len(data): data = ( - Series(data.ravel(), copy=False) + Series(data.ravel(), copy=False, dtype="object") .str.encode(encoding, errors) ._values.reshape(data.shape) ) @@ -5234,7 +5288,9 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - ser = Series(data, copy=False).str.decode(encoding, errors=errors) + ser = Series(data, copy=False).str.decode( + encoding, errors=errors, dtype="object" + ) data = ser.to_numpy() data.flags.writeable = True else: diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index f51d61e2d633c..93160d9df353b 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -7,10 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd from pandas import ( DataFrame, @@ -398,20 +394,24 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) -@pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, - reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed", -) @pytest.mark.parametrize("format", ["fixed", "table"]) -def test_to_hdf_errors(tmp_path, format, setup_path): +def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string): data = ["\ud800foo"] - ser = Series(data, index=Index(data)) + ser = Series(data, index=Index(data, dtype="object"), dtype="object") path = tmp_path / setup_path # GH 20835 ser.to_hdf(path, key="table", format=format, errors="surrogatepass") result = read_hdf(path, "table", errors="surrogatepass") - tm.assert_series_equal(result, ser) + + if using_infer_string: + # https://github.com/pandas-dev/pandas/pull/60993 + # Surrogates fallback to python storage. + dtype = pd.StringDtype(storage="python", na_value=np.nan) + else: + dtype = "object" + expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype) + tm.assert_series_equal(result, expected) def test_create_table_index(setup_path):