From 6903f803041062904a0a3ce37b5f031597cbd0b3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 20 Feb 2024 15:50:57 -0600 Subject: [PATCH] Add support for arrow `large_string` in `cudf` (#15093) This PR adds support for `arrow` arrays of `large_string` type in `cudf`. `cudf` strings columns lack 64-bit offset support; adding it is a work in progress: https://github.com/rapidsai/cudf/issues/13733 This workaround is essential because `pandas-2.2+` now defaults to the `large_string` type for arrow strings instead of the `string` type: https://github.com/pandas-dev/pandas/pull/56220 This PR fixes all 25 `dask-cudf` failures. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/15093 --- python/cudf/cudf/core/column/column.py | 7 +++++++ python/cudf/cudf/tests/test_series.py | 8 ++++++++ python/cudf/cudf/utils/dtypes.py | 2 ++ 3 files changed, 17 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f665d83964c..191c55a8a68 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1920,6 +1920,13 @@ def as_column( return col elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)): + if pa.types.is_large_string(arbitrary.type): + # Pandas-2.2+: Pandas defaults to `large_string` type + # instead of `string` without data-introspection. 
+ # Temporary workaround until cudf has native + # support for `LARGE_STRING` i.e., 64 bit offsets + arbitrary = arbitrary.cast(pa.string()) + if pa.types.is_float16(arbitrary.type): raise NotImplementedError( "Type casting from `float16` to `float32` is not " diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 252343391be..caf8947e3b0 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2700,3 +2700,11 @@ def test_series_dtype_astypes(data): result = cudf.Series(data, dtype="float64") expected = cudf.Series([1.0, 2.0, 3.0]) assert_eq(result, expected) + + +def test_series_from_large_string(): + pa_large_string_array = pa.array(["a", "b", "c"]).cast(pa.large_string()) + got = cudf.Series(pa_large_string_array) + expected = pd.Series(pa_large_string_array) + + assert_eq(expected, got) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 8fa4a230e2c..c8aca94ba19 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -213,6 +213,8 @@ def cudf_dtype_from_pa_type(typ): return cudf.core.dtypes.StructDtype.from_arrow(typ) elif pa.types.is_decimal(typ): return cudf.core.dtypes.Decimal128Dtype.from_arrow(typ) + elif pa.types.is_large_string(typ): + return cudf.dtype("str") else: return cudf.api.types.pandas_dtype(typ.to_pandas_dtype())