Remove cudf.Scalar from factorize (#17897)

Toward #17843

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17897
rapidsai · Feb 4, 2025 · 36b0f3a
1 parent acbcf45
commit 36b0f3a
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 16 deletions.
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
@@ -1,25 +1,30 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 from __future__ import annotations
 
 import warnings
 from typing import TYPE_CHECKING
 
 import cupy as cp
 import numpy as np
+import pyarrow as pa
 
 import cudf
 from cudf.core.column import as_column
 from cudf.core.index import Index, RangeIndex
-from cudf.core.scalar import Scalar
 from cudf.options import get_option
-from cudf.utils.dtypes import can_convert_to_column
+from cudf.utils.dtypes import can_convert_to_column, cudf_dtype_to_pa_type
 
 if TYPE_CHECKING:
     from cudf.core.column.column import ColumnBase
     from cudf.core.index import BaseIndex
 
 
-def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
+def factorize(
+    values,
+    sort: bool = False,
+    use_na_sentinel: bool = True,
+    size_hint: int | None = None,
+) -> tuple[cp.ndarray, cp.ndarray | Index]:
     """Encode the input values as integer labels
 
     Parameters
@@ -96,10 +101,10 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
         warnings.warn("size_hint is not applicable for cudf.factorize")
 
     if use_na_sentinel:
-        na_sentinel = Scalar(-1)
+        na_sentinel = pa.scalar(-1)
         cats = values.dropna()
     else:
-        na_sentinel = Scalar(None, dtype=values.dtype)
+        na_sentinel = pa.scalar(None, type=cudf_dtype_to_pa_type(values.dtype))
         cats = values
 
     cats = cats.unique().astype(values.dtype)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -1250,7 +1250,7 @@ def as_categorical_column(self, dtype) -> ColumnBase:
         cats = self.unique().sort_values()
         label_dtype = min_unsigned_type(len(cats))
         labels = self._label_encoding(
-            cats=cats, dtype=label_dtype, na_sentinel=cudf.Scalar(1)
+            cats=cats, dtype=label_dtype, na_sentinel=pa.scalar(1)
         )
         # columns include null index in factorization; remove:
         if self.has_nulls():
@@ -1561,7 +1561,7 @@ def _label_encoding(
         self,
         cats: ColumnBase,
         dtype: Dtype | None = None,
-        na_sentinel: cudf.Scalar | None = None,
+        na_sentinel: pa.Scalar | None = None,
     ):
         """
         Convert each value in `self` into an integer code, with `cats`
@@ -1592,14 +1592,14 @@ def _label_encoding(
         ]
         dtype: int8
         """
-        if na_sentinel is None or na_sentinel.value is cudf.NA:
-            na_sentinel = cudf.Scalar(-1)
+        if na_sentinel is None or not na_sentinel.is_valid:
+            na_sentinel = pa.scalar(-1)
 
         def _return_sentinel_column():
             return as_column(na_sentinel, dtype=dtype, length=len(self))
 
         if dtype is None:
-            dtype = min_signed_type(max(len(cats), na_sentinel.value), 8)
+            dtype = min_signed_type(max(len(cats), na_sentinel.as_py()), 8)
 
         if is_mixed_with_object_dtype(self, cats):
             return _return_sentinel_column()
@@ -1631,7 +1631,7 @@ def _return_sentinel_column():
         (codes,) = sorting.sort_by_key(
             [codes], [left_gather_map], [True], ["last"], stable=True
         )
-        return codes.fillna(na_sentinel.value)
+        return codes.fillna(na_sentinel)
 
     @acquire_spill_lock()
     def copy_if_else(
@@ -2265,11 +2265,21 @@ def as_column(
             if dtype is None:
                 dtype = getattr(arbitrary, "dtype", cudf.dtype("float64"))
             arbitrary = None
-        arbitrary = cudf.Scalar(arbitrary, dtype=dtype)
-        if length == 0:
-            return column_empty(length, dtype=arbitrary.dtype)
+        if isinstance(arbitrary, pa.Scalar):
+            col = ColumnBase.from_pylibcudf(
+                plc.Column.from_scalar(
+                    pa_scalar_to_plc_scalar(arbitrary), length
+                )
+            )
+            if dtype is not None:
+                col = col.astype(dtype)
+            return col
         else:
-            return ColumnBase.from_scalar(arbitrary, length)
+            arbitrary = cudf.Scalar(arbitrary, dtype=dtype)
+            if length == 0:
+                return column_empty(length, dtype=arbitrary.dtype)
+            else:
+                return ColumnBase.from_scalar(arbitrary, length)
 
     elif hasattr(arbitrary, "__array_interface__"):
         desc = arbitrary.__array_interface__

diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
@@ -25,6 +25,7 @@
 from cudf.core.missing import NA, NaT
 from cudf.core.mixins import BinaryOperand
 from cudf.utils.dtypes import (
+    cudf_dtype_from_pa_type,
     get_allowed_combinations_for_operator,
     to_cudf_compatible_scalar,
 )
@@ -76,6 +77,13 @@ def _preprocess_host_value(value, dtype) -> tuple[ScalarLike, Dtype]:
         else:
             return NA, dtype
 
+    if isinstance(value, pa.Scalar):
+        # TODO: Avoid converting to a Python scalar since we
+        # end up converting pyarrow.Scalars to pylibcudf.Scalars
+        if dtype is None:
+            dtype = cudf_dtype_from_pa_type(value.type)
+        return value.as_py(), dtype
+
     if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
         value = pa.scalar(
             value, type=pa.decimal128(dtype.precision, dtype.scale)