Skip to content

Commit

Permalink
Remove cudf.Scalar from factorize (#17897)
Browse files (browse the repository at this point in the history)
Toward #17843

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17897
  • Loading branch information
mroeschke authored Feb 4, 2025
1 parent acbcf45 commit 36b0f3a
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 16 deletions.
17 changes: 11 additions & 6 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,30 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import cupy as cp
import numpy as np
import pyarrow as pa

import cudf
from cudf.core.column import as_column
from cudf.core.index import Index, RangeIndex
from cudf.core.scalar import Scalar
from cudf.options import get_option
from cudf.utils.dtypes import can_convert_to_column
from cudf.utils.dtypes import can_convert_to_column, cudf_dtype_to_pa_type

if TYPE_CHECKING:
from cudf.core.column.column import ColumnBase
from cudf.core.index import BaseIndex


def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
def factorize(
values,
sort: bool = False,
use_na_sentinel: bool = True,
size_hint: int | None = None,
) -> tuple[cp.ndarray, cp.ndarray | Index]:
"""Encode the input values as integer labels
Parameters
Expand Down Expand Up @@ -96,10 +101,10 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
warnings.warn("size_hint is not applicable for cudf.factorize")

if use_na_sentinel:
na_sentinel = Scalar(-1)
na_sentinel = pa.scalar(-1)
cats = values.dropna()
else:
na_sentinel = Scalar(None, dtype=values.dtype)
na_sentinel = pa.scalar(None, type=cudf_dtype_to_pa_type(values.dtype))
cats = values

cats = cats.unique().astype(values.dtype)
Expand Down
30 changes: 20 additions & 10 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1250,7 +1250,7 @@ def as_categorical_column(self, dtype) -> ColumnBase:
cats = self.unique().sort_values()
label_dtype = min_unsigned_type(len(cats))
labels = self._label_encoding(
cats=cats, dtype=label_dtype, na_sentinel=cudf.Scalar(1)
cats=cats, dtype=label_dtype, na_sentinel=pa.scalar(1)
)
# columns include null index in factorization; remove:
if self.has_nulls():
Expand Down Expand Up @@ -1561,7 +1561,7 @@ def _label_encoding(
self,
cats: ColumnBase,
dtype: Dtype | None = None,
na_sentinel: cudf.Scalar | None = None,
na_sentinel: pa.Scalar | None = None,
):
"""
Convert each value in `self` into an integer code, with `cats`
Expand Down Expand Up @@ -1592,14 +1592,14 @@ def _label_encoding(
]
dtype: int8
"""
if na_sentinel is None or na_sentinel.value is cudf.NA:
na_sentinel = cudf.Scalar(-1)
if na_sentinel is None or not na_sentinel.is_valid:
na_sentinel = pa.scalar(-1)

def _return_sentinel_column():
return as_column(na_sentinel, dtype=dtype, length=len(self))

if dtype is None:
dtype = min_signed_type(max(len(cats), na_sentinel.value), 8)
dtype = min_signed_type(max(len(cats), na_sentinel.as_py()), 8)

if is_mixed_with_object_dtype(self, cats):
return _return_sentinel_column()
Expand Down Expand Up @@ -1631,7 +1631,7 @@ def _return_sentinel_column():
(codes,) = sorting.sort_by_key(
[codes], [left_gather_map], [True], ["last"], stable=True
)
return codes.fillna(na_sentinel.value)
return codes.fillna(na_sentinel)

@acquire_spill_lock()
def copy_if_else(
Expand Down Expand Up @@ -2265,11 +2265,21 @@ def as_column(
if dtype is None:
dtype = getattr(arbitrary, "dtype", cudf.dtype("float64"))
arbitrary = None
arbitrary = cudf.Scalar(arbitrary, dtype=dtype)
if length == 0:
return column_empty(length, dtype=arbitrary.dtype)
if isinstance(arbitrary, pa.Scalar):
col = ColumnBase.from_pylibcudf(
plc.Column.from_scalar(
pa_scalar_to_plc_scalar(arbitrary), length
)
)
if dtype is not None:
col = col.astype(dtype)
return col
else:
return ColumnBase.from_scalar(arbitrary, length)
arbitrary = cudf.Scalar(arbitrary, dtype=dtype)
if length == 0:
return column_empty(length, dtype=arbitrary.dtype)
else:
return ColumnBase.from_scalar(arbitrary, length)

elif hasattr(arbitrary, "__array_interface__"):
desc = arbitrary.__array_interface__
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/scalar.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from cudf.core.missing import NA, NaT
from cudf.core.mixins import BinaryOperand
from cudf.utils.dtypes import (
cudf_dtype_from_pa_type,
get_allowed_combinations_for_operator,
to_cudf_compatible_scalar,
)
Expand Down Expand Up @@ -76,6 +77,13 @@ def _preprocess_host_value(value, dtype) -> tuple[ScalarLike, Dtype]:
else:
return NA, dtype

if isinstance(value, pa.Scalar):
# TODO: Avoid converting to a Python scalar since we
# end up converting pyarrow.Scalars to pylibcudf.Scalars
if dtype is None:
dtype = cudf_dtype_from_pa_type(value.type)
return value.as_py(), dtype

if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
value = pa.scalar(
value, type=pa.decimal128(dtype.precision, dtype.scale)
Expand Down

0 comments on commit 36b0f3a

Please sign in to comment.