Switch to using gpu_struct decorator
shwina committed Jan 7, 2025
1 parent cf6f679 commit 33b6a96
Showing 6 changed files with 130 additions and 105 deletions.
7 changes: 6 additions & 1 deletion python/cuda_parallel/cuda/parallel/experimental/_cccl.py
@@ -215,4 +215,9 @@ def to_cccl_iter(array_or_iterator) -> Iterator:

 def host_array_to_value(array: np.ndarray) -> Value:
     info = _numpy_type_to_info(array.dtype)
-    return Value(info, array.ctypes.data)
+    if isinstance(array, np.ndarray):
+        data = array.ctypes.data
+    else:
+        # it's a gpudataclass:
+        data = ctypes.cast(ctypes.pointer(array._data), ctypes.c_void_p)
+    return Value(info, data)
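For context, a minimal sketch of how the two branches above obtain a host pointer (illustrative only; the `Pixel` definition mirrors the updated test further down):

```python
import ctypes
import numpy as np

from cuda.parallel.experimental.gpu_struct import gpu_struct

# NumPy path: the host pointer is simply the array's buffer address.
h_init_arr = np.zeros(1, dtype=np.dtype([("r", "i4"), ("g", "i4"), ("b", "i4")]))
ptr_from_array = h_init_arr.ctypes.data  # integer address of the host buffer


# gpu_struct path: __post_init__ (see gpu_struct.py below) keeps a ctypes
# mirror of the field values alive on the instance as ._data, and the pointer
# is taken from that object instead.
@gpu_struct
class Pixel:  # assumed to match the struct used by the updated test
    r: np.int32
    g: np.int32
    b: np.int32


pixel = Pixel(0, 0, 0)
ptr_from_struct = ctypes.cast(ctypes.pointer(pixel._data), ctypes.c_void_p)
```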
86 changes: 0 additions & 86 deletions python/cuda_parallel/cuda/parallel/experimental/_structwrapper.py

This file was deleted.

@@ -16,19 +16,14 @@
 from .. import _cccl as cccl
 from .._bindings import get_bindings, get_paths
 from .._caching import CachableFunction, cache_with_key
-from .._structwrapper import wrap_struct
 from .._utils import cai
 from ..iterators._iterators import IteratorBase
-from ..typing import DeviceArrayLike
+from ..typing import DeviceArrayLike, GpuStruct


 class _Op:
-    def __init__(self, dtype: np.dtype, op: Callable):
-        # if h_init is a struct, wrap it in a Record type:
-        if dtype.names is not None:
-            value_type = wrap_struct(dtype)
-        else:
-            value_type = numba.from_dtype(dtype)
+    def __init__(self, h_init: np.ndarray | GpuStruct, op: Callable):
+        value_type = numba.typeof(h_init)
         self.ltoir, _ = cuda.compile(op, sig=(value_type, value_type), output="ltoir")
         self.name = op.__name__.encode("utf-8")

@@ -56,7 +51,7 @@ def __init__(
         d_in: DeviceArrayLike | IteratorBase,
         d_out: DeviceArrayLike,
         op: Callable,
-        h_init: np.ndarray,
+        h_init: np.ndarray | GpuStruct,
     ):
         d_in_cccl = cccl.to_cccl_iter(d_in)
         self._ctor_d_in_cccl_type_enum_name = cccl.type_enum_as_name(
@@ -67,11 +62,10 @@ def __init__(
         cc_major, cc_minor = cuda.get_current_device().compute_capability
         cub_path, thrust_path, libcudacxx_path, cuda_include_path = get_paths()
         bindings = get_bindings()
-        self.op_wrapper = _Op(h_init.dtype, op)
+        self.op_wrapper = _Op(h_init, op)
         d_out_cccl = cccl.to_cccl_iter(d_out)
         self.build_result = cccl.DeviceReduceBuildResult()

-        # TODO Figure out caching
         error = bindings.cccl_device_reduce_build(
             ctypes.byref(self.build_result),
             d_in_cccl,
@@ -88,7 +82,9 @@ def __init__(
         if error != enums.CUDA_SUCCESS:
             raise ValueError("Error building reduce")

-    def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray):
+    def __call__(
+        self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray | GpuStruct
+    ):
         d_in_cccl = cccl.to_cccl_iter(d_in)
         if d_in_cccl.type.value == cccl.IteratorKind.ITERATOR:
             assert num_items is not None
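The `_Op` change above now simply asks Numba for the type of `h_init`: once the gpu_struct decorator (added below) has registered a `typeof_impl` for the class, struct-typed operators compile the same way as scalar ones. A minimal sketch under that assumption, using a hypothetical `Pair` struct:

```python
import numba
import numpy as np
from numba import cuda

from cuda.parallel.experimental.gpu_struct import gpu_struct


@gpu_struct
class Pair:  # hypothetical example type, not part of this commit
    first: np.int32
    second: np.int32


def max_first(x, y):
    # Same shape of operator as the reduction ops used in the tests.
    return x if x.first > y.first else y


h_init = Pair(0, 0)
value_type = numba.typeof(h_init)  # resolved via the decorator's typeof_impl
# The same call _Op makes: compile the user op to LTO-IR with struct-typed args.
ltoir, _ = cuda.compile(max_first, sig=(value_type, value_type), output="ltoir")
```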
99 changes: 99 additions & 0 deletions python/cuda_parallel/cuda/parallel/experimental/gpu_struct.py
@@ -0,0 +1,99 @@
from dataclasses import dataclass
from dataclasses import fields as dataclass_fields

import numba
import numpy as np
from numba.core import cgutils
from numba.core.extending import (
make_attribute_wrapper,
models,
register_model,
typeof_impl,
)
from numba.core.typing import signature as nb_signature
from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate
from numba.cuda.cudadecl import registry as cuda_registry
from numba.extending import lower_builtin

from .typing import GpuStruct


def gpu_struct(this: type) -> GpuStruct:
anns = getattr(this, "__annotations__", {})

# set the .dtype attribute on the class for numpy compatibility:
setattr(this, "dtype", np.dtype(list(anns.items())))

# define __post_init__ to create a ctypes object from the fields,
# and keep a reference to it in the `._data` attribute.
def __post_init__(self):
ctypes_typ = np.ctypeslib.as_ctypes_type(this.dtype)
self._data = ctypes_typ(*(getattr(self, name) for name in this.dtype.names))

setattr(this, "__post_init__", __post_init__)

# create a dataclass:
this = dataclass(this)
fields = dataclass_fields(this)

# define a numba type corresponding to the dataclass:
class ThisType(numba.types.Type):
def __init__(self):
super().__init__(name=this.__name__)

this_type = ThisType()

@typeof_impl.register(this)
def typeof_this(val, c):
return ThisType()

# Data model corresponding to ThisType:
@register_model(ThisType)
class ThisModel(models.StructModel):
def __init__(self, dmm, fe_type):
members = [(field.name, numba.from_dtype(field.type)) for field in fields]
super().__init__(dmm, fe_type, members)

# Typing for accessing attributes (fields) of the dataclass:
class ThisAttrsTemplate(AttributeTemplate):
pass

for field in fields:
typ = field.type
name = field.name

        # Bind `typ` as a default argument so each resolver keeps its own field
        # type (a late-binding closure would use the last field's type).
        def resolver(self, this, typ=typ):
            return numba.from_dtype(typ)

setattr(ThisAttrsTemplate, f"resolve_{name}", resolver)

@cuda_registry.register_attr
class ThisAttrs(ThisAttrsTemplate):
key = this_type

# Lowering for attribute access:
for field in fields:
make_attribute_wrapper(ThisType, field.name, field.name)

# Register typing for constructor.
@cuda_registry.register
class TypeConstructor(ConcreteTemplate):
key = this
cases = [
nb_signature(this_type, *[numba.from_dtype(field.type) for field in fields])
]

cuda_registry.register_global(this, numba.types.Function(TypeConstructor))

def type_constructor(context, builder, sig, args):
ty = sig.return_type
retval = cgutils.create_struct_proxy(ty)(context, builder)
for field, val in zip(fields, args):
setattr(retval, field.name, val)
return retval._getvalue()

lower_builtin(this, *[numba.from_dtype(field.type) for field in fields])(
type_constructor
)

return this
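A minimal host-side sketch of what the decorator produces (the `Point` name and fields are illustrative assumptions, not part of this commit):

```python
import numpy as np

from cuda.parallel.experimental.gpu_struct import gpu_struct


@gpu_struct
class Point:  # hypothetical example type
    x: np.int32
    y: np.int32


# The class gains a NumPy dtype describing its memory layout...
assert Point.dtype == np.dtype([("x", np.int32), ("y", np.int32)])

# ...and instances are plain dataclasses whose __post_init__ mirrors the field
# values into a ctypes struct held in ._data (consumed by host_array_to_value).
p = Point(1, 2)
assert (p.x, p.y) == (1, 2)
assert (p._data.x, p._data.y) == (1, 2)

# In numba.cuda device code, Point(x, y) construction and attribute access are
# made available by the typing and lowering registrations performed above.
```

The updated test below exercises the same pattern end to end with a three-field Pixel struct.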
6 changes: 6 additions & 0 deletions python/cuda_parallel/cuda/parallel/experimental/typing.py
@@ -1,3 +1,5 @@
+from typing import Any
+
 from typing_extensions import (
     Protocol,
 )  # TODO: typing_extensions required for Python 3.7 docs env
@@ -10,3 +12,7 @@ class DeviceArrayLike(Protocol):
     """

     __cuda_array_interface__: dict
+
+
+# return type of @gpu_struct
+GpuStruct = Any
17 changes: 11 additions & 6 deletions python/cuda_parallel/tests/test_reduce.py
@@ -12,6 +12,7 @@

 import cuda.parallel.experimental.algorithms as algorithms
 import cuda.parallel.experimental.iterators as iterators
+from cuda.parallel.experimental.gpu_struct import gpu_struct


 def random_int(shape, dtype):
@@ -553,15 +554,19 @@ def binary_op(x, y):


 def test_reduce_struct_type():
-    def max_g_value(x, y):
-        return x if x["g"] > y["g"] else y
+    @gpu_struct
+    class Pixel:
+        r: np.int32
+        g: np.int32
+        b: np.int32

-    dtype = np.dtype([("r", "int32"), ("g", "int32"), ("b", "int32")])
-    d_rgb = cp.random.randint(0, 256, (10, 3), dtype=cp.int32).view(dtype)
+    def max_g_value(x, y):
+        return x if x.g > y.g else y

-    d_out = cp.zeros(1, dtype)
+    d_rgb = cp.random.randint(0, 256, (10, 3), dtype=np.int32).view(Pixel.dtype)
+    d_out = cp.zeros(1, Pixel.dtype)

-    h_init = np.asarray([(0, 0, 0)], dtype=dtype)
+    h_init = Pixel(0, 0, 0)

     reduce_into = algorithms.reduce_into(d_rgb, d_out, max_g_value, h_init)
     temp_storage_bytes = reduce_into(None, d_rgb, d_out, len(d_rgb), h_init)
