diff --git a/python/cuda_parallel/cuda/parallel/experimental/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/__init__.py
index 36813a737a9..b49d79c43b8 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/__init__.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/__init__.py
@@ -101,13 +101,15 @@ def _type_to_info(numpy_type):
 def _device_array_to_pointer(array):
     dtype = array.dtype
     info = _type_to_info(dtype)
-    return _CCCLIterator(1, 1, _CCCLIteratorKindEnum.POINTER, _CCCLOp(), _CCCLOp(), info, array.device_ctypes_pointer.value)
+    # Note: this is slightly slower, but supports all ndarray-like objects as long as they support CAI
+    # TODO: switch to use gpumemoryview once it's ready
+    return _CCCLIterator(1, 1, _CCCLIteratorKindEnum.POINTER, _CCCLOp(), _CCCLOp(), info, array.__cuda_array_interface__["data"][0])
 
 
 def _host_array_to_value(array):
     dtype = array.dtype
     info = _type_to_info(dtype)
-    return _CCCLValue(info, array.ctypes.data_as(ctypes.c_void_p))
+    return _CCCLValue(info, array.ctypes.data)
 
 
 class _Op:
@@ -230,7 +232,9 @@ def __call__(self, temp_storage, d_in, d_out, init):
             d_temp_storage = None
         else:
             temp_storage_bytes = ctypes.c_size_t(temp_storage.nbytes)
-            d_temp_storage = temp_storage.device_ctypes_pointer.value
+            # Note: this is slightly slower, but supports all ndarray-like objects as long as they support CAI
+            # TODO: switch to use gpumemoryview once it's ready
+            d_temp_storage = temp_storage.__cuda_array_interface__["data"][0]
         d_in_ptr = _device_array_to_pointer(d_in)
         d_out_ptr = _device_array_to_pointer(d_out)
         num_items = ctypes.c_ulonglong(d_in.size)
diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index 3a25f7d89d1..a8bc82340a6 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -112,6 +112,7 @@ def build_extension(self, ext):
     extras_require={
         "test": [
             "pytest",
+            "cupy-cuda12x",
         ]
     },
     cmdclass={
diff --git a/python/cuda_parallel/tests/test_reduce_api.py b/python/cuda_parallel/tests/test_reduce_api.py
index 8c63364559c..9eccee8622c 100644
--- a/python/cuda_parallel/tests/test_reduce_api.py
+++ b/python/cuda_parallel/tests/test_reduce_api.py
@@ -2,25 +2,24 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import numpy
-import pytest
-from numba import cuda
-
 # example-begin imports
+import cupy as cp
+import numpy as np
 import cuda.parallel.experimental as cudax
 # example-end imports
 
+import pytest
+
 
 def test_device_reduce():
     # example-begin reduce-min
     def min_op(a, b):
         return a if a < b else b
 
-    dtype = numpy.int32
-    h_init = numpy.array([42], dtype)
-    h_input = numpy.array([8, 6, 7, 5, 3, 0, 9], dtype)
-    d_output = cuda.device_array(1, dtype)
-    d_input = cuda.to_device(h_input)
+    dtype = np.int32
+    h_init = np.array([42], dtype=dtype)
+    d_input = cp.array([8, 6, 7, 5, 3, 0, 9], dtype=dtype)
+    d_output = cp.empty(1, dtype=dtype)
 
     # Instantiate reduction for the given operator and initial value
     reduce_into = cudax.reduce_into(d_output, d_output, min_op, h_init)
@@ -29,12 +28,12 @@ def min_op(a, b):
     temp_storage_size = reduce_into(None, d_input, d_output, h_init)
 
     # Allocate temporary storage
-    d_temp_storage = cuda.device_array(temp_storage_size, dtype=numpy.uint8)
+    d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)
 
     # Run reduction
     reduce_into(d_temp_storage, d_input, d_output, h_init)
 
+    # Check the result is correct
     expected_output = 0
+    assert (d_output == expected_output).all()
     # example-end reduce-min
-    h_output = d_output.copy_to_host()
-    assert h_output[0] == expected_output