From 8dd015a4e68aeee4fddaee5e52bb692cfa1b219a Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 30 Aug 2024 07:02:50 +0000
Subject: [PATCH 1/4] ensure cupy arrays can be used with cuda.parallel too

---
 .../cuda/parallel/experimental/__init__.py    |  9 ++++++---
 python/cuda_parallel/tests/test_reduce_api.py | 19 +++++++++----------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/__init__.py
index 4a16fc1b67a..4b4877cdf09 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/__init__.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/__init__.py
@@ -101,13 +101,15 @@ def _type_to_info(numpy_type):
 def _device_array_to_pointer(array):
     dtype = array.dtype
     info = _type_to_info(dtype)
-    return _CCCLIterator(1, 1, _CCCLIteratorKindEnum.POINTER, _CCCLOp(), _CCCLOp(), info, array.device_ctypes_pointer.value)
+    # Note: this is slightly slower, but supports all ndarray-like objects as long as they support CAI
+    # TODO: switch to use gpumemoryview once it's ready
+    return _CCCLIterator(1, 1, _CCCLIteratorKindEnum.POINTER, _CCCLOp(), _CCCLOp(), info, array.__cuda_array_interface__["data"][0])
 
 
 def _host_array_to_value(array):
     dtype = array.dtype
     info = _type_to_info(dtype)
-    return _CCCLValue(info, array.ctypes.data_as(ctypes.c_void_p))
+    return _CCCLValue(info, array.ctypes.data)
 
 
 class _Op:
@@ -219,7 +221,8 @@ def __call__(self, temp_storage, d_in, d_out, init):
             d_temp_storage = None
         else:
             temp_storage_bytes = ctypes.c_size_t(temp_storage.nbytes)
-            d_temp_storage = temp_storage.device_ctypes_pointer.value
+            # see comment in _device_array_to_pointer
+            d_temp_storage = temp_storage.__cuda_array_interface__["data"][0]
         d_in_ptr = _device_array_to_pointer(d_in)
         d_out_ptr = _device_array_to_pointer(d_out)
         num_items = ctypes.c_ulonglong(d_in.size)
diff --git a/python/cuda_parallel/tests/test_reduce_api.py b/python/cuda_parallel/tests/test_reduce_api.py
index 6ed35831218..a086fab6115 100644
--- a/python/cuda_parallel/tests/test_reduce_api.py
+++ b/python/cuda_parallel/tests/test_reduce_api.py
@@ -2,9 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import numpy
+import cupy as cp
+import numpy as np
 import pytest
-from numba import cuda
 
 # example-begin imports
 import cuda.parallel.experimental as cudax
@@ -16,11 +16,10 @@ def test_device_reduce():
     def op(a, b):
         return a if a < b else b
 
-    dtype = numpy.int32
-    h_init = numpy.array([42], dtype)
-    h_input = numpy.array([8, 6, 7, 5, 3, 0, 9])
-    d_output = cuda.device_array(1, dtype)
-    d_input = cuda.to_device(h_input)
+    dtype = np.int32
+    h_init = np.array([42], dtype=dtype)
+    d_input = cp.array([8, 6, 7, 5, 3, 0, 9], dtype=dtype)
+    d_output = cp.empty(1, dtype=dtype)
 
     # Instantiate reduction for the given operator and initial value
     reduce_into = cudax.reduce_into(d_output, d_output, op, h_init)
@@ -29,12 +28,12 @@ def op(a, b):
     temp_storage_size = reduce_into(None, d_input, d_output, h_init)
 
     # Allocate temporary storage
-    d_temp_storage = cuda.device_array(temp_storage_size, dtype=numpy.uint8)
+    d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)
 
     # Run reduction
     reduce_into(d_temp_storage, d_input, d_output, h_init)
 
+    # Check the result is correct
     expected_output = 0
+    assert (d_output == expected_output).all()
     # example-end reduce-min
-    h_output = d_output.copy_to_host()
-    assert h_output[0] == expected_output
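[Context for the patch above, not part of the series itself: any object implementing the CUDA Array Interface (CAI), including CuPy ndarrays, Numba device arrays, and PyTorch tensors, exposes its device pointer through the __cuda_array_interface__ dict, which is why reading the pointer from there instead of the Numba-specific device_ctypes_pointer makes the code library-agnostic. A minimal sketch of the protocol, assuming CuPy and Numba are installed; device_pointer is an illustrative helper, not part of the patched module:

    import cupy as cp
    from numba import cuda

    def device_pointer(arr):
        # Per the CAI spec, "data" is a (pointer, read_only) pair; the
        # pointer is a plain Python int holding the device address.
        ptr, read_only = arr.__cuda_array_interface__["data"]
        return ptr

    print(device_pointer(cp.arange(4)))          # CuPy ndarray
    print(device_pointer(cuda.device_array(4)))  # Numba DeviceNDArray
]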
From f9e904b35dbb37c1c6d35e4921640e241e7ce98c Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 30 Aug 2024 15:11:37 +0800
Subject: [PATCH 2/4] copy the same comment to all places wherever applicable

Co-authored-by: Michael Schellenberger Costa
---
 python/cuda_parallel/cuda/parallel/experimental/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/__init__.py
index 4b4877cdf09..5040d669888 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/__init__.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/__init__.py
@@ -221,7 +221,8 @@ def __call__(self, temp_storage, d_in, d_out, init):
             d_temp_storage = None
         else:
             temp_storage_bytes = ctypes.c_size_t(temp_storage.nbytes)
-            # see comment in _device_array_to_pointer
+            # Note: this is slightly slower, but supports all ndarray-like objects as long as they support CAI
+            # TODO: switch to use gpumemoryview once it's ready
             d_temp_storage = temp_storage.__cuda_array_interface__["data"][0]
         d_in_ptr = _device_array_to_pointer(d_in)
         d_out_ptr = _device_array_to_pointer(d_out)

From 93ea29872b59f4081d4c99bdc5d3fb290db77a07 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Sun, 1 Sep 2024 22:28:38 +0800
Subject: [PATCH 3/4] ensure all needed imports are shown in the example

---
 python/cuda_parallel/tests/test_reduce_api.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cuda_parallel/tests/test_reduce_api.py b/python/cuda_parallel/tests/test_reduce_api.py
index a086fab6115..e07dc2eb7e5 100644
--- a/python/cuda_parallel/tests/test_reduce_api.py
+++ b/python/cuda_parallel/tests/test_reduce_api.py
@@ -2,14 +2,14 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+# example-begin imports
 import cupy as cp
 import numpy as np
-import pytest
-
-# example-begin imports
 import cuda.parallel.experimental as cudax
 # example-end imports
+import pytest
+
 
 def test_device_reduce():
     # example-begin reduce-min

From a7a46d9608f4c7bc3e7d9503849ce54d2b8f3459 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Sun, 1 Sep 2024 22:30:29 +0800
Subject: [PATCH 4/4] add CuPy (+CUDA 12.x) as a test dependency

---
 python/cuda_parallel/setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index c29a5237fc0..c71e8c456f9 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -114,6 +114,7 @@ def build_extension(self, ext):
     extras_require={
         "test": [
             "pytest",
+            "cupy-cuda12x",
         ]
     },
     cmdclass={
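[A side note on the first patch's _host_array_to_value change: for a host NumPy array, array.ctypes.data already yields the raw buffer address as a plain Python int, so the explicit ctypes.c_void_p wrapper was redundant. A quick standalone check, assuming only NumPy:

    import ctypes
    import numpy as np

    a = np.array([42], dtype=np.int32)
    # .ctypes.data is the address as an int; wrapping it in c_void_p
    # and reading back .value gives the same number.
    assert a.ctypes.data == a.ctypes.data_as(ctypes.c_void_p).value
]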