diff --git a/python/cuda_parallel/cuda/parallel/experimental/_cccl.py b/python/cuda_parallel/cuda/parallel/experimental/_cccl.py index 58c5f14a398..e09191dac2c 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/_cccl.py +++ b/python/cuda_parallel/cuda/parallel/experimental/_cccl.py @@ -10,6 +10,7 @@ import numpy as np from numba import cuda, types +from ._utils.cai import DeviceArrayLike, get_dtype, is_contiguous from .iterators._iterators import IteratorBase @@ -131,8 +132,10 @@ def _numpy_type_to_info(numpy_type: np.dtype) -> TypeInfo: return _numba_type_to_info(numba_type) -def _device_array_to_cccl_iter(array) -> Iterator: - info = _numpy_type_to_info(array.dtype) +def _device_array_to_cccl_iter(array: DeviceArrayLike) -> Iterator: + if not is_contiguous(array): + raise ValueError("Non-contiguous arrays are not supported.") + info = _numpy_type_to_info(get_dtype(array)) return Iterator( info.size, info.alignment, diff --git a/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py b/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py index 9c0718e71f0..4d435171aad 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py +++ b/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py @@ -7,6 +7,8 @@ Utilities for extracting information from `__cuda_array_interface__`. """ +from typing import Optional, Tuple + import numpy as np from ..typing import DeviceArrayLike @@ -14,3 +16,48 @@ def get_dtype(arr: DeviceArrayLike) -> np.dtype: return np.dtype(arr.__cuda_array_interface__["typestr"]) + + +def get_strides(arr: DeviceArrayLike) -> Optional[Tuple]: + return arr.__cuda_array_interface__["strides"] + + +def get_shape(arr: DeviceArrayLike) -> Tuple: + return arr.__cuda_array_interface__["shape"] + + +def is_contiguous(arr: DeviceArrayLike) -> bool: + shape, strides = get_shape(arr), get_strides(arr) + + if strides is None: + return True + + if any(dim == 0 for dim in shape): + # array has no elements + return True + + if all(dim == 1 for dim in shape): + # there is a single element: + return True + + itemsize = get_dtype(arr).itemsize + + if strides[-1] == itemsize: + # assume C-contiguity + expected_stride = itemsize + for dim, stride in zip(reversed(shape), reversed(strides)): + if stride != expected_stride: + return False + expected_stride *= dim + return True + elif strides[0] == itemsize: + # assume F-contiguity + expected_stride = itemsize + for dim, stride in zip(shape, strides): + if stride != expected_stride: + return False + expected_stride *= dim + return True + else: + # not contiguous + return False diff --git a/python/cuda_parallel/tests/test_reduce.py b/python/cuda_parallel/tests/test_reduce.py index cce9412a7b8..9549ef7bee3 100644 --- a/python/cuda_parallel/tests/test_reduce.py +++ b/python/cuda_parallel/tests/test_reduce.py @@ -502,3 +502,51 @@ def op3(x): np.zeros([0], dtype="int64"), ) assert reducer_1 is not reducer_2 + + +@pytest.fixture(params=[True, False]) +def array_2d(request): + f_contiguous = request.param + arr = cp.random.rand(5, 10) + if f_contiguous: + try: + return cp.asfortranarray(arr) + except ImportError: # cublas unavailable + return arr + else: + return arr + + +def test_reduce_2d_array(array_2d): + def binary_op(x, y): + return x + y + + d_out = cp.empty(1, dtype=array_2d.dtype) + h_init = np.asarray([0], dtype=array_2d.dtype) + d_in = array_2d + reduce_into = algorithms.reduce_into( + d_in=d_in, d_out=d_out, op=binary_op, h_init=h_init + ) + temp_storage_size = reduce_into( + None, d_in=d_in, d_out=d_out, num_items=d_in.size, h_init=h_init + ) + d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8) + reduce_into(d_temp_storage, d_in, d_out, d_in.size, h_init) + np.testing.assert_allclose(d_in.sum().get(), d_out.get()) + + +def test_reduce_non_contiguous(): + def binary_op(x, y): + return x + y + + size = 10 + d_out = cp.empty(1, dtype="int64") + h_init = np.asarray([0], dtype="int64") + + d_in = cp.zeros((size, 2))[:, 0] + with pytest.raises(ValueError, match="Non-contiguous arrays are not supported."): + _ = algorithms.reduce_into(d_in, d_out, binary_op, h_init) + + d_in = cp.zeros(size)[::2] + with pytest.raises(ValueError, match="Non-contiguous arrays are not supported."): + _ = algorithms.reduce_into(d_in, d_out, binary_op, h_init)