From 7b8563f3e76493b6a8a7d99cbd694107a3023be3 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Sat, 4 Jan 2025 10:36:13 -0500
Subject: [PATCH] cuda.parallel: Forbid non-contiguous arrays as inputs (or
 outputs) (#3233)

* Forbid non-contiguous arrays as inputs (or outputs)

* Implement a more robust way to check for contiguity (see the sketch
  after this list)

* Fall back to a C-ordered test array when cuBLAS is unavailable
  (cupy.asfortranarray requires it)

* Fix how we check for zero-element arrays

* Sort imports
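
For reference, the new is_contiguous check validates strides against the
shape: an array is C-contiguous when the last axis has stride equal to
the itemsize and every earlier stride is the running product of the
extents to its right (symmetrically, left-to-right, for F order). A
minimal host-side sketch of that stride walk (the helper name is
illustrative only, not part of this patch):

    import numpy as np

    def expected_c_strides(shape, itemsize):
        # Illustration only: walk axes right-to-left; each stride is
        # itemsize times the product of all extents to its right.
        out, acc = [], itemsize
        for dim in reversed(shape):
            out.append(acc)
            acc *= dim
        return tuple(reversed(out))

    a = np.zeros((5, 10))
    assert a.strides == expected_c_strides(a.shape, a.itemsize)

    # A column slice skips elements in the buffer, so its strides no
    # longer match the contiguous pattern:
    col = np.zeros((10, 2))[:, 0]
    assert col.strides != expected_c_strides(col.shape, col.itemsize)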

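From the caller's side, construction now fails fast instead of silently
reducing over the wrong elements. A sketch of the new behavior (the
import path is assumed from this package's layout; CuPy required):

    import cupy as cp
    import numpy as np
    from cuda.parallel.experimental import algorithms

    def add(x, y):
        return x + y

    d_in = cp.zeros(10)[::2]  # every-other-element view: non-contiguous
    d_out = cp.empty(1, dtype=d_in.dtype)
    h_init = np.zeros(1, dtype=d_in.dtype)

    # With this patch this raises:
    # ValueError: Non-contiguous arrays are not supported.
    algorithms.reduce_into(d_in, d_out, add, h_init)
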
---------

Co-authored-by: Ashwin Srinath <shwina@users.noreply.github.com>
---
 .../cuda/parallel/experimental/_cccl.py       |  7 ++-
 .../cuda/parallel/experimental/_utils/cai.py  | 47 ++++++++++++++++++
 python/cuda_parallel/tests/test_reduce.py     | 48 +++++++++++++++++++
 3 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/_cccl.py b/python/cuda_parallel/cuda/parallel/experimental/_cccl.py
index 58c5f14a398..e09191dac2c 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/_cccl.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/_cccl.py
@@ -10,6 +10,7 @@
 import numpy as np
 from numba import cuda, types
 
+from ._utils.cai import DeviceArrayLike, get_dtype, is_contiguous
 from .iterators._iterators import IteratorBase
 
 
@@ -131,8 +132,10 @@ def _numpy_type_to_info(numpy_type: np.dtype) -> TypeInfo:
     return _numba_type_to_info(numba_type)
 
 
-def _device_array_to_cccl_iter(array) -> Iterator:
-    info = _numpy_type_to_info(array.dtype)
+def _device_array_to_cccl_iter(array: DeviceArrayLike) -> Iterator:
+    if not is_contiguous(array):
+        raise ValueError("Non-contiguous arrays are not supported.")
+    info = _numpy_type_to_info(get_dtype(array))
     return Iterator(
         info.size,
         info.alignment,
diff --git a/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py b/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py
index 9c0718e71f0..4d435171aad 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py
@@ -7,6 +7,8 @@
 Utilities for extracting information from `__cuda_array_interface__`.
 """
 
+from typing import Optional, Tuple
+
 import numpy as np
 
 from ..typing import DeviceArrayLike
@@ -14,3 +16,48 @@
 
 def get_dtype(arr: DeviceArrayLike) -> np.dtype:
     return np.dtype(arr.__cuda_array_interface__["typestr"])
+
+
+def get_strides(arr: DeviceArrayLike) -> Optional[Tuple[int, ...]]:
+    return arr.__cuda_array_interface__.get("strides")
+
+
+def get_shape(arr: DeviceArrayLike) -> Tuple[int, ...]:
+    return arr.__cuda_array_interface__["shape"]
+
+
+def is_contiguous(arr: DeviceArrayLike) -> bool:
+    shape, strides = get_shape(arr), get_strides(arr)
+
+    if strides is None:
+        return True
+
+    if any(dim == 0 for dim in shape):
+        # zero-element array: trivially contiguous
+        return True
+
+    if all(dim == 1 for dim in shape):
+        # exactly one element: strides are irrelevant
+        return True
+
+    itemsize = get_dtype(arr).itemsize
+
+    if strides[-1] == itemsize:
+        # C-contiguity: walk axes right-to-left (extent-1 axes are unconstrained)
+        expected_stride = itemsize
+        for dim, stride in zip(reversed(shape), reversed(strides)):
+            if dim != 1 and stride != expected_stride:
+                return False
+            expected_stride *= dim
+        return True
+    elif strides[0] == itemsize:
+        # F-contiguity: same walk, left-to-right
+        expected_stride = itemsize
+        for dim, stride in zip(shape, strides):
+            if dim != 1 and stride != expected_stride:
+                return False
+            expected_stride *= dim
+        return True
+    else:
+        # neither stride pattern matches: not contiguous
+        return False
diff --git a/python/cuda_parallel/tests/test_reduce.py b/python/cuda_parallel/tests/test_reduce.py
index cce9412a7b8..9549ef7bee3 100644
--- a/python/cuda_parallel/tests/test_reduce.py
+++ b/python/cuda_parallel/tests/test_reduce.py
@@ -502,3 +502,51 @@ def op3(x):
         np.zeros([0], dtype="int64"),
     )
     assert reducer_1 is not reducer_2
+
+
+@pytest.fixture(params=[True, False])
+def array_2d(request):
+    f_contiguous = request.param
+    arr = cp.random.rand(5, 10)
+    if f_contiguous:
+        try:
+            return cp.asfortranarray(arr)
+        except ImportError:  # cuBLAS unavailable; fall back to C order
+            return arr
+    else:
+        return arr
+
+
+def test_reduce_2d_array(array_2d):
+    def binary_op(x, y):
+        return x + y
+
+    d_out = cp.empty(1, dtype=array_2d.dtype)
+    h_init = np.asarray([0], dtype=array_2d.dtype)
+    d_in = array_2d
+    reduce_into = algorithms.reduce_into(
+        d_in=d_in, d_out=d_out, op=binary_op, h_init=h_init
+    )
+    temp_storage_size = reduce_into(
+        None, d_in=d_in, d_out=d_out, num_items=d_in.size, h_init=h_init
+    )
+    d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)
+    reduce_into(d_temp_storage, d_in, d_out, d_in.size, h_init)
+    np.testing.assert_allclose(d_out.get(), d_in.sum().get())
+
+
+def test_reduce_non_contiguous():
+    def binary_op(x, y):
+        return x + y
+
+    size = 10
+    d_out = cp.empty(1, dtype="int64")
+    h_init = np.asarray([0], dtype="int64")
+
+    d_in = cp.zeros((size, 2))[:, 0]
+    with pytest.raises(ValueError, match="Non-contiguous arrays are not supported."):
+        _ = algorithms.reduce_into(d_in, d_out, binary_op, h_init)
+
+    d_in = cp.zeros(size)[::2]
+    with pytest.raises(ValueError, match="Non-contiguous arrays are not supported."):
+        _ = algorithms.reduce_into(d_in, d_out, binary_op, h_init)