From 8dd015a4e68aeee4fddaee5e52bb692cfa1b219a Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 30 Aug 2024 07:02:50 +0000
Subject: [PATCH 1/4] ensure cupy arrays can be used with cuda.parallel too

---
 .../cuda/parallel/experimental/__init__.py    |  9 ++++++---
 python/cuda_parallel/tests/test_reduce_api.py | 19 +++++++++----------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/__init__.py
index 4a16fc1b67a..4b4877cdf09 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/__init__.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/__init__.py
@@ -101,13 +101,15 @@ def _type_to_info(numpy_type):
 def _device_array_to_pointer(array):
     dtype = array.dtype
     info = _type_to_info(dtype)
-    return _CCCLIterator(1, 1, _CCCLIteratorKindEnum.POINTER, _CCCLOp(), _CCCLOp(), info, array.device_ctypes_pointer.value)
+    # Note: this is slightly slower, but supports all ndarray-like objects as long as they support CAI
+    # TODO: switch to use gpumemoryview once it's ready
+    return _CCCLIterator(1, 1, _CCCLIteratorKindEnum.POINTER, _CCCLOp(), _CCCLOp(), info, array.__cuda_array_interface__["data"][0])
 
 
 def _host_array_to_value(array):
     dtype = array.dtype
     info = _type_to_info(dtype)
-    return _CCCLValue(info, array.ctypes.data_as(ctypes.c_void_p))
+    return _CCCLValue(info, array.ctypes.data)
 
 
 class _Op:
@@ -219,7 +221,8 @@ def __call__(self, temp_storage, d_in, d_out, init):
             d_temp_storage = None
         else:
             temp_storage_bytes = ctypes.c_size_t(temp_storage.nbytes)
-            d_temp_storage = temp_storage.device_ctypes_pointer.value
+            # see comment in _device_array_to_pointer
+            d_temp_storage = temp_storage.__cuda_array_interface__["data"][0]
         d_in_ptr = _device_array_to_pointer(d_in)
         d_out_ptr = _device_array_to_pointer(d_out)
         num_items = ctypes.c_ulonglong(d_in.size)
diff --git a/python/cuda_parallel/tests/test_reduce_api.py b/python/cuda_parallel/tests/test_reduce_api.py
index 6ed35831218..a086fab6115 100644
--- a/python/cuda_parallel/tests/test_reduce_api.py
+++ b/python/cuda_parallel/tests/test_reduce_api.py
@@ -2,9 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import numpy
+import cupy as cp
+import numpy as np
 import pytest
-from numba import cuda
 
 # example-begin imports
 import cuda.parallel.experimental as cudax
@@ -16,11 +16,10 @@ def test_device_reduce():
     def op(a, b):
         return a if a < b else b
 
-    dtype = numpy.int32
-    h_init = numpy.array([42], dtype)
-    h_input = numpy.array([8, 6, 7, 5, 3, 0, 9])
-    d_output = cuda.device_array(1, dtype)
-    d_input = cuda.to_device(h_input)
+    dtype = np.int32
+    h_init = np.array([42], dtype=dtype)
+    d_input = cp.array([8, 6, 7, 5, 3, 0, 9], dtype=dtype)
+    d_output = cp.empty(1, dtype=dtype)
 
     # Instantiate reduction for the given operator and initial value
     reduce_into = cudax.reduce_into(d_output, d_output, op, h_init)
@@ -29,12 +28,12 @@ def op(a, b):
     temp_storage_size = reduce_into(None, d_input, d_output, h_init)
 
     # Allocate temporary storage
-    d_temp_storage = cuda.device_array(temp_storage_size, dtype=numpy.uint8)
+    d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)
 
     # Run reduction
     reduce_into(d_temp_storage, d_input, d_output, h_init)
 
+    # Check the result is correct
     expected_output = 0
+    assert (d_output == expected_output).all()
     # example-end reduce-min
-    h_output = d_output.copy_to_host()
-    assert h_output[0] == expected_output
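[Context for the patch above, not part of the series itself: any object implementing the CUDA Array Interface (CAI), including CuPy ndarrays, Numba device arrays, and PyTorch tensors, exposes its device pointer through the __cuda_array_interface__ dict, which is why reading the pointer from there instead of the Numba-specific device_ctypes_pointer makes the code library-agnostic. A minimal sketch of the protocol, assuming CuPy and Numba are installed; device_pointer is an illustrative helper, not part of the patched module:

    import cupy as cp
    from numba import cuda

    def device_pointer(arr):
        # Per the CAI spec, "data" is a (pointer, read_only) pair; the
        # pointer is a plain Python int holding the device address.
        ptr, read_only = arr.__cuda_array_interface__["data"]
        return ptr

    print(device_pointer(cp.arange(4)))          # CuPy ndarray
    print(device_pointer(cuda.device_array(4)))  # Numba DeviceNDArray
]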
From f9e904b35dbb37c1c6d35e4921640e241e7ce98c Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 30 Aug 2024 15:11:37 +0800
Subject: [PATCH 2/4] copy the same comment to all places wherever applicable

Co-authored-by: Michael Schellenberger Costa
---
 python/cuda_parallel/cuda/parallel/experimental/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/__init__.py
index 4b4877cdf09..5040d669888 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/__init__.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/__init__.py
@@ -221,7 +221,8 @@ def __call__(self, temp_storage, d_in, d_out, init):
             d_temp_storage = None
         else:
             temp_storage_bytes = ctypes.c_size_t(temp_storage.nbytes)
-            # see comment in _device_array_to_pointer
+            # Note: this is slightly slower, but supports all ndarray-like objects as long as they support CAI
+            # TODO: switch to use gpumemoryview once it's ready
             d_temp_storage = temp_storage.__cuda_array_interface__["data"][0]
         d_in_ptr = _device_array_to_pointer(d_in)
         d_out_ptr = _device_array_to_pointer(d_out)

From 93ea29872b59f4081d4c99bdc5d3fb290db77a07 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Sun, 1 Sep 2024 22:28:38 +0800
Subject: [PATCH 3/4] ensure all needed imports are shown in the example

---
 python/cuda_parallel/tests/test_reduce_api.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cuda_parallel/tests/test_reduce_api.py b/python/cuda_parallel/tests/test_reduce_api.py
index a086fab6115..e07dc2eb7e5 100644
--- a/python/cuda_parallel/tests/test_reduce_api.py
+++ b/python/cuda_parallel/tests/test_reduce_api.py
@@ -2,14 +2,14 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+# example-begin imports
 import cupy as cp
 import numpy as np
-import pytest
-
-# example-begin imports
 import cuda.parallel.experimental as cudax
 # example-end imports
+import pytest
+
 
 def test_device_reduce():
     # example-begin reduce-min

From a7a46d9608f4c7bc3e7d9503849ce54d2b8f3459 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Sun, 1 Sep 2024 22:30:29 +0800
Subject: [PATCH 4/4] add CuPy (+CUDA 12.x) as a test dependency

---
 python/cuda_parallel/setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index c29a5237fc0..c71e8c456f9 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -114,6 +114,7 @@ def build_extension(self, ext):
     extras_require={
         "test": [
             "pytest",
+            "cupy-cuda12x",
         ]
     },
     cmdclass={
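[A side note on the first patch's _host_array_to_value change: for a host NumPy array, array.ctypes.data already yields the raw buffer address as a plain Python int, so the explicit ctypes.c_void_p wrapper was redundant. A quick standalone check, assuming only NumPy:

    import ctypes
    import numpy as np

    a = np.array([42], dtype=np.int32)
    # .ctypes.data is the address as an int; wrapping it in c_void_p
    # and reading back .value gives the same number.
    assert a.ctypes.data == a.ctypes.data_as(ctypes.c_void_p).value
]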