diff --git a/python/cuda_parallel/cuda/parallel/experimental/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/__init__.py
index 36813a737a9..b49d79c43b8 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/__init__.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/__init__.py
@@ -101,13 +101,15 @@ def _type_to_info(numpy_type):
 def _device_array_to_pointer(array):
     dtype = array.dtype
     info = _type_to_info(dtype)
-    return _CCCLIterator(1, 1, _CCCLIteratorKindEnum.POINTER, _CCCLOp(), _CCCLOp(), info, array.device_ctypes_pointer.value)
+    # Note: this is slightly slower, but supports all ndarray-like objects as long as they support CAI
+    # TODO: switch to use gpumemoryview once it's ready
+    return _CCCLIterator(1, 1, _CCCLIteratorKindEnum.POINTER, _CCCLOp(), _CCCLOp(), info, array.__cuda_array_interface__["data"][0])
 
 
 def _host_array_to_value(array):
     dtype = array.dtype
     info = _type_to_info(dtype)
-    return _CCCLValue(info, array.ctypes.data_as(ctypes.c_void_p))
+    return _CCCLValue(info, array.ctypes.data)
 
 
 class _Op:
@@ -230,7 +232,9 @@ def __call__(self, temp_storage, d_in, d_out, init):
             d_temp_storage = None
         else:
             temp_storage_bytes = ctypes.c_size_t(temp_storage.nbytes)
-            d_temp_storage = temp_storage.device_ctypes_pointer.value
+            # Note: this is slightly slower, but supports all ndarray-like objects as long as they support CAI
+            # TODO: switch to use gpumemoryview once it's ready
+            d_temp_storage = temp_storage.__cuda_array_interface__["data"][0]
         d_in_ptr = _device_array_to_pointer(d_in)
         d_out_ptr = _device_array_to_pointer(d_out)
         num_items = ctypes.c_ulonglong(d_in.size)
diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index 3a25f7d89d1..a8bc82340a6 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -112,6 +112,7 @@ def build_extension(self, ext):
     extras_require={
         "test": [
             "pytest",
+            "cupy-cuda12x",
         ]
     },
     cmdclass={
diff --git a/python/cuda_parallel/tests/test_reduce_api.py b/python/cuda_parallel/tests/test_reduce_api.py
index 8c63364559c..9eccee8622c 100644
--- a/python/cuda_parallel/tests/test_reduce_api.py
+++ b/python/cuda_parallel/tests/test_reduce_api.py
@@ -2,25 +2,24 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import numpy
-import pytest
-from numba import cuda
-
 # example-begin imports
+import cupy as cp
+import numpy as np
 import cuda.parallel.experimental as cudax
 # example-end imports
 
+import pytest
+
 
 def test_device_reduce():
     # example-begin reduce-min
     def min_op(a, b):
         return a if a < b else b
 
-    dtype = numpy.int32
-    h_init = numpy.array([42], dtype)
-    h_input = numpy.array([8, 6, 7, 5, 3, 0, 9], dtype)
-    d_output = cuda.device_array(1, dtype)
-    d_input = cuda.to_device(h_input)
+    dtype = np.int32
+    h_init = np.array([42], dtype=dtype)
+    d_input = cp.array([8, 6, 7, 5, 3, 0, 9], dtype=dtype)
+    d_output = cp.empty(1, dtype=dtype)
 
     # Instantiate reduction for the given operator and initial value
     reduce_into = cudax.reduce_into(d_output, d_output, min_op, h_init)
@@ -29,12 +28,12 @@ def min_op(a, b):
     temp_storage_size = reduce_into(None, d_input, d_output, h_init)
 
     # Allocate temporary storage
-    d_temp_storage = cuda.device_array(temp_storage_size, dtype=numpy.uint8)
+    d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)
 
     # Run reduction
     reduce_into(d_temp_storage, d_input, d_output, h_init)
 
+    # Check the result is correct
     expected_output = 0
+    assert (d_output == expected_output).all()
     # example-end reduce-min
-    h_output = d_output.copy_to_host()
-    assert h_output[0] == expected_output