diff --git a/python/cuda_parallel/tests/test_reduce.py b/python/cuda_parallel/tests/test_reduce.py
index 65710954b0b..969184a077e 100644
--- a/python/cuda_parallel/tests/test_reduce.py
+++ b/python/cuda_parallel/tests/test_reduce.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

@@ -552,27 +552,18 @@ def binary_op(x, y):
     _ = algorithms.reduce_into(d_in, d_out, binary_op, h_init)


-def test_reduce_with_stream():
-    # Simple cupy stream wrapper that implements the __cuda_stream__ protocol for the purposes of this test
-    class Stream:
-        def __init__(self, cp_stream):
-            self.cp_stream = cp_stream
-
-        def __cuda_stream__(self):
-            return (0, self.cp_stream.ptr)
-
+def test_reduce_with_stream(cuda_stream):
     def add_op(x, y):
         return x + y

     h_init = np.asarray([0], dtype=np.int32)
     h_in = random_int(5, np.int32)

-    stream = cp.cuda.Stream()
-    with stream:
+    cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr)
+    with cp_stream:
         d_in = cp.asarray(h_in)
         d_out = cp.empty(1, dtype=np.int32)

-    stream_wrapper = Stream(stream)
     reduce_into = algorithms.reduce_into(
         d_in=d_in, d_out=d_out, op=add_op, h_init=h_init
     )
@@ -582,13 +573,13 @@ def add_op(x, y):
         d_out=d_out,
         num_items=d_in.size,
         h_init=h_init,
-        stream=stream_wrapper,
+        stream=cuda_stream,
     )

-    with stream:
+    with cp_stream:
         d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

-    reduce_into(d_temp_storage, d_in, d_out, d_in.size, h_init, stream=stream_wrapper)
-    with stream:
+    reduce_into(d_temp_storage, d_in, d_out, d_in.size, h_init, stream=cuda_stream)
+    with cp_stream:
         cp.testing.assert_allclose(d_in.sum().get(), d_out.get())
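
The diff replaces the test-local `Stream` wrapper with a shared `cuda_stream` pytest fixture, whose definition is not part of this diff. A minimal sketch of what such a fixture could look like, assuming it lives in the test suite's conftest.py; the fixture name, the `ptr` property, and the use of `pytest.fixture` are inferred from how the updated test consumes the object (it reads `cuda_stream.ptr` and passes the object as `stream=`), while `__cuda_stream__` mirrors the wrapper removed above.

# Hypothetical conftest.py sketch -- the real fixture definition is not shown in this diff.
import cupy as cp
import pytest


class Stream:
    """Wraps a CuPy stream and exposes the __cuda_stream__ protocol."""

    def __init__(self, cp_stream):
        self.cp_stream = cp_stream

    def __cuda_stream__(self):
        # (protocol version, raw CUDA stream pointer)
        return (0, self.cp_stream.ptr)

    @property
    def ptr(self):
        # The test rebuilds a CuPy view of this stream via cp.cuda.ExternalStream(cuda_stream.ptr).
        return self.cp_stream.ptr


@pytest.fixture
def cuda_stream():
    # Fresh non-default stream per test.
    return Stream(cp.cuda.Stream())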