NVIDIA · gevtushenko · Oct 4, 2024 · Sep 18, 2024 · Sep 19, 2024 · Sep 19, 2024
@@ -184,8 +184,16 @@ class _CCCLDeviceReduceBuildResult(ctypes.Structure):
                 ("reduction_kernel", ctypes.c_void_p)]
 
 
+def _dtype_validation(dt1, dt2):  # dt1: dtype recorded in __init__; dt2: dtype passed to __call__
+    if dt1 != dt2:
+        raise TypeError(f"dtype mismatch: __init__={dt1}, __call__={dt2}")
+
+
 class _Reduce:
     def __init__(self, d_in, d_out, op, init):
+        self._ctor_d_in_dtype = d_in.dtype
+        self._ctor_d_out_dtype = d_out.dtype
+        self._ctor_init_dtype = init.dtype
         cc_major, cc_minor = cuda.get_current_device().compute_capability
         cub_path, thrust_path, libcudacxx_path, cuda_include_path = _get_paths()
         bindings = _get_bindings()
@@ -212,7 +220,9 @@ def __init__(self, d_in, d_out, op, init):
             raise ValueError('Error building reduce')
 
     def __call__(self, temp_storage, d_in, d_out, init):
-        # TODO Assert that types match the ones used in the constructor
+        _dtype_validation(self._ctor_d_in_dtype, d_in.dtype)
+        _dtype_validation(self._ctor_d_out_dtype, d_out.dtype)
+        _dtype_validation(self._ctor_init_dtype, init.dtype)
         bindings = _get_bindings()
         if temp_storage is None:
             temp_storage_bytes = ctypes.c_size_t()

@@ -11,19 +11,21 @@
 # example-end imports
 
 
-def test_device_reduce():
-    # example-begin reduce-min
-    def op(a, b):
-        return a if a < b else b
+# example-begin reduce-min
+def min_op(a, b):  # binary operator for the min-reduction example: yields b unless a < b
+    return a if a < b else b
+# example-end reduce-min
 
+
+def test_device_reduce_success():
     dtype = numpy.int32
     h_init = numpy.array([42], dtype)
-    h_input = numpy.array([8, 6, 7, 5, 3, 0, 9])
+    h_input = numpy.array([8, 6, 7, 5, 3, 0, 9], dtype)
     d_output = cuda.device_array(1, dtype)
     d_input = cuda.to_device(h_input)
 
     # Instantiate reduction for the given operator and initial value
-    reduce_into = cudax.reduce_into(d_output, d_output, op, h_init)
+    reduce_into = cudax.reduce_into(d_output, d_output, min_op, h_init)
 
     # Determine temporary device storage requirements
     temp_storage_size = reduce_into(None, d_input, d_output, h_init)
@@ -38,3 +40,17 @@ def op(a, b):
     # example-end reduce-min
     h_output = d_output.copy_to_host()
     assert h_output[0] == expected_output
+
+
+def test_device_reduce_dtype_mismatch():  # __call__ must reject a dtype that differs from __init__
+    dtypes = [numpy.int32, numpy.int64]
+    h_inits = [numpy.array([], dt) for dt in dtypes]
+    h_inputs = [numpy.array([], dt) for dt in dtypes]
+    d_outputs = [cuda.device_array(1, dt) for dt in dtypes]
+    d_inputs = [cuda.to_device(h_inp) for h_inp in h_inputs]
+
+    reduce_into = cudax.reduce_into(d_inputs[0], d_outputs[0], min_op, h_inits[0])
+
+    for ix in range(3):  # ix selects the single argument (d_in, d_out, init) that gets the int64 variant
+        with pytest.raises(TypeError, match=r"dtype mismatch: __init__=int32, __call__=int64"):
+            reduce_into(None, d_inputs[int(ix == 0)], d_outputs[int(ix == 1)], h_inits[int(ix == 2)])