Merge pull request #17 from fjarri/dynamic-mem

fjarri · web-flow · commit 7b7adddcd491 · 2024-07-27T12:59:55.000-10:00
Rename `local_mem` to `cu_dynamic_local_mem`
diff --git a/docs/history.rst b/docs/history.rst
@@ -2,6 +2,19 @@ Version history
 ===============
 
 
+0.5.0 (unreleased)
+------------------
+
+Changed
+^^^^^^^
+
+* ``local_mem`` keyword parameter of kernel calls renamed to ``cu_dynamic_local_mem``. (PR_17_)
+
+
+.. _PR_17: https://github.com/fjarri/grunnur/pull/17
+
+
+
 0.4.0 (25 Jul 2024)
 -------------------
 
diff --git a/grunnur/adapter_base.py b/grunnur/adapter_base.py
@@ -404,6 +404,6 @@ def __call__(
         self,
         queue_adapter: QueueAdapter,
         *args: BufferAdapter | numpy.generic,
-        local_mem: int = 0,
+        cu_dynamic_local_mem: int = 0,
     ) -> Any:
         pass
diff --git a/grunnur/adapter_cuda.py b/grunnur/adapter_cuda.py
@@ -701,7 +701,7 @@ def __call__(
         self,
         queue_adapter: QueueAdapter,
         *args: BufferAdapter | numpy.generic,
-        local_mem: int = 0,
+        cu_dynamic_local_mem: int = 0,
     ) -> None:
         # Will be checked in the upper levels
         assert isinstance(queue_adapter, CuQueueAdapter)  # noqa: S101
@@ -724,5 +724,5 @@ def __call__(
             grid=self._grid,
             block=self._block,
             stream=queue_adapter._pycuda_stream,  # noqa: SLF001
-            shared=local_mem,
+            shared=cu_dynamic_local_mem,
         )
diff --git a/grunnur/adapter_opencl.py b/grunnur/adapter_opencl.py
@@ -632,11 +632,13 @@ def __call__(
         self,
         queue_adapter: QueueAdapter,
         *args: BufferAdapter | numpy.generic,
-        local_mem: int = 0,
+        cu_dynamic_local_mem: int = 0,
     ) -> pyopencl.Event:
-        # Local memory size is passed via regular kernel arguments in OpenCL.
-        # Should be checked in `PreparedKernel`.
-        assert local_mem == 0  # noqa: S101
+        if cu_dynamic_local_mem != 0:
+            raise ValueError(
+                "`cu_dynamic_local_mem` must be zero for OpenCL kernels; "
+                "dynamic local memory allocation is not supported"
+            )
 
         # We have to keep the signature more general because of the base class,
         # but the upper levels will ensure this is the case.
diff --git a/grunnur/program.py b/grunnur/program.py
@@ -314,7 +314,7 @@ def __call__(
         self,
         queue: Queue | MultiQueue,
         *args: MultiArray | Array | Buffer | numpy.generic,
-        local_mem: int = 0,
+        cu_dynamic_local_mem: int = 0,
     ) -> Any:
         """
         Enqueues the kernel on the devices in the given queue.
@@ -332,8 +332,10 @@ def __call__(
         If an argument is a integer-keyed ``dict``, its values corresponding to the
         device indices the kernel is executed on will be passed as kernel arguments.
 
+        :param cu_dynamic_local_mem: **CUDA only.** The size, in bytes, of dynamically allocated
+            local (shared in CUDA terms) memory, that is, of ``extern __shared__`` arrays in
+            CUDA kernels. Must be zero for OpenCL kernels, otherwise ``ValueError`` is raised.
         :param args: kernel arguments.
-        :param kwds: backend-specific keyword parameters.
         :returns: a list of ``Event`` objects for enqueued kernels in case of PyOpenCL.
         """
         if isinstance(queue, Queue):
@@ -357,7 +359,11 @@ def __call__(
             single_queue = queue.queues[device]
 
             pkernel = self._prepared_kernel_adapters[device]
-            ret_val = pkernel(single_queue._queue_adapter, *kernel_args, local_mem=local_mem)  # noqa: SLF001
+            ret_val = pkernel(
+                single_queue._queue_adapter,  # noqa: SLF001
+                *kernel_args,
+                cu_dynamic_local_mem=cu_dynamic_local_mem,
+            )
             ret_vals.append(ret_val)
 
         return ret_vals
@@ -455,11 +461,11 @@ def __call__(
         global_size: Sequence[int] | Mapping[BoundDevice, Sequence[int]],
         local_size: Sequence[int] | None | Mapping[BoundDevice, Sequence[int] | None] = None,
         *args: MultiArray | Array | Buffer | numpy.generic,
-        local_mem: int = 0,
+        cu_dynamic_local_mem: int = 0,
     ) -> Any:
         """
         A shortcut for :py:meth:`Kernel.prepare` and subsequent :py:meth:`PreparedKernel.__call__`.
         See their doc entries for details.
         """
         pkernel = self.prepare(global_size, local_size)
-        return pkernel(queue, *args, local_mem=local_mem)
+        return pkernel(queue, *args, cu_dynamic_local_mem=cu_dynamic_local_mem)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "grunnur"
-version = "0.4.0"
+version = "0.5.0.dev"
 description = "Uniform API for PyOpenCL and PyCUDA."
 authors = [
     {name = "Bogdan Opanchuk", email = "bogdan@opanchuk.net"},
diff --git a/tests/test_program.py b/tests/test_program.py
@@ -87,7 +87,7 @@ def test_compile(mock_or_real_context, no_prelude):
         assert (res == ref).all()
 
     # Explicit local_size
-    res2_dev = Array.from_host(queue, a)  # Array.empty(queue, length, numpy.int32)
+    res2_dev = Array.empty(context.device, [length], numpy.int32)
     program.kernel.multiply(queue, [length], [length // 2], res2_dev, a_dev, b_dev, c)
     res2 = res2_dev.get(queue)
     if not mocked:
@@ -518,3 +518,19 @@ def test_builtin_globals(mock_backend_pycuda):
 
     assert "max_total_local_size = 1024" in program.sources[context.devices[0]].source
     assert "max_total_local_size = 512" in program.sources[context.devices[1]].source
+
+
+def test_cu_dynamic_local_mem(mock_context):
+    src = MockDefTemplate(kernels=[MockKernel("test", [numpy.int32])])
+    program = Program([mock_context.device], src)
+    queue = Queue(mock_context.device)
+
+    if mock_context.api.id == opencl_api_id():
+        message = (
+            "`cu_dynamic_local_mem` must be zero for OpenCL kernels; "
+            "dynamic local memory allocation is not supported"
+        )
+        with pytest.raises(ValueError, match=message):
+            program.kernel.test(queue, [100], [100], numpy.int32(1), cu_dynamic_local_mem=100)
+    else:
+        program.kernel.test(queue, [100], [100], numpy.int32(1), cu_dynamic_local_mem=100)