From 2afcb2051e7e9471fa36d06d2910328d9520ba8a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 21 Feb 2025 00:16:27 +0000 Subject: [PATCH 1/9] cache cc to speed it up --- cuda_core/cuda/core/experimental/_device.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 0cbd462cd..36a111e3e 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -1029,13 +1029,11 @@ def properties(self) -> DeviceProperties: @property def compute_capability(self) -> ComputeCapability: """Return a named tuple with 2 fields: major and minor.""" - major = handle_return( - runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id) - ) - minor = handle_return( - runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id) - ) - return ComputeCapability(major, minor) + if "compute_capability" in self.properties._cache: + return self.properties._cache["compute_capability"] + cc = ComputeCapability(self.properties.compute_capability_major, self.properties.compute_capability_minor) + self.properties._cache["compute_capability"] = cc + return cc @property @precondition(_check_context_initialized) From 87405ad907bce2f802e331dd9044b9815264df63 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 21 Feb 2025 19:26:58 +0000 Subject: [PATCH 2/9] avoid using cudart APIs in Device constructor --- cuda_core/cuda/core/experimental/_device.py | 61 +++++++++++++-------- cuda_core/tests/conftest.py | 3 +- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 36a111e3e..d703e0161 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -11,7 +11,8 @@ from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime _tls = threading.local() -_tls_lock = threading.Lock() +_lock = threading.Lock() +_is_cuInit = False class DeviceProperties: @@ -938,37 +939,51 @@ class Device: __slots__ = ("_id", "_mr", "_has_inited", "_properties") def __new__(cls, device_id=None): + global _is_cuInit + if _is_cuInit is False: + with _lock: + handle_return(driver.cuInit(0)) + _is_cuInit = True + # important: creating a Device instance does not initialize the GPU! if device_id is None: - device_id = handle_return(runtime.cudaGetDevice()) + err, dev = driver.cuCtxGetDevice() + if err == 0: + device_id = int(dev) + else: + ctx = handle_return(driver.cuCtxGetCurrent()) + assert int(ctx) == 0 + device_id = 0 # cudart behavior assert isinstance(device_id, int), f"{device_id=}" else: - total = handle_return(runtime.cudaGetDeviceCount()) + total = handle_return(driver.cuDeviceGetCount()) if not isinstance(device_id, int) or not (0 <= device_id < total): raise ValueError(f"device_id must be within [0, {total}), got {device_id}") # ensure Device is singleton - with _tls_lock: - if not hasattr(_tls, "devices"): - total = handle_return(runtime.cudaGetDeviceCount()) - _tls.devices = [] - for dev_id in range(total): - dev = super().__new__(cls) - dev._id = dev_id - # If the device is in TCC mode, or does not support memory pools for some other reason, - # use the SynchronousMemoryResource which does not use memory pools. - if ( - handle_return( - runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0) + if not hasattr(_tls, "devices"): + total = handle_return(driver.cuDeviceGetCount()) + _tls.devices = [] + for dev_id in range(total): + dev = super().__new__(cls) + + dev._id = dev_id + # If the device is in TCC mode, or does not support memory pools for some other reason, + # use the SynchronousMemoryResource which does not use memory pools. + if ( + handle_return( + driver.cuDeviceGetAttribute( + driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id ) - ) == 1: - dev._mr = _DefaultAsyncMempool(dev_id) - else: - dev._mr = _SynchronousMemoryResource(dev_id) - - dev._has_inited = False - dev._properties = None - _tls.devices.append(dev) + ) + ) == 1: + dev._mr = _DefaultAsyncMempool(dev_id) + else: + dev._mr = _SynchronousMemoryResource(dev_id) + dev._has_inited = False + dev._properties = None + + _tls.devices.append(dev) return _tls.devices[device_id] diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index dc50585ab..72bbeae83 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -42,8 +42,7 @@ def _device_unset_current(): return handle_return(driver.cuCtxPopCurrent()) if hasattr(_device._tls, "devices"): - with _device._tls_lock: - del _device._tls.devices + del _device._tls.devices @pytest.fixture(scope="function") From 95777c478ca02d2c94f5e56aced9254872038fbf Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 21 Feb 2025 19:36:25 +0000 Subject: [PATCH 3/9] avoid silly, redundant lock --- cuda_core/cuda/core/experimental/_device.py | 50 ++++++++++++--------- cuda_core/tests/conftest.py | 3 +- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 36a111e3e..74888dca8 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -11,7 +11,8 @@ from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime _tls = threading.local() -_tls_lock = threading.Lock() +_lock = threading.Lock() +_is_cuInit = False class DeviceProperties: @@ -938,6 +939,12 @@ class Device: __slots__ = ("_id", "_mr", "_has_inited", "_properties") def __new__(cls, device_id=None): + global _is_cuInit + if _is_cuInit is False: + with _lock: + handle_return(driver.cuInit(0)) + _is_cuInit = True + # important: creating a Device instance does not initialize the GPU! if device_id is None: device_id = handle_return(runtime.cudaGetDevice()) @@ -948,27 +955,26 @@ def __new__(cls, device_id=None): raise ValueError(f"device_id must be within [0, {total}), got {device_id}") # ensure Device is singleton - with _tls_lock: - if not hasattr(_tls, "devices"): - total = handle_return(runtime.cudaGetDeviceCount()) - _tls.devices = [] - for dev_id in range(total): - dev = super().__new__(cls) - dev._id = dev_id - # If the device is in TCC mode, or does not support memory pools for some other reason, - # use the SynchronousMemoryResource which does not use memory pools. - if ( - handle_return( - runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0) - ) - ) == 1: - dev._mr = _DefaultAsyncMempool(dev_id) - else: - dev._mr = _SynchronousMemoryResource(dev_id) - - dev._has_inited = False - dev._properties = None - _tls.devices.append(dev) + if not hasattr(_tls, "devices"): + total = handle_return(runtime.cudaGetDeviceCount()) + _tls.devices = [] + for dev_id in range(total): + dev = super().__new__(cls) + dev._id = dev_id + # If the device is in TCC mode, or does not support memory pools for some other reason, + # use the SynchronousMemoryResource which does not use memory pools. + if ( + handle_return( + runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0) + ) + ) == 1: + dev._mr = _DefaultAsyncMempool(dev_id) + else: + dev._mr = _SynchronousMemoryResource(dev_id) + + dev._has_inited = False + dev._properties = None + _tls.devices.append(dev) return _tls.devices[device_id] diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index dc50585ab..72bbeae83 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -42,8 +42,7 @@ def _device_unset_current(): return handle_return(driver.cuCtxPopCurrent()) if hasattr(_device._tls, "devices"): - with _device._tls_lock: - del _device._tls.devices + del _device._tls.devices @pytest.fixture(scope="function") From c9fac0b662577425d16074d1ad549e63693164fb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 24 May 2025 02:11:39 +0000 Subject: [PATCH 4/9] minor perf opt: try-except + skip assert --- cuda_core/cuda/core/experimental/_device.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 428757374..cba23d5a1 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -964,19 +964,19 @@ def __new__(cls, device_id=None): ctx = handle_return(driver.cuCtxGetCurrent()) assert int(ctx) == 0 device_id = 0 # cudart behavior - assert isinstance(device_id, int), f"{device_id=}" else: total = handle_return(driver.cuDeviceGetCount()) if not isinstance(device_id, int) or not (0 <= device_id < total): raise ValueError(f"device_id must be within [0, {total}), got {device_id}") # ensure Device is singleton - if not hasattr(_tls, "devices"): + try: + devices = _tls.devices + except AttributeError: total = handle_return(driver.cuDeviceGetCount()) - _tls.devices = [] + devices = _tls.devices = [] for dev_id in range(total): dev = super().__new__(cls) - dev._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. @@ -990,12 +990,12 @@ def __new__(cls, device_id=None): dev._mr = _DefaultAsyncMempool(dev_id) else: dev._mr = _SynchronousMemoryResource(dev_id) + dev._has_inited = False dev._properties = None + devices.append(dev) - _tls.devices.append(dev) - - return _tls.devices[device_id] + return devices[device_id] def _check_context_initialized(self, *args, **kwargs): if not self._has_inited: From 6b245ff1784004816c32429ccaf6517218e03ace Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 6 Jun 2025 15:22:01 +0000 Subject: [PATCH 5/9] also optimize for explicit dev id --- cuda_core/cuda/core/experimental/_device.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index cba23d5a1..4ec612ac5 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -13,6 +13,7 @@ from cuda.core.experimental._utils.cuda_utils import ( ComputeCapability, CUDAError, + _check_driver_error, driver, handle_return, precondition, @@ -948,7 +949,7 @@ class Device: __slots__ = ("_id", "_mr", "_has_inited", "_properties") - def __new__(cls, device_id=None): + def __new__(cls, device_id: int = None): global _is_cuInit if _is_cuInit is False: with _lock: @@ -960,14 +961,14 @@ def __new__(cls, device_id=None): err, dev = driver.cuCtxGetDevice() if err == 0: device_id = int(dev) - else: + elif err == 201: # CUDA_ERROR_INVALID_CONTEXT ctx = handle_return(driver.cuCtxGetCurrent()) assert int(ctx) == 0 device_id = 0 # cudart behavior - else: - total = handle_return(driver.cuDeviceGetCount()) - if not isinstance(device_id, int) or not (0 <= device_id < total): - raise ValueError(f"device_id must be within [0, {total}), got {device_id}") + else: + _check_driver_error(err) + elif device_id < 0: + raise ValueError(f"device_id must be >= 0, got {device_id}") # ensure Device is singleton try: @@ -995,7 +996,10 @@ def __new__(cls, device_id=None): dev._properties = None devices.append(dev) - return devices[device_id] + try: + return devices[device_id] + except IndexError: + raise ValueError(f"device_id must be within [0, {len(devices)}), got {device_id}") from None def _check_context_initialized(self, *args, **kwargs): if not self._has_inited: From d70ec240188afa97c1ee7e26b411d3a60f8808dd Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 6 Jun 2025 15:36:58 +0000 Subject: [PATCH 6/9] update release notes --- cuda_core/docs/source/release/0.3.0-notes.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cuda_core/docs/source/release/0.3.0-notes.rst b/cuda_core/docs/source/release/0.3.0-notes.rst index 0f8cc77ae..4580cc1ec 100644 --- a/cuda_core/docs/source/release/0.3.0-notes.rst +++ b/cuda_core/docs/source/release/0.3.0-notes.rst @@ -22,7 +22,7 @@ New features - :class:`Kernel` adds :attr:`Kernel.num_arguments` and :attr:`Kernel.arguments_info` for introspection of kernel arguments. (#612) - Add pythonic access to kernel occupancy calculation functions via :attr:`Kernel.occupancy`. (#648) -- Support launching cooperative kernels by setting :property:`LaunchConfig.cooperative_launch` to `True`. +- Support launching cooperative kernels by setting :attr:`LaunchConfig.cooperative_launch` to `True`. - A name can be assigned to :class:`ObjectCode` instances generated by both :class:`Program` and :class:`Linker` through their respective options. @@ -34,5 +34,6 @@ New examples Fixes and enhancements ---------------------- -- An :class:`Event` can now be used to look up its corresponding device and context using the ``.device`` and ``.context`` attributes respectively. +- Look-up of the :attr:`Event.device` and :attr:`Event.context` (the device and CUDA context where an event was created from) is now possible. - The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed. +- The :class:`Device` constructor is made faster. From d279e506bc3d8f6005324b26e18c1206b82c5495 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 6 Jun 2025 17:03:51 +0000 Subject: [PATCH 7/9] debug sanitizer --- ci/tools/env-vars | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/tools/env-vars b/ci/tools/env-vars index 7211dc08f..1771719d6 100755 --- a/ci/tools/env-vars +++ b/ci/tools/env-vars @@ -60,7 +60,8 @@ elif [[ "${1}" == "test" ]]; then # We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix # Only local ctk installs have compute-sanitizer; there is no wheel for it if [[ "${PY_VER}" == "3.12" && "${CUDA_VER}" != "11.8.0" && "${LOCAL_CTK}" == 1 && "${HOST_PLATFORM}" == linux* ]]; then - echo "LATEST_CUDA_VERSION=$(bash .github/workflows/guess_latest.sh)" >> $GITHUB_ENV + #echo "LATEST_CUDA_VERSION=$(bash .github/workflows/guess_latest.sh)" >> $GITHUB_ENV + echo "LATEST_CUDA_VERSION=12.9.0" >> $GITHUB_ENV SETUP_SANITIZER=1 else SETUP_SANITIZER=0 From 708fd70fd6cb5be142f3651ff9b6ab0e9ec92440 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Jun 2025 02:14:10 +0000 Subject: [PATCH 8/9] fix type hint; compare against enums --- cuda_core/cuda/core/experimental/_device.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index d2191601e..7451f1ddc 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -931,6 +931,10 @@ def multicast_supported(self) -> bool: return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED)) +_SUCCESS = driver.CUresult.CUDA_SUCCESS +_INVALID_CTX = driver.CUresult.CUDA_ERROR_INVALID_CONTEXT + + class Device: """Represent a GPU and act as an entry point for cuda.core features. @@ -960,7 +964,7 @@ class Device: __slots__ = ("_id", "_mr", "_has_inited", "_properties") - def __new__(cls, device_id: int = None): + def __new__(cls, device_id: Optional[int] = None): global _is_cuInit if _is_cuInit is False: with _lock: @@ -970,9 +974,9 @@ def __new__(cls, device_id: int = None): # important: creating a Device instance does not initialize the GPU! if device_id is None: err, dev = driver.cuCtxGetDevice() - if err == 0: + if err == _SUCCESS: device_id = int(dev) - elif err == 201: # CUDA_ERROR_INVALID_CONTEXT + elif err == _INVALID_CTX: ctx = handle_return(driver.cuCtxGetCurrent()) assert int(ctx) == 0 device_id = 0 # cudart behavior From 4015f9c82ead3acb33b3c1ecf1707431b6d09afe Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 7 Jun 2025 02:15:37 +0000 Subject: [PATCH 9/9] Revert "debug sanitizer" This reverts commit d279e506bc3d8f6005324b26e18c1206b82c5495. --- ci/tools/env-vars | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/tools/env-vars b/ci/tools/env-vars index 1771719d6..7211dc08f 100755 --- a/ci/tools/env-vars +++ b/ci/tools/env-vars @@ -60,8 +60,7 @@ elif [[ "${1}" == "test" ]]; then # We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix # Only local ctk installs have compute-sanitizer; there is no wheel for it if [[ "${PY_VER}" == "3.12" && "${CUDA_VER}" != "11.8.0" && "${LOCAL_CTK}" == 1 && "${HOST_PLATFORM}" == linux* ]]; then - #echo "LATEST_CUDA_VERSION=$(bash .github/workflows/guess_latest.sh)" >> $GITHUB_ENV - echo "LATEST_CUDA_VERSION=12.9.0" >> $GITHUB_ENV + echo "LATEST_CUDA_VERSION=$(bash .github/workflows/guess_latest.sh)" >> $GITHUB_ENV SETUP_SANITIZER=1 else SETUP_SANITIZER=0