From 2afcb2051e7e9471fa36d06d2910328d9520ba8a Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 21 Feb 2025 00:16:27 +0000
Subject: [PATCH 1/9] cache cc to speed it up

---
 cuda_core/cuda/core/experimental/_device.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 0cbd462cd..36a111e3e 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -1029,13 +1029,11 @@ def properties(self) -> DeviceProperties:
     @property
     def compute_capability(self) -> ComputeCapability:
         """Return a named tuple with 2 fields: major and minor."""
-        major = handle_return(
-            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)
-        )
-        minor = handle_return(
-            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id)
-        )
-        return ComputeCapability(major, minor)
+        if "compute_capability" in self.properties._cache:
+            return self.properties._cache["compute_capability"]
+        cc = ComputeCapability(self.properties.compute_capability_major, self.properties.compute_capability_minor)
+        self.properties._cache["compute_capability"] = cc
+        return cc
 
     @property
     @precondition(_check_context_initialized)

From 87405ad907bce2f802e331dd9044b9815264df63 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 21 Feb 2025 19:26:58 +0000
Subject: [PATCH 2/9] avoid using cudart APIs in Device constructor

---
 cuda_core/cuda/core/experimental/_device.py | 61 +++++++++++++--------
 cuda_core/tests/conftest.py                 |  3 +-
 2 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 36a111e3e..d703e0161 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -11,7 +11,8 @@
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime
 
 _tls = threading.local()
-_tls_lock = threading.Lock()
+_lock = threading.Lock()
+_is_cuInit = False
 
 
 class DeviceProperties:
@@ -938,37 +939,51 @@ class Device:
     __slots__ = ("_id", "_mr", "_has_inited", "_properties")
 
     def __new__(cls, device_id=None):
+        global _is_cuInit
+        if _is_cuInit is False:
+            with _lock:
+                handle_return(driver.cuInit(0))
+                _is_cuInit = True
+
         # important: creating a Device instance does not initialize the GPU!
         if device_id is None:
-            device_id = handle_return(runtime.cudaGetDevice())
+            err, dev = driver.cuCtxGetDevice()
+            if err == 0:
+                device_id = int(dev)
+            else:
+                ctx = handle_return(driver.cuCtxGetCurrent())
+                assert int(ctx) == 0
+                device_id = 0  # cudart behavior
             assert isinstance(device_id, int), f"{device_id=}"
         else:
-            total = handle_return(runtime.cudaGetDeviceCount())
+            total = handle_return(driver.cuDeviceGetCount())
             if not isinstance(device_id, int) or not (0 <= device_id < total):
                 raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
 
         # ensure Device is singleton
-        with _tls_lock:
-            if not hasattr(_tls, "devices"):
-                total = handle_return(runtime.cudaGetDeviceCount())
-                _tls.devices = []
-                for dev_id in range(total):
-                    dev = super().__new__(cls)
-                    dev._id = dev_id
-                    # If the device is in TCC mode, or does not support memory pools for some other reason,
-                    # use the SynchronousMemoryResource which does not use memory pools.
-                    if (
-                        handle_return(
-                            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
+        if not hasattr(_tls, "devices"):
+            total = handle_return(driver.cuDeviceGetCount())
+            _tls.devices = []
+            for dev_id in range(total):
+                dev = super().__new__(cls)
+
+                dev._id = dev_id
+                # If the device is in TCC mode, or does not support memory pools for some other reason,
+                # use the SynchronousMemoryResource which does not use memory pools.
+                if (
+                    handle_return(
+                        driver.cuDeviceGetAttribute(
+                            driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id
                         )
-                    ) == 1:
-                        dev._mr = _DefaultAsyncMempool(dev_id)
-                    else:
-                        dev._mr = _SynchronousMemoryResource(dev_id)
-
-                    dev._has_inited = False
-                    dev._properties = None
-                    _tls.devices.append(dev)
+                    )
+                ) == 1:
+                    dev._mr = _DefaultAsyncMempool(dev_id)
+                else:
+                    dev._mr = _SynchronousMemoryResource(dev_id)
+                dev._has_inited = False
+                dev._properties = None
+
+                _tls.devices.append(dev)
 
         return _tls.devices[device_id]
 
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index dc50585ab..72bbeae83 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -42,8 +42,7 @@ def _device_unset_current():
         return
     handle_return(driver.cuCtxPopCurrent())
     if hasattr(_device._tls, "devices"):
-        with _device._tls_lock:
-            del _device._tls.devices
+        del _device._tls.devices
 
 
 @pytest.fixture(scope="function")

From 95777c478ca02d2c94f5e56aced9254872038fbf Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 21 Feb 2025 19:36:25 +0000
Subject: [PATCH 3/9] avoid silly, redundant lock

---
 cuda_core/cuda/core/experimental/_device.py | 50 ++++++++++++---------
 cuda_core/tests/conftest.py                 |  3 +-
 2 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 36a111e3e..74888dca8 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -11,7 +11,8 @@
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime
 
 _tls = threading.local()
-_tls_lock = threading.Lock()
+_lock = threading.Lock()
+_is_cuInit = False
 
 
 class DeviceProperties:
@@ -938,6 +939,12 @@ class Device:
     __slots__ = ("_id", "_mr", "_has_inited", "_properties")
 
     def __new__(cls, device_id=None):
+        global _is_cuInit
+        if _is_cuInit is False:
+            with _lock:
+                handle_return(driver.cuInit(0))
+                _is_cuInit = True
+
         # important: creating a Device instance does not initialize the GPU!
         if device_id is None:
             device_id = handle_return(runtime.cudaGetDevice())
@@ -948,27 +955,26 @@ def __new__(cls, device_id=None):
                 raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
 
         # ensure Device is singleton
-        with _tls_lock:
-            if not hasattr(_tls, "devices"):
-                total = handle_return(runtime.cudaGetDeviceCount())
-                _tls.devices = []
-                for dev_id in range(total):
-                    dev = super().__new__(cls)
-                    dev._id = dev_id
-                    # If the device is in TCC mode, or does not support memory pools for some other reason,
-                    # use the SynchronousMemoryResource which does not use memory pools.
-                    if (
-                        handle_return(
-                            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
-                        )
-                    ) == 1:
-                        dev._mr = _DefaultAsyncMempool(dev_id)
-                    else:
-                        dev._mr = _SynchronousMemoryResource(dev_id)
-
-                    dev._has_inited = False
-                    dev._properties = None
-                    _tls.devices.append(dev)
+        if not hasattr(_tls, "devices"):
+            total = handle_return(runtime.cudaGetDeviceCount())
+            _tls.devices = []
+            for dev_id in range(total):
+                dev = super().__new__(cls)
+                dev._id = dev_id
+                # If the device is in TCC mode, or does not support memory pools for some other reason,
+                # use the SynchronousMemoryResource which does not use memory pools.
+                if (
+                    handle_return(
+                        runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
+                    )
+                ) == 1:
+                    dev._mr = _DefaultAsyncMempool(dev_id)
+                else:
+                    dev._mr = _SynchronousMemoryResource(dev_id)
+
+                dev._has_inited = False
+                dev._properties = None
+                _tls.devices.append(dev)
 
         return _tls.devices[device_id]
 
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index dc50585ab..72bbeae83 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -42,8 +42,7 @@ def _device_unset_current():
         return
     handle_return(driver.cuCtxPopCurrent())
     if hasattr(_device._tls, "devices"):
-        with _device._tls_lock:
-            del _device._tls.devices
+        del _device._tls.devices
 
 
 @pytest.fixture(scope="function")

From c9fac0b662577425d16074d1ad549e63693164fb Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 24 May 2025 02:11:39 +0000
Subject: [PATCH 4/9] minor perf opt: try-except + skip assert

---
 cuda_core/cuda/core/experimental/_device.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 428757374..cba23d5a1 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -964,19 +964,19 @@ def __new__(cls, device_id=None):
                 ctx = handle_return(driver.cuCtxGetCurrent())
                 assert int(ctx) == 0
                 device_id = 0  # cudart behavior
-            assert isinstance(device_id, int), f"{device_id=}"
         else:
             total = handle_return(driver.cuDeviceGetCount())
             if not isinstance(device_id, int) or not (0 <= device_id < total):
                 raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
 
         # ensure Device is singleton
-        if not hasattr(_tls, "devices"):
+        try:
+            devices = _tls.devices
+        except AttributeError:
             total = handle_return(driver.cuDeviceGetCount())
-            _tls.devices = []
+            devices = _tls.devices = []
             for dev_id in range(total):
                 dev = super().__new__(cls)
-
                 dev._id = dev_id
                 # If the device is in TCC mode, or does not support memory pools for some other reason,
                 # use the SynchronousMemoryResource which does not use memory pools.
@@ -990,12 +990,12 @@ def __new__(cls, device_id=None):
                     dev._mr = _DefaultAsyncMempool(dev_id)
                 else:
                     dev._mr = _SynchronousMemoryResource(dev_id)
+
                 dev._has_inited = False
                 dev._properties = None
+                devices.append(dev)
 
-                _tls.devices.append(dev)
-
-        return _tls.devices[device_id]
+        return devices[device_id]
 
     def _check_context_initialized(self, *args, **kwargs):
         if not self._has_inited:

From 6b245ff1784004816c32429ccaf6517218e03ace Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 6 Jun 2025 15:22:01 +0000
Subject: [PATCH 5/9] also optimize for explicit dev id

---
 cuda_core/cuda/core/experimental/_device.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index cba23d5a1..4ec612ac5 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -13,6 +13,7 @@
 from cuda.core.experimental._utils.cuda_utils import (
     ComputeCapability,
     CUDAError,
+    _check_driver_error,
     driver,
     handle_return,
     precondition,
@@ -948,7 +949,7 @@ class Device:
 
     __slots__ = ("_id", "_mr", "_has_inited", "_properties")
 
-    def __new__(cls, device_id=None):
+    def __new__(cls, device_id: int = None):
         global _is_cuInit
         if _is_cuInit is False:
             with _lock:
@@ -960,14 +961,14 @@ def __new__(cls, device_id=None):
             err, dev = driver.cuCtxGetDevice()
             if err == 0:
                 device_id = int(dev)
-            else:
+            elif err == 201:  # CUDA_ERROR_INVALID_CONTEXT
                 ctx = handle_return(driver.cuCtxGetCurrent())
                 assert int(ctx) == 0
                 device_id = 0  # cudart behavior
-        else:
-            total = handle_return(driver.cuDeviceGetCount())
-            if not isinstance(device_id, int) or not (0 <= device_id < total):
-                raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
+            else:
+                _check_driver_error(err)
+        elif device_id < 0:
+            raise ValueError(f"device_id must be >= 0, got {device_id}")
 
         # ensure Device is singleton
         try:
@@ -995,7 +996,10 @@ def __new__(cls, device_id=None):
                 dev._properties = None
                 devices.append(dev)
 
-        return devices[device_id]
+        try:
+            return devices[device_id]
+        except IndexError:
+            raise ValueError(f"device_id must be within [0, {len(devices)}), got {device_id}") from None
 
     def _check_context_initialized(self, *args, **kwargs):
         if not self._has_inited:

From d70ec240188afa97c1ee7e26b411d3a60f8808dd Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 6 Jun 2025 15:36:58 +0000
Subject: [PATCH 6/9] update release notes

---
 cuda_core/docs/source/release/0.3.0-notes.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cuda_core/docs/source/release/0.3.0-notes.rst b/cuda_core/docs/source/release/0.3.0-notes.rst
index 0f8cc77ae..4580cc1ec 100644
--- a/cuda_core/docs/source/release/0.3.0-notes.rst
+++ b/cuda_core/docs/source/release/0.3.0-notes.rst
@@ -22,7 +22,7 @@ New features
 
 - :class:`Kernel` adds :attr:`Kernel.num_arguments` and :attr:`Kernel.arguments_info` for introspection of kernel arguments. (#612)
 - Add pythonic access to kernel occupancy calculation functions via :attr:`Kernel.occupancy`. (#648)
-- Support launching cooperative kernels by setting :property:`LaunchConfig.cooperative_launch` to `True`.
+- Support launching cooperative kernels by setting :attr:`LaunchConfig.cooperative_launch` to ``True``.
 - A name can be assigned to :class:`ObjectCode` instances generated by both :class:`Program` and :class:`Linker` through their respective
   options.
 
@@ -34,5 +34,6 @@ New examples
 Fixes and enhancements
 ----------------------
 
-- An :class:`Event` can now be used to look up its corresponding device and context using the ``.device`` and ``.context`` attributes respectively.
+- Look-up of :attr:`Event.device` and :attr:`Event.context` (the device and CUDA context in which an event was created) is now possible.
 - The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed.
+- The :class:`Device` constructor is now faster.

From d279e506bc3d8f6005324b26e18c1206b82c5495 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 6 Jun 2025 17:03:51 +0000
Subject: [PATCH 7/9] debug sanitizer

---
 ci/tools/env-vars | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci/tools/env-vars b/ci/tools/env-vars
index 7211dc08f..1771719d6 100755
--- a/ci/tools/env-vars
+++ b/ci/tools/env-vars
@@ -60,7 +60,8 @@ elif [[ "${1}" == "test" ]]; then
   # We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix
   # Only local ctk installs have compute-sanitizer; there is no wheel for it
   if [[ "${PY_VER}" == "3.12" && "${CUDA_VER}" != "11.8.0" && "${LOCAL_CTK}" == 1 && "${HOST_PLATFORM}" == linux* ]]; then
-    echo "LATEST_CUDA_VERSION=$(bash .github/workflows/guess_latest.sh)" >> $GITHUB_ENV
+    #echo "LATEST_CUDA_VERSION=$(bash .github/workflows/guess_latest.sh)" >> $GITHUB_ENV
+    echo "LATEST_CUDA_VERSION=12.9.0" >> $GITHUB_ENV
     SETUP_SANITIZER=1
   else
     SETUP_SANITIZER=0

From 708fd70fd6cb5be142f3651ff9b6ab0e9ec92440 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 7 Jun 2025 02:14:10 +0000
Subject: [PATCH 8/9] fix type hint; compare against enums

---
 cuda_core/cuda/core/experimental/_device.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index d2191601e..7451f1ddc 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -931,6 +931,10 @@ def multicast_supported(self) -> bool:
         return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED))
 
 
+_SUCCESS = driver.CUresult.CUDA_SUCCESS
+_INVALID_CTX = driver.CUresult.CUDA_ERROR_INVALID_CONTEXT
+
+
 class Device:
     """Represent a GPU and act as an entry point for cuda.core features.
 
@@ -960,7 +964,7 @@ class Device:
 
     __slots__ = ("_id", "_mr", "_has_inited", "_properties")
 
-    def __new__(cls, device_id: int = None):
+    def __new__(cls, device_id: Optional[int] = None):
         global _is_cuInit
         if _is_cuInit is False:
             with _lock:
@@ -970,9 +974,9 @@ def __new__(cls, device_id: int = None):
         # important: creating a Device instance does not initialize the GPU!
         if device_id is None:
             err, dev = driver.cuCtxGetDevice()
-            if err == 0:
+            if err == _SUCCESS:
                 device_id = int(dev)
-            elif err == 201:  # CUDA_ERROR_INVALID_CONTEXT
+            elif err == _INVALID_CTX:
                 ctx = handle_return(driver.cuCtxGetCurrent())
                 assert int(ctx) == 0
                 device_id = 0  # cudart behavior

From 4015f9c82ead3acb33b3c1ecf1707431b6d09afe Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 7 Jun 2025 02:15:37 +0000
Subject: [PATCH 9/9] Revert "debug sanitizer"

This reverts commit d279e506bc3d8f6005324b26e18c1206b82c5495.
---
 ci/tools/env-vars | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ci/tools/env-vars b/ci/tools/env-vars
index 1771719d6..7211dc08f 100755
--- a/ci/tools/env-vars
+++ b/ci/tools/env-vars
@@ -60,8 +60,7 @@ elif [[ "${1}" == "test" ]]; then
   # We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix
   # Only local ctk installs have compute-sanitizer; there is no wheel for it
   if [[ "${PY_VER}" == "3.12" && "${CUDA_VER}" != "11.8.0" && "${LOCAL_CTK}" == 1 && "${HOST_PLATFORM}" == linux* ]]; then
-    #echo "LATEST_CUDA_VERSION=$(bash .github/workflows/guess_latest.sh)" >> $GITHUB_ENV
-    echo "LATEST_CUDA_VERSION=12.9.0" >> $GITHUB_ENV
+    echo "LATEST_CUDA_VERSION=$(bash .github/workflows/guess_latest.sh)" >> $GITHUB_ENV
     SETUP_SANITIZER=1
   else
     SETUP_SANITIZER=0