Skip to content

Switch to use CUDA driver APIs in Device constructor #460

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Jun 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 30 additions & 14 deletions cuda_core/cuda/core/experimental/_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from cuda.core.experimental._utils.cuda_utils import (
ComputeCapability,
CUDAError,
_check_driver_error,
driver,
handle_return,
precondition,
Expand Down Expand Up @@ -930,6 +931,10 @@ def multicast_supported(self) -> bool:
return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED))


_SUCCESS = driver.CUresult.CUDA_SUCCESS
_INVALID_CTX = driver.CUresult.CUDA_ERROR_INVALID_CONTEXT


class Device:
"""Represent a GPU and act as an entry point for cuda.core features.

Expand Down Expand Up @@ -959,7 +964,7 @@ class Device:

__slots__ = ("_id", "_mr", "_has_inited", "_properties")

def __new__(cls, device_id=None):
def __new__(cls, device_id: Optional[int] = None):
global _is_cuInit
if _is_cuInit is False:
with _lock:
Expand All @@ -968,26 +973,34 @@ def __new__(cls, device_id=None):

# important: creating a Device instance does not initialize the GPU!
if device_id is None:
device_id = handle_return(runtime.cudaGetDevice())
assert_type(device_id, int)
else:
total = handle_return(runtime.cudaGetDeviceCount())
assert_type(device_id, int)
if not (0 <= device_id < total):
raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
err, dev = driver.cuCtxGetDevice()
if err == _SUCCESS:
device_id = int(dev)
elif err == _INVALID_CTX:
ctx = handle_return(driver.cuCtxGetCurrent())
assert int(ctx) == 0
device_id = 0 # cudart behavior
else:
_check_driver_error(err)
elif device_id < 0:
raise ValueError(f"device_id must be >= 0, got {device_id}")

# ensure Device is singleton
if not hasattr(_tls, "devices"):
total = handle_return(runtime.cudaGetDeviceCount())
_tls.devices = []
try:
devices = _tls.devices
except AttributeError:
total = handle_return(driver.cuDeviceGetCount())
devices = _tls.devices = []
for dev_id in range(total):
dev = super().__new__(cls)
dev._id = dev_id
# If the device is in TCC mode, or does not support memory pools for some other reason,
# use the SynchronousMemoryResource which does not use memory pools.
if (
handle_return(
runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
driver.cuDeviceGetAttribute(
driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id
)
)
) == 1:
dev._mr = _DefaultAsyncMempool(dev_id)
Expand All @@ -996,9 +1009,12 @@ def __new__(cls, device_id=None):

dev._has_inited = False
dev._properties = None
_tls.devices.append(dev)
devices.append(dev)

return _tls.devices[device_id]
try:
return devices[device_id]
except IndexError:
raise ValueError(f"device_id must be within [0, {len(devices)}), got {device_id}") from None

def _check_context_initialized(self, *args, **kwargs):
if not self._has_inited:
Expand Down
5 changes: 3 additions & 2 deletions cuda_core/docs/source/release/0.3.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ New features

- :class:`Kernel` adds :attr:`Kernel.num_arguments` and :attr:`Kernel.arguments_info` for introspection of kernel arguments. (#612)
- Add pythonic access to kernel occupancy calculation functions via :attr:`Kernel.occupancy`. (#648)
- Support launching cooperative kernels by setting :property:`LaunchConfig.cooperative_launch` to `True`.
- Support launching cooperative kernels by setting :attr:`LaunchConfig.cooperative_launch` to ``True``.
- A name can be assigned to :class:`ObjectCode` instances generated by both :class:`Program` and :class:`Linker` through their respective
options.

Expand All @@ -34,5 +34,6 @@ New examples
Fixes and enhancements
----------------------

- An :class:`Event` can now be used to look up its corresponding device and context using the ``.device`` and ``.context`` attributes respectively.
- Look-up of :attr:`Event.device` and :attr:`Event.context` (the device and CUDA context on which an event was created) is now possible.
- The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed.
- The :class:`Device` constructor is now faster.
Loading