diff --git a/cuda_core/cuda/core/experimental/_context.py b/cuda_core/cuda/core/experimental/_context.pyx similarity index 55% rename from cuda_core/cuda/core/experimental/_context.py rename to cuda_core/cuda/core/experimental/_context.pyx index 24e06d69c..d6abf65c1 100644 --- a/cuda_core/cuda/core/experimental/_context.py +++ b/cuda_core/cuda/core/experimental/_context.pyx @@ -13,16 +13,21 @@ class ContextOptions: pass # TODO -class Context: - __slots__ = ("_handle", "_id") +cdef class Context: - def __new__(self, *args, **kwargs): + cdef: + readonly object _handle + int _device_id + + def __init__(self, *args, **kwargs): raise RuntimeError("Context objects cannot be instantiated directly. Please use Device or Stream APIs.") @classmethod - def _from_ctx(cls, obj, dev_id): - assert_type(obj, driver.CUcontext) - ctx = super().__new__(cls) - ctx._handle = obj - ctx._id = dev_id + def _from_ctx(cls, handle: driver.CUcontext, int device_id): + cdef Context ctx = Context.__new__(Context) + ctx._handle = handle + ctx._device_id = device_id return ctx + + def __eq__(self, other): + return int(self._handle) == int(other._handle) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index c9a786070..1a71d998f 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -17,7 +17,6 @@ _check_driver_error, driver, handle_return, - precondition, runtime, ) @@ -1017,12 +1016,31 @@ def __new__(cls, device_id: Optional[int] = None): except IndexError: raise ValueError(f"device_id must be within [0, {len(devices)}), got {device_id}") from None - def _check_context_initialized(self, *args, **kwargs): + def _check_context_initialized(self): if not self._has_inited: raise CUDAError( f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?" ) + def _get_current_context(self, check_consistency=False) -> driver.CUcontext: + err, ctx = driver.cuCtxGetCurrent() + + # TODO: We want to just call this: + # _check_driver_error(err) + # but even the simplest success check causes 50-100 ns. Wait until we cythonize this file... + if ctx is None: + _check_driver_error(err) + + if int(ctx) == 0: + raise CUDAError("No context is bound to the calling CPU thread.") + if check_consistency: + err, dev = driver.cuCtxGetDevice() + if err != _SUCCESS: + handle_return((err,)) + if int(dev) != self._id: + raise CUDAError("Internal error (current device is not equal to Device.device_id)") + return ctx + @property def device_id(self) -> int: """Return device ordinal.""" @@ -1083,7 +1101,6 @@ def compute_capability(self) -> ComputeCapability: return cc @property - @precondition(_check_context_initialized) def context(self) -> Context: """Return the current :obj:`~_context.Context` associated with this device. @@ -1092,9 +1109,8 @@ def context(self) -> Context: Device must be initialized. """ - ctx = handle_return(driver.cuCtxGetCurrent()) - if int(ctx) == 0: - raise CUDAError("No context is bound to the calling CPU thread.") + self._check_context_initialized() + ctx = self._get_current_context(check_consistency=True) return Context._from_ctx(ctx, self._id) @property @@ -1206,8 +1222,7 @@ def create_context(self, options: ContextOptions = None) -> Context: """ raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") - @precondition(_check_context_initialized) - def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions = None) -> Stream: + def create_stream(self, obj: Optional[IsStreamT] = None, options: Optional[StreamOptions] = None) -> Stream: """Create a Stream object. New stream objects can be created in two different ways: @@ -1235,9 +1250,9 @@ def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions Newly created stream object. """ - return Stream._init(obj=obj, options=options) + self._check_context_initialized() + return Stream._init(obj=obj, options=options, device_id=self._id) - @precondition(_check_context_initialized) def create_event(self, options: Optional[EventOptions] = None) -> Event: """Create an Event object without recording it to a Stream. @@ -1256,9 +1271,10 @@ def create_event(self, options: Optional[EventOptions] = None) -> Event: Newly created event object. """ - return Event._init(self._id, self.context._handle, options) + self._check_context_initialized() + ctx = self._get_current_context() + return Event._init(self._id, ctx, options) - @precondition(_check_context_initialized) def allocate(self, size, stream: Optional[Stream] = None) -> Buffer: """Allocate device memory from a specified stream. @@ -1285,11 +1301,11 @@ def allocate(self, size, stream: Optional[Stream] = None) -> Buffer: Newly created buffer object. """ + self._check_context_initialized() if stream is None: stream = default_stream() return self._mr.allocate(size, stream) - @precondition(_check_context_initialized) def sync(self): """Synchronize the device. @@ -1298,9 +1314,9 @@ def sync(self): Device must be initialized. """ + self._check_context_initialized() handle_return(runtime.cudaDeviceSynchronize()) - @precondition(_check_context_initialized) def create_graph_builder(self) -> GraphBuilder: """Create a new :obj:`~_graph.GraphBuilder` object. @@ -1310,4 +1326,5 @@ def create_graph_builder(self) -> GraphBuilder: Newly created graph builder object. """ + self._check_context_initialized() return GraphBuilder._init(stream=self.create_stream(), is_stream_owner=True) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.pyx similarity index 83% rename from cuda_core/cuda/core/experimental/_event.py rename to cuda_core/cuda/core/experimental/_event.pyx index 800f34c9a..74ac2bb89 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -4,20 +4,20 @@ from __future__ import annotations -import weakref +from cuda.core.experimental._utils.cuda_utils cimport ( + _check_driver_error as raise_if_driver_error, + check_or_create_options, +) + from dataclasses import dataclass from typing import TYPE_CHECKING, Optional from cuda.core.experimental._context import Context from cuda.core.experimental._utils.cuda_utils import ( CUDAError, - check_or_create_options, driver, handle_return, ) -from cuda.core.experimental._utils.cuda_utils import ( - _check_driver_error as raise_if_driver_error, -) if TYPE_CHECKING: import cuda.bindings @@ -25,7 +25,7 @@ @dataclass -class EventOptions: +cdef class EventOptions: """Customizable :obj:`~_event.Event` options. Attributes @@ -49,7 +49,7 @@ class EventOptions: support_ipc: Optional[bool] = False -class Event: +cdef class Event: """Represent a record at a specific point of execution within a CUDA stream. Applications can asynchronously record events at any point in @@ -77,49 +77,46 @@ class Event: and they should instead be created through a :obj:`~_stream.Stream` object. """ - - class _MembersNeededForFinalize: - __slots__ = ("handle",) - - def __init__(self, event_obj, handle): - self.handle = handle - weakref.finalize(event_obj, self.close) - - def close(self): - if self.handle is not None: - handle_return(driver.cuEventDestroy(self.handle)) - self.handle = None - - def __new__(self, *args, **kwargs): + cdef: + object _handle + bint _timing_disabled + bint _busy_waited + int _device_id + object _ctx_handle + + def __init__(self, *args, **kwargs): raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).") - __slots__ = ("__weakref__", "_mnff", "_timing_disabled", "_busy_waited", "_device_id", "_ctx_handle") - @classmethod - def _init(cls, device_id: int, ctx_handle: Context, options: Optional[EventOptions] = None): - self = super().__new__(cls) - self._mnff = Event._MembersNeededForFinalize(self, None) - - options = check_or_create_options(EventOptions, options, "Event options") + def _init(cls, device_id: int, ctx_handle: Context, options=None): + cdef Event self = Event.__new__(Event) + cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options") flags = 0x0 self._timing_disabled = False self._busy_waited = False - if not options.enable_timing: + if not opts.enable_timing: flags |= driver.CUevent_flags.CU_EVENT_DISABLE_TIMING self._timing_disabled = True - if options.busy_waited_sync: + if opts.busy_waited_sync: flags |= driver.CUevent_flags.CU_EVENT_BLOCKING_SYNC self._busy_waited = True - if options.support_ipc: + if opts.support_ipc: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103") - self._mnff.handle = handle_return(driver.cuEventCreate(flags)) + err, self._handle = driver.cuEventCreate(flags) + raise_if_driver_error(err) self._device_id = device_id self._ctx_handle = ctx_handle return self - def close(self): + cpdef close(self): """Destroy the event.""" - self._mnff.close() + if self._handle is not None: + err, = driver.cuEventDestroy(self._handle) + raise_if_driver_error(err) + self._handle = None + + def __del__(self): + self.close() def __isub__(self, other): return NotImplemented @@ -129,7 +126,7 @@ def __rsub__(self, other): def __sub__(self, other): # return self - other (in milliseconds) - err, timing = driver.cuEventElapsedTime(other.handle, self.handle) + err, timing = driver.cuEventElapsedTime(other.handle, self._handle) try: raise_if_driver_error(err) return timing @@ -180,12 +177,12 @@ def sync(self): has been completed. """ - handle_return(driver.cuEventSynchronize(self._mnff.handle)) + handle_return(driver.cuEventSynchronize(self._handle)) @property def is_done(self) -> bool: """Return True if all captured works have been completed, otherwise False.""" - (result,) = driver.cuEventQuery(self._mnff.handle) + result, = driver.cuEventQuery(self._handle) if result == driver.CUresult.CUDA_SUCCESS: return True if result == driver.CUresult.CUDA_ERROR_NOT_READY: @@ -201,7 +198,7 @@ def handle(self) -> cuda.bindings.driver.CUevent: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Event.handle)``. """ - return self._mnff.handle + return self._handle @property def device(self) -> Device: diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.pyx similarity index 80% rename from cuda_core/cuda/core/experimental/_stream.py rename to cuda_core/cuda/core/experimental/_stream.pyx index ea488f9fc..dc8d8e942 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -4,9 +4,13 @@ from __future__ import annotations +from cuda.core.experimental._utils.cuda_utils cimport ( + _check_driver_error as raise_if_driver_error, + check_or_create_options, +) + import os import warnings -import weakref from dataclasses import dataclass from typing import TYPE_CHECKING, Optional, Protocol, Tuple, Union @@ -18,16 +22,14 @@ from cuda.core.experimental._graph import GraphBuilder from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( - check_or_create_options, driver, get_device_from_ctx, handle_return, - runtime, ) @dataclass -class StreamOptions: +cdef class StreamOptions: """Customizable :obj:`~_stream.Stream` options. Attributes @@ -85,7 +87,7 @@ def _try_to_get_stream_ptr(obj: IsStreamT): return driver.CUstream(info[1]) -class Stream: +cdef class Stream: """Represent a queue of GPU operations that are executed in a specific order. Applications use streams to control the order of execution for @@ -103,35 +105,27 @@ class Stream: """ - class _MembersNeededForFinalize: - __slots__ = ("handle", "owner", "builtin") - - def __init__(self, stream_obj, handle, owner, builtin): - self.handle = handle - self.owner = owner - self.builtin = builtin - weakref.finalize(stream_obj, self.close) + cdef: + object _handle + object _owner + object _builtin + object _nonblocking + object _priority + object _device_id + object _ctx_handle - def close(self): - if self.owner is None: - if self.handle and not self.builtin: - handle_return(driver.cuStreamDestroy(self.handle)) - else: - self.owner = None - self.handle = None - - def __new__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): raise RuntimeError( "Stream objects cannot be instantiated directly. " "Please use Device APIs (create_stream) or other Stream APIs (from_handle)." ) - __slots__ = ("__weakref__", "_mnff", "_nonblocking", "_priority", "_device_id", "_ctx_handle") - @classmethod def _legacy_default(cls): - self = super().__new__(cls) - self._mnff = Stream._MembersNeededForFinalize(self, driver.CUstream(driver.CU_STREAM_LEGACY), None, True) + cdef Stream self = Stream.__new__(Stream) + self._handle = driver.CUstream(driver.CU_STREAM_LEGACY) + self._owner = None + self._builtin = True self._nonblocking = None # delayed self._priority = None # delayed self._device_id = None # delayed @@ -140,8 +134,10 @@ def _legacy_default(cls): @classmethod def _per_thread_default(cls): - self = super().__new__(cls) - self._mnff = Stream._MembersNeededForFinalize(self, driver.CUstream(driver.CU_STREAM_PER_THREAD), None, True) + cdef Stream self = Stream.__new__(Stream) + self._handle = driver.CUstream(driver.CU_STREAM_PER_THREAD) + self._owner = None + self._builtin = True self._nonblocking = None # delayed self._priority = None # delayed self._device_id = None # delayed @@ -149,57 +145,65 @@ def _per_thread_default(cls): return self @classmethod - def _init(cls, obj: Optional[IsStreamT] = None, *, options: Optional[StreamOptions] = None): - self = super().__new__(cls) - self._mnff = Stream._MembersNeededForFinalize(self, None, None, False) + def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None): + cdef Stream self = Stream.__new__(Stream) + self._handle = None + self._owner = None + self._builtin = False if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - self._mnff.handle = _try_to_get_stream_ptr(obj) + self._handle = _try_to_get_stream_ptr(obj) # TODO: check if obj is created under the current context/device - self._mnff.owner = obj + self._owner = obj self._nonblocking = None # delayed self._priority = None # delayed self._device_id = None # delayed self._ctx_handle = None # delayed return self - options = check_or_create_options(StreamOptions, options, "Stream options") - nonblocking = options.nonblocking - priority = options.priority + cdef StreamOptions opts = check_or_create_options(StreamOptions, options, "Stream options") + nonblocking = opts.nonblocking + priority = opts.priority flags = driver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else driver.CUstream_flags.CU_STREAM_DEFAULT - - high, low = handle_return(runtime.cudaDeviceGetStreamPriorityRange()) + err, high, low = driver.cuCtxGetStreamPriorityRange() + raise_if_driver_error(err) if priority is not None: if not (low <= priority <= high): raise ValueError(f"{priority=} is out of range {[low, high]}") else: priority = high - self._mnff.handle = handle_return(driver.cuStreamCreateWithPriority(flags, priority)) - self._mnff.owner = None + self._handle = handle_return(driver.cuStreamCreateWithPriority(flags, priority)) + self._owner = None self._nonblocking = nonblocking self._priority = priority - # don't defer this because we will have to pay a cost for context - # switch later - self._device_id = int(handle_return(driver.cuCtxGetDevice())) + self._device_id = device_id self._ctx_handle = None # delayed return self - def close(self): + def __del__(self): + self.close() + + cpdef close(self): """Destroy the stream. Destroy the stream if we own it. Borrowed foreign stream object will instead have their references released. """ - self._mnff.close() + if self._owner is None: + if self._handle and not self._builtin: + handle_return(driver.cuStreamDestroy(self._handle)) + else: + self._owner = None + self._handle = None def __cuda_stream__(self) -> Tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" - return (0, self.handle) + return (0, int(self.handle)) @property def handle(self) -> cuda.bindings.driver.CUstream: @@ -210,13 +214,13 @@ def handle(self) -> cuda.bindings.driver.CUstream: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Stream.handle)``. """ - return self._mnff.handle + return self._handle @property def is_nonblocking(self) -> bool: """Return True if this is a nonblocking stream, otherwise False.""" if self._nonblocking is None: - flag = handle_return(driver.cuStreamGetFlags(self._mnff.handle)) + flag = handle_return(driver.cuStreamGetFlags(self._handle)) if flag == driver.CUstream_flags.CU_STREAM_NON_BLOCKING: self._nonblocking = True else: @@ -227,13 +231,13 @@ def is_nonblocking(self) -> bool: def priority(self) -> int: """Return the stream priority.""" if self._priority is None: - prio = handle_return(driver.cuStreamGetPriority(self._mnff.handle)) + prio = handle_return(driver.cuStreamGetPriority(self._handle)) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" - handle_return(driver.cuStreamSynchronize(self._mnff.handle)) + handle_return(driver.cuStreamSynchronize(self._handle)) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. @@ -258,9 +262,10 @@ def record(self, event: Event = None, options: EventOptions = None) -> Event: # on the stream. Event flags such as disabling timing, nonblocking, # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. if event is None: + self._get_device_and_context() event = Event._init(self._device_id, self._ctx_handle, options) - assert_type(event, Event) - handle_return(driver.cuEventRecord(event.handle, self._mnff.handle)) + err, = driver.cuEventRecord(event.handle, self._handle) + raise_if_driver_error(err) return event def wait(self, event_or_stream: Union[Event, Stream]): @@ -281,7 +286,7 @@ def wait(self, event_or_stream: Union[Event, Stream]): stream = event_or_stream else: try: - stream = Stream._init(event_or_stream) + stream = Stream._init(obj=event_or_stream) except Exception as e: raise ValueError( "only an Event, Stream, or object supporting __cuda_stream__ can be waited," @@ -292,7 +297,7 @@ def wait(self, event_or_stream: Union[Event, Stream]): discard_event = True # TODO: support flags other than 0? - handle_return(driver.cuStreamWaitEvent(self._mnff.handle, event, 0)) + handle_return(driver.cuStreamWaitEvent(self._handle, event, 0)) if discard_event: handle_return(driver.cuEventDestroy(event)) @@ -308,21 +313,27 @@ def device(self) -> Device: """ from cuda.core.experimental._device import Device # avoid circular import + self._get_device_and_context() + return Device(self._device_id) + + cdef int _get_context(Stream self) except?-1: + if self._ctx_handle is None: + err, self._ctx_handle = driver.cuStreamGetCtx(self._handle) + raise_if_driver_error(err) + return 0 + cdef int _get_device_and_context(Stream self) except?-1: if self._device_id is None: # Get the stream context first - if self._ctx_handle is None: - self._ctx_handle = handle_return(driver.cuStreamGetCtx(self._mnff.handle)) + self._get_context() self._device_id = get_device_from_ctx(self._ctx_handle) - return Device(self._device_id) + return 0 @property def context(self) -> Context: """Return the :obj:`~_context.Context` associated with this stream.""" - if self._ctx_handle is None: - self._ctx_handle = handle_return(driver.cuStreamGetCtx(self._mnff.handle)) - if self._device_id is None: - self._device_id = get_device_from_ctx(self._ctx_handle) + self._get_context() + self._get_device_and_context() return Context._from_ctx(self._ctx_handle, self._device_id) @staticmethod diff --git a/cuda_core/cuda/core/experimental/_utils/__init__.pxd b/cuda_core/cuda/core/experimental/_utils/__init__.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd new file mode 100644 index 000000000..1dc6bd1eb --- /dev/null +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +cpdef int _check_driver_error(error) except?-1 +cpdef int _check_runtime_error(error) except?-1 +cpdef int _check_nvrtc_error(error) except?-1 +cpdef check_or_create_options(type cls, options, str options_description=*, bint keep_none=*) diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.py b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx similarity index 76% rename from cuda_core/cuda/core/experimental/_utils/cuda_utils.py rename to cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 48b48d2fb..f14addd43 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.py +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -1,12 +1,12 @@ # Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import functools import importlib.metadata from collections import namedtuple from collections.abc import Sequence -from typing import Callable, Dict +from typing import Callable try: from cuda.bindings import driver, nvrtc, runtime @@ -52,90 +52,98 @@ def _reduce_3_tuple(t: tuple): return t[0] * t[1] * t[2] -def _check_driver_error(error): - if error == driver.CUresult.CUDA_SUCCESS: - return +cdef object _DRIVER_SUCCESS = driver.CUresult.CUDA_SUCCESS +cdef object _RUNTIME_SUCCESS = runtime.cudaError_t.cudaSuccess +cdef object _NVRTC_SUCCESS = nvrtc.nvrtcResult.NVRTC_SUCCESS + + +cpdef inline int _check_driver_error(error) except?-1: + if error == _DRIVER_SUCCESS: + return 0 name_err, name = driver.cuGetErrorName(error) - if name_err != driver.CUresult.CUDA_SUCCESS: + if name_err != _DRIVER_SUCCESS: raise CUDAError(f"UNEXPECTED ERROR CODE: {error}") name = name.decode() expl = DRIVER_CU_RESULT_EXPLANATIONS.get(int(error)) if expl is not None: raise CUDAError(f"{name}: {expl}") desc_err, desc = driver.cuGetErrorString(error) - if desc_err != driver.CUresult.CUDA_SUCCESS: + if desc_err != _DRIVER_SUCCESS: raise CUDAError(f"{name}") desc = desc.decode() raise CUDAError(f"{name}: {desc}") -def _check_runtime_error(error): - if error == runtime.cudaError_t.cudaSuccess: - return +cpdef inline int _check_runtime_error(error) except?-1: + if error == _RUNTIME_SUCCESS: + return 0 name_err, name = runtime.cudaGetErrorName(error) - if name_err != runtime.cudaError_t.cudaSuccess: + if name_err != _RUNTIME_SUCCESS: raise CUDAError(f"UNEXPECTED ERROR CODE: {error}") name = name.decode() expl = RUNTIME_CUDA_ERROR_EXPLANATIONS.get(int(error)) if expl is not None: raise CUDAError(f"{name}: {expl}") desc_err, desc = runtime.cudaGetErrorString(error) - if desc_err != runtime.cudaError_t.cudaSuccess: + if desc_err != _RUNTIME_SUCCESS: raise CUDAError(f"{name}") desc = desc.decode() raise CUDAError(f"{name}: {desc}") -def _check_error(error, handle=None): +cpdef inline int _check_nvrtc_error(error, handle=None) except?-1: + if error == _NVRTC_SUCCESS: + return 0 + err = f"{error}: {nvrtc.nvrtcGetErrorString(error)[1].decode()}" + if handle is not None: + _, logsize = nvrtc.nvrtcGetProgramLogSize(handle) + log = b" " * logsize + _ = nvrtc.nvrtcGetProgramLog(handle, log) + err += f", compilation log:\n\n{log.decode('utf-8', errors='backslashreplace')}" + raise NVRTCError(err) + + +cdef inline int _check_error(error, handle=None) except?-1: if isinstance(error, driver.CUresult): - _check_driver_error(error) + return _check_driver_error(error) elif isinstance(error, runtime.cudaError_t): - _check_runtime_error(error) + return _check_runtime_error(error) elif isinstance(error, nvrtc.nvrtcResult): - if error == nvrtc.nvrtcResult.NVRTC_SUCCESS: - return - err = f"{error}: {nvrtc.nvrtcGetErrorString(error)[1].decode()}" - if handle is not None: - _, logsize = nvrtc.nvrtcGetProgramLogSize(handle) - log = b" " * logsize - _ = nvrtc.nvrtcGetProgramLog(handle, log) - err += f", compilation log:\n\n{log.decode('utf-8', errors='backslashreplace')}" - raise NVRTCError(err) + return _check_nvrtc_error(error, handle=handle) else: raise RuntimeError(f"Unknown error type: {error}") -def handle_return(result, handle=None): +def handle_return(tuple result, handle=None): _check_error(result[0], handle=handle) - if len(result) == 1: + cdef int out_len = len(result) + if out_len == 1: return - elif len(result) == 2: + elif out_len == 2: return result[1] else: return result[1:] -def check_or_create_options(cls, options, options_description, *, keep_none=False): +cpdef check_or_create_options(type cls, options, str options_description="", bint keep_none=False): """ Create the specified options dataclass from a dictionary of options or None. """ - if options is None: if keep_none: return options - options = cls() - elif isinstance(options, Dict): - options = cls(**options) - - if not isinstance(options, cls): + return cls() + elif isinstance(options, cls): + return options + elif isinstance(options, dict): + return cls(**options) + else: raise TypeError( f"The {options_description} must be provided as an object " f"of type {cls.__name__} or as a dict with valid {options_description}. " f"The provided object is '{options}'." ) - return options - def _handle_boolean_option(option: bool) -> str: """ @@ -144,7 +152,7 @@ def _handle_boolean_option(option: bool) -> str: return "true" if bool(option) else "false" -def precondition(checker: Callable[..., None], what: str = "") -> Callable: +def precondition(checker: Callable[..., None], str what="") -> Callable: """ A decorator that adds checks to ensure any preconditions are met. diff --git a/cuda_core/setup.py b/cuda_core/setup.py index f2005c3dd..26b9a5ab0 100644 --- a/cuda_core/setup.py +++ b/cuda_core/setup.py @@ -2,28 +2,32 @@ # # SPDX-License-Identifier: Apache-2.0 +import glob import os from Cython.Build import cythonize from setuptools import Extension, setup from setuptools.command.build_ext import build_ext as _build_ext -ext_modules = ( - Extension( - "cuda.core.experimental._dlpack", - sources=["cuda/core/experimental/_dlpack.pyx"], - language="c++", - ), - Extension( - "cuda.core.experimental._memoryview", - sources=["cuda/core/experimental/_memoryview.pyx"], - language="c++", - ), +# It seems setuptools' wildcard support has problems for namespace packages, +# so we explicitly spell out all Extension instances. +root_module = "cuda.core.experimental" +root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep +ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True) + + +def strip_prefix_suffix(filename): + return filename[len(root_path) : -4] + + +module_names = (strip_prefix_suffix(f) for f in ext_files) +ext_modules = tuple( Extension( - "cuda.core.experimental._kernel_arg_handler", - sources=["cuda/core/experimental/_kernel_arg_handler.pyx"], + f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", + sources=[f"cuda/core/experimental/{mod}.pyx"], language="c++", - ), + ) + for mod in module_names ) diff --git a/cuda_core/tests/test_cuda_utils.py b/cuda_core/tests/test_cuda_utils.py index 5f94e545f..77d9e457a 100644 --- a/cuda_core/tests/test_cuda_utils.py +++ b/cuda_core/tests/test_cuda_utils.py @@ -44,7 +44,7 @@ def test_check_driver_error(): num_unexpected = 0 for error in driver.CUresult: if error == driver.CUresult.CUDA_SUCCESS: - assert cuda_utils._check_driver_error(error) is None + assert cuda_utils._check_driver_error(error) == 0 else: with pytest.raises(cuda_utils.CUDAError) as e: cuda_utils._check_driver_error(error) @@ -63,7 +63,7 @@ def test_check_runtime_error(): num_unexpected = 0 for error in runtime.cudaError_t: if error == runtime.cudaError_t.cudaSuccess: - assert cuda_utils._check_runtime_error(error) is None + assert cuda_utils._check_runtime_error(error) == 0 else: with pytest.raises(cuda_utils.CUDAError) as e: cuda_utils._check_runtime_error(error) diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index a73655f1a..7a3ff8b2c 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -52,7 +52,7 @@ def test_stream_record(init_cuda): def test_stream_record_invalid_event(init_cuda): stream = Device().create_stream(options=StreamOptions()) - with pytest.raises(TypeError): + with pytest.raises(AttributeError): stream.record(event="invalid_event") @@ -80,6 +80,7 @@ def test_stream_context(init_cuda): stream = Device().create_stream(options=StreamOptions()) context = stream.context assert context is not None + assert context._handle is not None def test_stream_from_foreign_stream(init_cuda):