Skip to content

Cythonize away some perf hot spots #709

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jul 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,21 @@ class ContextOptions:
pass # TODO


class Context:
__slots__ = ("_handle", "_id")
cdef class Context:

def __new__(self, *args, **kwargs):
cdef:
readonly object _handle
int _device_id

def __init__(self, *args, **kwargs):
raise RuntimeError("Context objects cannot be instantiated directly. Please use Device or Stream APIs.")

@classmethod
def _from_ctx(cls, obj, dev_id):
assert_type(obj, driver.CUcontext)
ctx = super().__new__(cls)
ctx._handle = obj
ctx._id = dev_id
def _from_ctx(cls, handle: driver.CUcontext, int device_id):
cdef Context ctx = Context.__new__(Context)
ctx._handle = handle
ctx._device_id = device_id
return ctx

def __eq__(self, other):
return int(self._handle) == int(other._handle)
45 changes: 31 additions & 14 deletions cuda_core/cuda/core/experimental/_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
_check_driver_error,
driver,
handle_return,
precondition,
runtime,
)

Expand Down Expand Up @@ -1017,12 +1016,31 @@ def __new__(cls, device_id: Optional[int] = None):
except IndexError:
raise ValueError(f"device_id must be within [0, {len(devices)}), got {device_id}") from None

def _check_context_initialized(self, *args, **kwargs):
def _check_context_initialized(self):
if not self._has_inited:
raise CUDAError(
f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?"
)

def _get_current_context(self, check_consistency=False) -> driver.CUcontext:
err, ctx = driver.cuCtxGetCurrent()

# TODO: We want to just call this:
# _check_driver_error(err)
# but even the simplest success check causes 50-100 ns. Wait until we cythonize this file...
if ctx is None:
_check_driver_error(err)

if int(ctx) == 0:
raise CUDAError("No context is bound to the calling CPU thread.")
if check_consistency:
err, dev = driver.cuCtxGetDevice()
if err != _SUCCESS:
handle_return((err,))
if int(dev) != self._id:
raise CUDAError("Internal error (current device is not equal to Device.device_id)")
return ctx

@property
def device_id(self) -> int:
"""Return device ordinal."""
Expand Down Expand Up @@ -1083,7 +1101,6 @@ def compute_capability(self) -> ComputeCapability:
return cc

@property
@precondition(_check_context_initialized)
def context(self) -> Context:
"""Return the current :obj:`~_context.Context` associated with this device.

Expand All @@ -1092,9 +1109,8 @@ def context(self) -> Context:
Device must be initialized.

"""
ctx = handle_return(driver.cuCtxGetCurrent())
if int(ctx) == 0:
raise CUDAError("No context is bound to the calling CPU thread.")
self._check_context_initialized()
ctx = self._get_current_context(check_consistency=True)
return Context._from_ctx(ctx, self._id)

@property
Expand Down Expand Up @@ -1206,8 +1222,7 @@ def create_context(self, options: ContextOptions = None) -> Context:
"""
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189")

@precondition(_check_context_initialized)
def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions = None) -> Stream:
def create_stream(self, obj: Optional[IsStreamT] = None, options: Optional[StreamOptions] = None) -> Stream:
"""Create a Stream object.

New stream objects can be created in two different ways:
Expand Down Expand Up @@ -1235,9 +1250,9 @@ def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions
Newly created stream object.

"""
return Stream._init(obj=obj, options=options)
self._check_context_initialized()
return Stream._init(obj=obj, options=options, device_id=self._id)

@precondition(_check_context_initialized)
def create_event(self, options: Optional[EventOptions] = None) -> Event:
"""Create an Event object without recording it to a Stream.

Expand All @@ -1256,9 +1271,10 @@ def create_event(self, options: Optional[EventOptions] = None) -> Event:
Newly created event object.

"""
return Event._init(self._id, self.context._handle, options)
self._check_context_initialized()
ctx = self._get_current_context()
return Event._init(self._id, ctx, options)

@precondition(_check_context_initialized)
def allocate(self, size, stream: Optional[Stream] = None) -> Buffer:
"""Allocate device memory from a specified stream.

Expand All @@ -1285,11 +1301,11 @@ def allocate(self, size, stream: Optional[Stream] = None) -> Buffer:
Newly created buffer object.

"""
self._check_context_initialized()
if stream is None:
stream = default_stream()
return self._mr.allocate(size, stream)

@precondition(_check_context_initialized)
def sync(self):
"""Synchronize the device.

Expand All @@ -1298,9 +1314,9 @@ def sync(self):
Device must be initialized.

"""
self._check_context_initialized()
handle_return(runtime.cudaDeviceSynchronize())

@precondition(_check_context_initialized)
def create_graph_builder(self) -> GraphBuilder:
"""Create a new :obj:`~_graph.GraphBuilder` object.

Expand All @@ -1310,4 +1326,5 @@ def create_graph_builder(self) -> GraphBuilder:
Newly created graph builder object.

"""
self._check_context_initialized()
return GraphBuilder._init(stream=self.create_stream(), is_stream_owner=True)
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,28 @@

from __future__ import annotations

import weakref
from cuda.core.experimental._utils.cuda_utils cimport (
_check_driver_error as raise_if_driver_error,
check_or_create_options,
)

from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional

from cuda.core.experimental._context import Context
from cuda.core.experimental._utils.cuda_utils import (
CUDAError,
check_or_create_options,
driver,
handle_return,
)
from cuda.core.experimental._utils.cuda_utils import (
_check_driver_error as raise_if_driver_error,
)

if TYPE_CHECKING:
import cuda.bindings
from cuda.core.experimental._device import Device


@dataclass
class EventOptions:
cdef class EventOptions:
"""Customizable :obj:`~_event.Event` options.

Attributes
Expand All @@ -49,7 +49,7 @@ class EventOptions:
support_ipc: Optional[bool] = False


class Event:
cdef class Event:
"""Represent a record at a specific point of execution within a CUDA stream.

Applications can asynchronously record events at any point in
Expand Down Expand Up @@ -77,49 +77,46 @@ class Event:
and they should instead be created through a :obj:`~_stream.Stream` object.

"""

class _MembersNeededForFinalize:
__slots__ = ("handle",)

def __init__(self, event_obj, handle):
self.handle = handle
weakref.finalize(event_obj, self.close)

def close(self):
if self.handle is not None:
handle_return(driver.cuEventDestroy(self.handle))
self.handle = None

def __new__(self, *args, **kwargs):
cdef:
object _handle
bint _timing_disabled
bint _busy_waited
int _device_id
object _ctx_handle

def __init__(self, *args, **kwargs):
raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).")

__slots__ = ("__weakref__", "_mnff", "_timing_disabled", "_busy_waited", "_device_id", "_ctx_handle")

@classmethod
def _init(cls, device_id: int, ctx_handle: Context, options: Optional[EventOptions] = None):
self = super().__new__(cls)
self._mnff = Event._MembersNeededForFinalize(self, None)

options = check_or_create_options(EventOptions, options, "Event options")
def _init(cls, device_id: int, ctx_handle: Context, options=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since Event contains native class members, perhaps adding __cinit__ to initialize them is appropriate. Something like

    def __cinit__(self):
        self._timing_disabled = False
        self._busy_waited = False
        self._device_id = -1

I also think it would be safe to set object class members to None.

This would ensure that Event.__new__(Event) would return an initialized struct.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think Cython sets everything to None for us, but it'd be good to verify this indeed

Cython additionally takes responsibility of setting all object attributes to None,

https://cython.readthedocs.io/en/latest/src/userguide/special_methods.html#initialisation-methods-cinit-and-init

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, let's leave object members out. Should I push adding Event.__cinit__ ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the same section says all members are zero/null initialized?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but is it appropriate to zero initialize _device_id? Perhaps it does not matter much.

cdef Event self = Event.__new__(Event)
cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options")
flags = 0x0
self._timing_disabled = False
self._busy_waited = False
if not options.enable_timing:
if not opts.enable_timing:
flags |= driver.CUevent_flags.CU_EVENT_DISABLE_TIMING
self._timing_disabled = True
if options.busy_waited_sync:
if opts.busy_waited_sync:
flags |= driver.CUevent_flags.CU_EVENT_BLOCKING_SYNC
self._busy_waited = True
if options.support_ipc:
if opts.support_ipc:
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")
self._mnff.handle = handle_return(driver.cuEventCreate(flags))
err, self._handle = driver.cuEventCreate(flags)
raise_if_driver_error(err)
self._device_id = device_id
self._ctx_handle = ctx_handle
return self

def close(self):
cpdef close(self):
"""Destroy the event."""
self._mnff.close()
if self._handle is not None:
err, = driver.cuEventDestroy(self._handle)
self._handle = None
raise_if_driver_error(err)

def __del__(self):
self.close()

def __isub__(self, other):
return NotImplemented
Expand All @@ -129,7 +126,7 @@ def __rsub__(self, other):

def __sub__(self, other):
# return self - other (in milliseconds)
err, timing = driver.cuEventElapsedTime(other.handle, self.handle)
err, timing = driver.cuEventElapsedTime(other.handle, self._handle)
try:
raise_if_driver_error(err)
return timing
Expand Down Expand Up @@ -180,12 +177,12 @@ def sync(self):
has been completed.

"""
handle_return(driver.cuEventSynchronize(self._mnff.handle))
handle_return(driver.cuEventSynchronize(self._handle))

@property
def is_done(self) -> bool:
"""Return True if all captured works have been completed, otherwise False."""
(result,) = driver.cuEventQuery(self._mnff.handle)
result, = driver.cuEventQuery(self._handle)
if result == driver.CUresult.CUDA_SUCCESS:
return True
if result == driver.CUresult.CUDA_ERROR_NOT_READY:
Expand All @@ -201,7 +198,7 @@ def handle(self) -> cuda.bindings.driver.CUevent:
This handle is a Python object. To get the memory address of the underlying C
handle, call ``int(Event.handle)``.
"""
return self._mnff.handle
return self._handle

@property
def device(self) -> Device:
Expand Down
Loading
Loading