Skip to content

WIP: Cythonize away some perf hot spots #709

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,21 @@ class ContextOptions:
pass # TODO


class Context:
__slots__ = ("_handle", "_id")
cdef class Context:

def __new__(self, *args, **kwargs):
cdef:
readonly object _handle
int _device_id

def __init__(self, *args, **kwargs):
raise RuntimeError("Context objects cannot be instantiated directly. Please use Device or Stream APIs.")

@classmethod
def _from_ctx(cls, obj, dev_id):
assert_type(obj, driver.CUcontext)
ctx = super().__new__(cls)
ctx._handle = obj
ctx._id = dev_id
def _from_ctx(cls, handle: driver.CUcontext, int device_id):
cdef Context ctx = Context.__new__(Context)
ctx._handle = handle
ctx._device_id = device_id
return ctx

def __eq__(self, other):
return int(self._handle) == int(other._handle)
45 changes: 31 additions & 14 deletions cuda_core/cuda/core/experimental/_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
_check_driver_error,
driver,
handle_return,
precondition,
runtime,
)

Expand Down Expand Up @@ -1017,12 +1016,31 @@ def __new__(cls, device_id: Optional[int] = None):
except IndexError:
raise ValueError(f"device_id must be within [0, {len(devices)}), got {device_id}") from None

def _check_context_initialized(self, *args, **kwargs):
def _check_context_initialized(self):
if not self._has_inited:
raise CUDAError(
f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?"
)

def _get_current_context(self, check_consistency=False) -> driver.CUcontext:
err, ctx = driver.cuCtxGetCurrent()

# TODO: We want to just call this:
# _check_driver_error(err)
# but even the simplest success check causes 50-100 ns. Wait until we cythonize this file...
if ctx is None:
_check_driver_error(err)

if int(ctx) == 0:
raise CUDAError("No context is bound to the calling CPU thread.")
if check_consistency:
err, dev = driver.cuCtxGetDevice()
if err != _SUCCESS:
handle_return((err,))
if int(dev) != self._id:
raise CUDAError("Internal error (current device is not equal to Device.device_id)")
return ctx

@property
def device_id(self) -> int:
"""Return device ordinal."""
Expand Down Expand Up @@ -1083,7 +1101,6 @@ def compute_capability(self) -> ComputeCapability:
return cc

@property
@precondition(_check_context_initialized)
def context(self) -> Context:
"""Return the current :obj:`~_context.Context` associated with this device.

Expand All @@ -1092,9 +1109,8 @@ def context(self) -> Context:
Device must be initialized.

"""
ctx = handle_return(driver.cuCtxGetCurrent())
if int(ctx) == 0:
raise CUDAError("No context is bound to the calling CPU thread.")
self._check_context_initialized()
ctx = self._get_current_context(check_consistency=True)
return Context._from_ctx(ctx, self._id)

@property
Expand Down Expand Up @@ -1206,8 +1222,7 @@ def create_context(self, options: ContextOptions = None) -> Context:
"""
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189")

@precondition(_check_context_initialized)
def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions = None) -> Stream:
def create_stream(self, obj: Optional[IsStreamT] = None, options: Optional[StreamOptions] = None) -> Stream:
"""Create a Stream object.

New stream objects can be created in two different ways:
Expand Down Expand Up @@ -1235,9 +1250,9 @@ def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions
Newly created stream object.

"""
return Stream._init(obj=obj, options=options)
self._check_context_initialized()
return Stream._init(obj=obj, options=options, device_id=self._id)

@precondition(_check_context_initialized)
def create_event(self, options: Optional[EventOptions] = None) -> Event:
"""Create an Event object without recording it to a Stream.

Expand All @@ -1256,9 +1271,10 @@ def create_event(self, options: Optional[EventOptions] = None) -> Event:
Newly created event object.

"""
return Event._init(self._id, self.context._handle, options)
self._check_context_initialized()
ctx = self._get_current_context()
return Event._init(self._id, ctx, options)

@precondition(_check_context_initialized)
def allocate(self, size, stream: Optional[Stream] = None) -> Buffer:
"""Allocate device memory from a specified stream.

Expand All @@ -1285,11 +1301,11 @@ def allocate(self, size, stream: Optional[Stream] = None) -> Buffer:
Newly created buffer object.

"""
self._check_context_initialized()
if stream is None:
stream = default_stream()
return self._mr.allocate(size, stream)

@precondition(_check_context_initialized)
def sync(self):
"""Synchronize the device.

Expand All @@ -1298,9 +1314,9 @@ def sync(self):
Device must be initialized.

"""
self._check_context_initialized()
handle_return(runtime.cudaDeviceSynchronize())

@precondition(_check_context_initialized)
def create_graph_builder(self) -> GraphBuilder:
"""Create a new :obj:`~_graph.GraphBuilder` object.

Expand All @@ -1310,4 +1326,5 @@ def create_graph_builder(self) -> GraphBuilder:
Newly created graph builder object.

"""
self._check_context_initialized()
return GraphBuilder._init(stream=self.create_stream(), is_stream_owner=True)
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,28 @@

from __future__ import annotations

import weakref
from cuda.core.experimental._utils.cuda_utils cimport (
_check_driver_error as raise_if_driver_error,
check_or_create_options,
)

from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional

from cuda.core.experimental._context import Context
from cuda.core.experimental._utils.cuda_utils import (
CUDAError,
check_or_create_options,
driver,
handle_return,
)
from cuda.core.experimental._utils.cuda_utils import (
_check_driver_error as raise_if_driver_error,
)

if TYPE_CHECKING:
import cuda.bindings
from cuda.core.experimental._device import Device


@dataclass
class EventOptions:
cdef class EventOptions:
"""Customizable :obj:`~_event.Event` options.

Attributes
Expand All @@ -49,7 +49,7 @@ class EventOptions:
support_ipc: Optional[bool] = False


class Event:
cdef class Event:
"""Represent a record at a specific point of execution within a CUDA stream.

Applications can asynchronously record events at any point in
Expand Down Expand Up @@ -77,49 +77,46 @@ class Event:
and they should instead be created through a :obj:`~_stream.Stream` object.

"""

class _MembersNeededForFinalize:
__slots__ = ("handle",)

def __init__(self, event_obj, handle):
self.handle = handle
weakref.finalize(event_obj, self.close)

def close(self):
if self.handle is not None:
handle_return(driver.cuEventDestroy(self.handle))
self.handle = None

def __new__(self, *args, **kwargs):
cdef:
object _handle
bint _timing_disabled
bint _busy_waited
int _device_id
object _ctx_handle

def __init__(self, *args, **kwargs):
raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).")

__slots__ = ("__weakref__", "_mnff", "_timing_disabled", "_busy_waited", "_device_id", "_ctx_handle")

@classmethod
def _init(cls, device_id: int, ctx_handle: Context, options: Optional[EventOptions] = None):
self = super().__new__(cls)
self._mnff = Event._MembersNeededForFinalize(self, None)

options = check_or_create_options(EventOptions, options, "Event options")
def _init(cls, device_id: int, ctx_handle: Context, options=None):
cdef Event self = Event.__new__(Event)
cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options")
flags = 0x0
self._timing_disabled = False
self._busy_waited = False
if not options.enable_timing:
if not opts.enable_timing:
flags |= driver.CUevent_flags.CU_EVENT_DISABLE_TIMING
self._timing_disabled = True
if options.busy_waited_sync:
if opts.busy_waited_sync:
flags |= driver.CUevent_flags.CU_EVENT_BLOCKING_SYNC
self._busy_waited = True
if options.support_ipc:
if opts.support_ipc:
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")
self._mnff.handle = handle_return(driver.cuEventCreate(flags))
err, self._handle = driver.cuEventCreate(flags)
raise_if_driver_error(err)
self._device_id = device_id
self._ctx_handle = ctx_handle
return self

def close(self):
cpdef close(self):
"""Destroy the event."""
self._mnff.close()
if self._handle is not None:
err, = driver.cuEventDestroy(self._handle)
raise_if_driver_error(err)
self._handle = None

def __del__(self):
self.close()

def __isub__(self, other):
return NotImplemented
Expand All @@ -129,7 +126,7 @@ def __rsub__(self, other):

def __sub__(self, other):
# return self - other (in milliseconds)
err, timing = driver.cuEventElapsedTime(other.handle, self.handle)
err, timing = driver.cuEventElapsedTime(other.handle, self._handle)
try:
raise_if_driver_error(err)
return timing
Expand Down Expand Up @@ -180,12 +177,12 @@ def sync(self):
has been completed.

"""
handle_return(driver.cuEventSynchronize(self._mnff.handle))
handle_return(driver.cuEventSynchronize(self._handle))

@property
def is_done(self) -> bool:
"""Return True if all captured works have been completed, otherwise False."""
(result,) = driver.cuEventQuery(self._mnff.handle)
result, = driver.cuEventQuery(self._handle)
if result == driver.CUresult.CUDA_SUCCESS:
return True
if result == driver.CUresult.CUDA_ERROR_NOT_READY:
Expand All @@ -201,7 +198,7 @@ def handle(self) -> cuda.bindings.driver.CUevent:
This handle is a Python object. To get the memory address of the underlying C
handle, call ``int(Event.handle)``.
"""
return self._mnff.handle
return self._handle

@property
def device(self) -> Device:
Expand Down
Loading
Loading