
[FEA]: Implement cuda::kernel_ref and cuda::function_ref #2993

Open

davebayer opened this issue Dec 1, 2024 · 6 comments
Labels
feature request New feature or request.

Comments

@davebayer
Contributor

davebayer commented Dec 1, 2024

Is this a duplicate?

Area

CUDA Experimental (cudax)

Is your feature request related to a problem? Please describe.

CUDAX already provides C++ wrappers for streams, devices, events, and other CUDA objects, but it does not yet provide wrappers for kernels and functions.

Describe the solution you'd like

I'd like to expand the API with wrappers for CUkernel and CUfunction, implemented as kernel_ref and function_ref. Both are non-owning references to the underlying handles from the CUDA Driver API.

#if CUDA_VERSION >= 12000

namespace cuda::experimental
{
  template<class>
  class kernel_ref;

  template<class... _Args>
  class kernel_ref<void(_Args...)>
  {
  public:
    using value_type = ::cudaKernel_t;

    kernel_ref(_CUDA_VSTD::nullptr_t) = delete; // Delete construction from nullptr

    constexpr kernel_ref(value_type __kernel) noexcept
    {
      // Set __kernel_ to __kernel
    }

    kernel_ref(void (*__entry_func_address)(_Args...))
    {
      // Get __kernel_ from __entry_func_address via cudaGetKernel
    }

    _CCCL_NODISCARD ::std::string_view get_name() const
    {
      // Get __kernel_ name via cuKernelGetName
    }

    _CCCL_NODISCARD constexpr value_type get() const noexcept
    {
      // Get __kernel_ value
    }

    _CCCL_NODISCARD CUlibrary get_library() const
    {
      // Get __kernel_'s library via cuKernelGetLibrary
    }

    template<class _Attr>
    _CCCL_NODISCARD auto get_attr(const _Attr& __attr, device_ref __dev) const
    {
      // Get __attr value for __dev via cuKernelGetAttribute
    }

    template<class _Attr, class _Value>
    void set_attr(const _Attr& __attr, _Value&& __value, device_ref __dev) const
    {
      // Check __value type
      // Set __attr __value for __dev via cuKernelSetAttribute
    }

    template<class _CacheConfig>
    void set_cache_config(_CacheConfig __cacheConfig, device_ref __dev) const
    {
      // Set __cacheConfig for __dev via cuKernelSetCacheConfig
    }

    _CCCL_NODISCARD_FRIEND constexpr bool operator==(kernel_ref __lhs, kernel_ref __rhs) noexcept
    {
      // Compare __lhs.__kernel_ and __rhs.__kernel_ for equality
    }
  private:
    value_type __kernel_{};
  };
} // namespace cuda::experimental

#endif // CUDA_VERSION >= 12000

namespace cuda::experimental
{
  template<class>
  class function_ref;

  template<class... _Args>
  class function_ref<void(_Args...)>
  {
  public:
    using value_type = ::cudaFunction_t;

    function_ref(_CUDA_VSTD::nullptr_t) = delete; // Delete construction from nullptr

    constexpr function_ref(value_type __function) noexcept
    {
      // Set __function_ to __function
    }

#if CUDA_VERSION >= 12000
    function_ref(kernel_ref<void(_Args...)> __kernel)
    {
      // Obtain __function_ for the current context from __kernel
    }

    function_ref(kernel_ref<void(_Args...)> __kernel, CUcontext __context)
    {
      // Obtain __function_ for __context from __kernel via cuKernelGetFunction
    }
#endif // CUDA_VERSION >= 12000

    _CCCL_NODISCARD ::std::string_view get_name() const
    {
      // Get __function_ name via cuFuncGetName
    }

    _CCCL_NODISCARD constexpr value_type get() const noexcept
    {
      // Get __function_ value
    }

    _CCCL_NODISCARD CUmodule get_module() const
    {
      // Get __function_'s module via cuFuncGetModule
    }

    template<class _Attr>
    _CCCL_NODISCARD auto get_attr(const _Attr& __attr) const
    {
      // Get __attr value via cuFuncGetAttribute
    }

    template<class _Attr, class _Value>
    void set_attr(const _Attr& __attr, _Value&& __value) const
    {
      // Check __value type
      // Set __attr __value via cuFuncSetAttribute
    }

    template<class _CacheConfig>
    void set_cache_config(_CacheConfig __cacheConfig) const
    {
      // Set __cacheConfig via cuFuncSetCacheConfig
    }

    _CCCL_NODISCARD bool is_loaded() const
    {
      // Check if __function_ is loaded via cuFuncIsLoaded
    }

    void load() const
    {
      // Load __function_ via cuFuncLoad
    }

    _CCCL_NODISCARD_FRIEND constexpr bool operator==(function_ref __lhs, function_ref __rhs) noexcept
    {
      // Compare __lhs.__function_ and __rhs.__function_ for equality
    }
  private:
    value_type __function_{};
  };
} // namespace cuda::experimental

#if CUDA_VERSION >= 12000
template <class... ExpArgs, class... ActArgs, class... Levels>
void launch(
  ::cuda::stream_ref stream, const hierarchy_dimensions<Levels...>& dims, ::cuda::kernel_ref<void(ExpArgs...)> kernel, ActArgs&&... args)
{
  // Obtain the CUfunction for stream's context via cuKernelGetFunction
  // Launch function in stream using the provided dimensions and arguments via cuLaunchKernel
}
#endif // CUDA_VERSION >= 12000

template <class... ExpArgs, class... ActArgs, class... Levels>
void launch(
  ::cuda::stream_ref stream, const hierarchy_dimensions<Levels...>& dims, ::cuda::function_ref<void(ExpArgs...)> function, ActArgs&&... args)
{
  // Launch function in stream using the provided dimensions and arguments via cuLaunchKernel
}

In the future, the return values of kernel_ref::get_library() and function_ref::get_module() could be replaced with library_ref and module_ref, if those are implemented.
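
To make the intended ergonomics concrete, here is a hypothetical usage sketch of the proposed kernel_ref. None of these types exist yet; saxpy and the queried values are purely illustrative:

```cpp
// Hypothetical usage of the proposed kernel_ref (type does not exist yet).
__global__ void saxpy(float a, const float* x, float* y) { /* ... */ }

void inspect()
{
  // Non-owning reference, constructed from the __global__ entry point
  // (internally via cudaGetKernel).
  cuda::experimental::kernel_ref<void(float, const float*, float*)> k{saxpy};

  // Query the name and the owning library through the driver API.
  ::std::string_view name = k.get_name();    // cuKernelGetName
  CUlibrary lib           = k.get_library(); // cuKernelGetLibrary

  // The underlying handle stays accessible for raw driver calls.
  ::cudaKernel_t handle = k.get();
}
```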

Describe alternatives you've considered

No response

Additional context

No response

@davebayer davebayer added the feature request New feature or request. label Dec 1, 2024
@github-project-automation github-project-automation bot moved this to Todo in CCCL Dec 1, 2024
@bernhardmgruber
Contributor

I like the suggestion. @pciolkosz is also working on wrapping some of the driver APIs, so let's see what he thinks of the suggestion.

@pciolkosz
Contributor

We will definitely need something like this for dynamic loading. I'm not sure yet if we want only kernel/library or function/module as well.

@davebayer
Contributor Author

In case we want functions and modules as well, I was wondering whether it would be a good idea to rename cuda::function_ref to something like cuda::context_kernel_ref, because naming it a "function" is quite confusing: that term means something entirely different in C++.

@lamarrr

lamarrr commented Dec 8, 2024

I'd like to take on this, cc: @jrhemstad

@davebayer
Contributor Author

davebayer commented Dec 8, 2024

I'd like to take on this, cc: @jrhemstad

I have the implementation almost finalized; it is only missing the overloads for launch(...) and tests. See my fork here.

If you want to finalize it yourself, feel free to copy the branch :)

@pciolkosz
Contributor

@davebayer I took a quick look at the linked branch. I asked around and thought about function/module, and I think we should start without them and see if anyone complains.
The general direction in CUDA is to move away from contexts; this is why library/kernel was introduced. The plan for cudax is to follow the model of the current CUDA Runtime, where the context is implicit and we operate on devices instead, unless we see a very strong case that convinces us otherwise.
In that context (I don't mean a CUDA context here), I think it makes sense to start without the constructs that kernel/library replaced. If we see a strong motivating case where they are still needed, we can reconsider that decision.

Another thing I wanted to improve is kernel attributes. I have seen multiple bugs resulting from races between different threads updating the shared-memory-size attributes. Ideally these would be read-only and, when a non-default value is needed, set as part of a kernel configuration instead. But that would require changes to cudaLaunchConfig_t, so for now maybe an attribute setter is fine and we can remove it later once the launch options are available?
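
The race described above can be illustrated with today's runtime API; this is a minimal sketch (kern, thread_a, and thread_b are hypothetical names, and running it requires a CUDA-capable device). cudaFuncSetAttribute mutates per-function state shared by the whole process, so two threads that want different dynamic shared memory sizes can interleave badly:

```cpp
#include <cuda_runtime.h>

__global__ void kern() { /* uses dynamic shared memory */ }

void thread_a()
{
  // Thread A opts in to 64 KiB of dynamic shared memory for its launch.
  cudaFuncSetAttribute(kern, cudaFuncAttributeMaxDynamicSharedMemorySize, 64 * 1024);
  kern<<<1, 256, 64 * 1024>>>();
}

void thread_b()
{
  // Thread B concurrently lowers it to 32 KiB. If B's set lands between
  // A's set and A's launch, A's launch fails: the attribute is
  // process-wide state on the function, not a per-launch option.
  cudaFuncSetAttribute(kern, cudaFuncAttributeMaxDynamicSharedMemorySize, 32 * 1024);
  kern<<<1, 256, 32 * 1024>>>();
}
```

Making the shared memory size part of the launch configuration, as suggested above, removes this class of race by construction.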
