[CUDAX] add `__launch_transform` to transform arguments to `cudax::launch` prior to launching the kernel (#2202)

* add `__launch_transform` to transform arguments to `cudax::launch` prior to launching the kernel
ericniebler authored Aug 7, 2024
1 parent 62336ad commit 47b8f5c
Showing 5 changed files with 223 additions and 32 deletions.
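In short, this commit adds an ADL customization point to `cudax::launch`: every argument is passed through `detail::__launch_transform(stream, arg)` before launch, and the result is converted to `as_kernel_arg_t<Arg>`, the type the kernel actually receives. A type opts in by providing a `__cudax_launch_transform` overload findable by ADL; the returned object may name the kernel-facing type via a nested `__as_kernel_arg` typedef and stays alive until the kernel has been enqueued. A minimal sketch of the opt-in, modeled on the test added at the bottom of this commit (`pinned_counter`, `mapped`, and the kernel `bump` are hypothetical names, not part of cudax):

// Illustrative sketch, not part of the commit.
#include <cuda/stream_ref>

#include <cuda/experimental/launch.cuh>

__global__ void bump(int* p)
{
  ++*p;
}

struct pinned_counter
{
  int* ptr_;

  struct mapped
  {
    int* ptr_;

    // cudax::launch passes an int* to the kernel...
    using __as_kernel_arg = int*;

    // ...obtained through this conversion.
    explicit operator int*() const noexcept
    {
      return ptr_;
    }
  };

  // Found by ADL; runs before the launch, and the returned object is
  // destroyed only after the kernel has been enqueued on the stream.
  friend mapped __cudax_launch_transform(::cuda::stream_ref, pinned_counter self) noexcept
  {
    return mapped{self.ptr_};
  }
};

// Usage: cuda::experimental::launch(stream, config, bump, pinned_counter{ptr});
// bump() still receives a plain int*.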
20 changes: 20 additions & 0 deletions cudax/include/cuda/experimental/__detail/utility.cuh
@@ -11,8 +11,28 @@
 #ifndef __CUDAX_DETAIL_UTILITY_H
 #define __CUDAX_DETAIL_UTILITY_H
 
+#include <cuda/__cccl_config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
 namespace cuda::experimental
 {
+namespace detail
+{
+struct __ignore
+{
+  template <typename... Args>
+  _CCCL_HOST_DEVICE constexpr __ignore(Args&&...) noexcept
+  {}
+};
+} // namespace detail
+
 struct uninit_t
 {
   explicit uninit_t() = default;
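The new `detail::__ignore` type exists for overload ranking: since its converting constructor accepts anything, an overload taking `__ignore` is viable for every call but loses to any overload whose first parameter matches the argument exactly. This is what lets a user-provided `__cudax_launch_transform(stream_ref, T)` outrank the identity fallback declared in `launch_transform.cuh` below. A standalone illustration of the trick (all names here are expository stand-ins):

// Illustrative sketch, not part of the commit.
#include <cstdio>

struct ignore_t // stands in for cuda::experimental::detail::__ignore
{
  template <typename... Args>
  constexpr ignore_t(Args&&...) noexcept
  {}
};

struct stream_ref_t // stands in for ::cuda::stream_ref
{};

const char* pick(ignore_t, int) { return "fallback"; }
const char* pick(stream_ref_t, int) { return "customized"; }

int main()
{
  std::printf("%s\n", pick(stream_ref_t{}, 0)); // "customized": exact match wins
  std::printf("%s\n", pick(3.14, 0)); // "fallback": only ignore_t accepts a double
}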
94 changes: 63 additions & 31 deletions cudax/include/cuda/experimental/__launch/launch.cuh
Expand Up @@ -16,6 +16,7 @@
#include <cuda/stream_ref>

#include <cuda/experimental/__launch/configuration.cuh>
#include <cuda/experimental/__launch/launch_transform.cuh>
#include <cuda/experimental/__utility/ensure_current_device.cuh>

#if _CCCL_STD_VER >= 2017
@@ -120,18 +121,25 @@ template <typename... Args, typename... Config, typename Dimensions, typename Kernel>
 void launch(
   ::cuda::stream_ref stream, const kernel_config<Dimensions, Config...>& conf, const Kernel& kernel, Args... args)
 {
-  [[maybe_unused]] __ensure_current_device __dev_setter(stream);
+  __ensure_current_device __dev_setter(stream);
   cudaError_t status;
-  if constexpr (::cuda::std::is_invocable_v<Kernel, kernel_config<Dimensions, Config...>, Args...>)
+  if constexpr (::cuda::std::is_invocable_v<Kernel, kernel_config<Dimensions, Config...>, as_kernel_arg_t<Args>...>)
   {
-    auto launcher = detail::kernel_launcher<kernel_config<Dimensions, Config...>, Kernel, Args...>;
-    status = detail::launch_impl(stream, conf, launcher, conf, kernel, args...);
+    auto launcher = detail::kernel_launcher<kernel_config<Dimensions, Config...>, Kernel, as_kernel_arg_t<Args>...>;
+    status = detail::launch_impl(
+      stream,
+      conf,
+      launcher,
+      conf,
+      kernel,
+      static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, args))...);
   }
   else
   {
-    static_assert(::cuda::std::is_invocable_v<Kernel, Args...>);
-    auto launcher = detail::kernel_launcher_no_config<Kernel, Args...>;
-    status = detail::launch_impl(stream, conf, launcher, kernel, args...);
+    static_assert(::cuda::std::is_invocable_v<Kernel, as_kernel_arg_t<Args>...>);
+    auto launcher = detail::kernel_launcher_no_config<Kernel, as_kernel_arg_t<Args>...>;
+    status = detail::launch_impl(
+      stream, conf, launcher, kernel, static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, args))...);
   }
   if (status != cudaSuccess)
   {
@@ -183,18 +191,29 @@ void launch(
 template <typename... Args, typename... Levels, typename Kernel>
 void launch(::cuda::stream_ref stream, const hierarchy_dimensions<Levels...>& dims, const Kernel& kernel, Args... args)
 {
-  [[maybe_unused]] __ensure_current_device __dev_setter(stream);
+  __ensure_current_device __dev_setter(stream);
   cudaError_t status;
-  if constexpr (::cuda::std::is_invocable_v<Kernel, hierarchy_dimensions<Levels...>, Args...>)
+  if constexpr (::cuda::std::is_invocable_v<Kernel, hierarchy_dimensions<Levels...>, as_kernel_arg_t<Args>...>)
   {
-    auto launcher = detail::kernel_launcher<hierarchy_dimensions<Levels...>, Kernel, Args...>;
-    status = detail::launch_impl(stream, kernel_config(dims), launcher, dims, kernel, args...);
+    auto launcher = detail::kernel_launcher<hierarchy_dimensions<Levels...>, Kernel, as_kernel_arg_t<Args>...>;
+    status = detail::launch_impl(
+      stream,
+      kernel_config(dims),
+      launcher,
+      dims,
+      kernel,
+      static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, args))...);
   }
   else
   {
-    static_assert(::cuda::std::is_invocable_v<Kernel, Args...>);
-    auto launcher = detail::kernel_launcher_no_config<Kernel, Args...>;
-    status = detail::launch_impl(stream, kernel_config(dims), launcher, kernel, args...);
+    static_assert(::cuda::std::is_invocable_v<Kernel, as_kernel_arg_t<Args>...>);
+    auto launcher = detail::kernel_launcher_no_config<Kernel, as_kernel_arg_t<Args>...>;
+    status = detail::launch_impl(
+      stream,
+      kernel_config(dims),
+      launcher,
+      kernel,
+      static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, args))...);
   }
   if (status != cudaSuccess)
   {
@@ -248,10 +267,14 @@ void launch(::cuda::stream_ref stream,
             void (*kernel)(kernel_config<Dimensions, Config...>, ExpArgs...),
             ActArgs&&... args)
 {
-  [[maybe_unused]] __ensure_current_device __dev_setter(stream);
-  cudaError_t status = [&](ExpArgs... args) {
-    return detail::launch_impl(stream, conf, kernel, conf, args...);
-  }(std::forward<ActArgs>(args)...);
+  __ensure_current_device __dev_setter(stream);
+  cudaError_t status = detail::launch_impl(
+    stream, //
+    conf,
+    kernel,
+    conf,
+    static_cast<as_kernel_arg_t<ActArgs>>(detail::__launch_transform(stream, std::forward<ActArgs>(args)))...);
+
   if (status != cudaSuccess)
   {
     ::cuda::__throw_cuda_error(status, "Failed to launch a kernel");
@@ -303,10 +326,14 @@ void launch(::cuda::stream_ref stream,
             void (*kernel)(hierarchy_dimensions<Levels...>, ExpArgs...),
             ActArgs&&... args)
 {
-  [[maybe_unused]] __ensure_current_device __dev_setter(stream);
-  cudaError_t status = [&](ExpArgs... args) {
-    return detail::launch_impl(stream, kernel_config(dims), kernel, dims, args...);
-  }(std::forward<ActArgs>(args)...);
+  __ensure_current_device __dev_setter(stream);
+  cudaError_t status = detail::launch_impl(
+    stream,
+    kernel_config(dims),
+    kernel,
+    dims,
+    static_cast<as_kernel_arg_t<ActArgs>>(detail::__launch_transform(stream, std::forward<ActArgs>(args)))...);
+
   if (status != cudaSuccess)
   {
     ::cuda::__throw_cuda_error(status, "Failed to launch a kernel");
@@ -320,7 +347,6 @@ void launch(::cuda::stream_ref stream,
  * Kernel function is a function with __global__ annotation.
  * Function might or might not accept the configuration as its first argument.
  *
- *
  * @par Snippet
  * @code
  * #include <cstdio>
@@ -359,10 +385,13 @@ void launch(::cuda::stream_ref stream,
             void (*kernel)(ExpArgs...),
             ActArgs&&... args)
 {
-  [[maybe_unused]] __ensure_current_device __dev_setter(stream);
-  cudaError_t status = [&](ExpArgs... args) {
-    return detail::launch_impl(stream, conf, kernel, args...);
-  }(std::forward<ActArgs>(args)...);
+  __ensure_current_device __dev_setter(stream);
+  cudaError_t status = detail::launch_impl(
+    stream, //
+    conf,
+    kernel,
+    static_cast<as_kernel_arg_t<ActArgs>>(detail::__launch_transform(stream, std::forward<ActArgs>(args)))...);
+
   if (status != cudaSuccess)
   {
     ::cuda::__throw_cuda_error(status, "Failed to launch a kernel");
@@ -412,10 +441,13 @@ template <typename... ExpArgs, typename... ActArgs, typename... Levels>
 void launch(
   ::cuda::stream_ref stream, const hierarchy_dimensions<Levels...>& dims, void (*kernel)(ExpArgs...), ActArgs&&... args)
 {
-  [[maybe_unused]] __ensure_current_device __dev_setter(stream);
-  cudaError_t status = [&](ExpArgs... args) {
-    return detail::launch_impl(stream, kernel_config(dims), kernel, args...);
-  }(std::forward<ActArgs>(args)...);
+  __ensure_current_device __dev_setter(stream);
+  cudaError_t status = detail::launch_impl(
+    stream,
+    kernel_config(dims),
+    kernel,
+    static_cast<as_kernel_arg_t<ActArgs>>(detail::__launch_transform(stream, std::forward<ActArgs>(args)))...);
+
   if (status != cudaSuccess)
   {
     ::cuda::__throw_cuda_error(status, "Failed to launch a kernel");
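Throughout these overloads the pattern is the same: `static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, args))...` materializes each transform result as a temporary of the full expression that calls `detail::launch_impl`, so the result lives until the kernel has been enqueued, and the cast invokes its conversion to the kernel-facing type. A plain C++17 sketch of that lifetime guarantee (all names hypothetical):

// Illustrative sketch, not part of the commit.
#include <cstdio>

struct proxy
{
  // Immovable, like the proxies __launch_transform may return; guaranteed
  // copy elision (C++17) still lets transform() return it by value.
  proxy() = default;
  proxy(proxy&&) = delete;

  ~proxy()
  {
    std::puts("2. proxy destroyed, after the enqueue");
  }

  explicit operator int() const
  {
    return 42;
  }
};

proxy transform(int)
{
  return proxy{};
}

void enqueue(int v) // stands in for detail::launch_impl
{
  std::printf("1. enqueued with %d\n", v);
}

int main()
{
  // The proxy temporary lives until the end of the full expression,
  // i.e. until enqueue() has returned.
  enqueue(static_cast<int>(transform(0)));
}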
83 changes: 83 additions & 0 deletions cudax/include/cuda/experimental/__launch/launch_transform.cuh
@@ -0,0 +1,83 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDAX__LAUNCH_LAUNCH_TRANSFORM
#define _CUDAX__LAUNCH_LAUNCH_TRANSFORM

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__type_traits/decay.h>
#include <cuda/std/utility>
#include <cuda/stream_ref>

#include <cuda/experimental/__detail/utility.cuh>

#if _CCCL_STD_VER >= 2017
namespace cuda::experimental
{
namespace detail
{
// Types should define overloads of __cudax_launch_transform that are find-able
// by ADL in order to customize how cudax::launch handles that type. The
// overload below, which simply returns the argument unmodified, is the overload
// that gets chosen if no other overload matches. It takes __ignore as the first
// argument to make this overload less preferred than other overloads that take
// a stream_ref as the first argument.
template <typename _Arg>
_CCCL_NODISCARD constexpr _Arg&& __cudax_launch_transform(__ignore, _Arg&& __arg) noexcept
{
  return _CUDA_VSTD::forward<_Arg>(__arg);
}

template <typename _Arg>
using __launch_transform_direct_result_t =
  decltype(__cudax_launch_transform(::cuda::stream_ref{}, _CUDA_VSTD::declval<_Arg>()));

struct __fn
{
  template <typename _Arg>
  _CCCL_NODISCARD __launch_transform_direct_result_t<_Arg> operator()(::cuda::stream_ref __stream, _Arg&& __arg) const
  {
    // This call is unqualified to allow ADL
    return __cudax_launch_transform(__stream, _CUDA_VSTD::forward<_Arg>(__arg));
  }
};

template <typename _Arg, typename _Enable = void>
struct __as_kernel_arg
{
  using type = _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>;
};

template <typename _Arg>
struct __as_kernel_arg<
  _Arg,
  _CUDA_VSTD::void_t<typename _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>::__as_kernel_arg>>
{
  using type = typename _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>::__as_kernel_arg;
};

_CCCL_GLOBAL_CONSTANT __fn __launch_transform{};
} // namespace detail

template <typename _Arg>
using as_kernel_arg_t = typename detail::__as_kernel_arg<_Arg>::type;

} // namespace cuda::experimental

#endif // _CCCL_STD_VER >= 2017
#endif // !_CUDAX__LAUNCH_LAUNCH_TRANSFORM
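To recap how `as_kernel_arg_t` resolves: the primary `__as_kernel_arg` template yields the decayed result of the transform, and the `void_t` partial specialization takes over when that result names a nested `__as_kernel_arg` typedef. A compile-time sketch under those definitions (`plain` and `with_proxy` are hypothetical types):

// Illustrative sketch, not part of the commit.
#include <cuda/std/type_traits>
#include <cuda/stream_ref>

#include <cuda/experimental/__launch/launch_transform.cuh>

// No __cudax_launch_transform overload: the __ignore fallback returns the
// argument unchanged, so as_kernel_arg_t is just the decayed argument type.
struct plain
{};

// An overload returning a proxy with a nested __as_kernel_arg typedef: the
// kernel parameter type comes from that typedef.
struct with_proxy
{
  struct proxy
  {
    using __as_kernel_arg = int;
    explicit operator int() const
    {
      return 1;
    }
  };

  friend proxy __cudax_launch_transform(::cuda::stream_ref, with_proxy) noexcept
  {
    return {};
  }
};

static_assert(cuda::std::is_same_v<cuda::experimental::as_kernel_arg_t<plain>, plain>);
static_assert(cuda::std::is_same_v<cuda::experimental::as_kernel_arg_t<with_proxy>, int>);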
2 changes: 1 addition & 1 deletion cudax/include/cuda/experimental/__utility/ensure_current_device.cuh
@@ -33,7 +33,7 @@ namespace cuda::experimental
 //! @brief RAII helper which on construction sets the current device to the specified one or one a
 //! stream was created under. It sets the state back on destruction.
 //!
-struct __ensure_current_device
+struct [[maybe_unused]] __ensure_current_device
 {
   //! @brief Construct a new `__ensure_current_device` object and switch to the specified
   //! device.
56 changes: 56 additions & 0 deletions cudax/test/launch/launch_smoke.cu
@@ -104,6 +104,50 @@ struct dynamic_smem_span
   }
 };
 
+struct launch_transform_to_int_convertible
+{
+  int value_;
+
+  struct int_convertible
+  {
+    cudaStream_t stream_;
+    int value_;
+
+    int_convertible(cudaStream_t stream, int value) noexcept
+        : stream_(stream)
+        , value_(value)
+    {
+      // Check that the constructor runs before the kernel is launched
+      CHECK_FALSE(kernel_run_proof);
+    }
+
+    // Immovable to ensure that __launch_transform doesn't copy the returned
+    // object
+    int_convertible(int_convertible&&) = delete;
+
+    ~int_convertible() noexcept
+    {
+      // Check that the destructor runs after the kernel is launched
+      CUDART(cudaStreamSynchronize(stream_));
+      CHECK(kernel_run_proof);
+    }
+
+    using __as_kernel_arg = int;
+
+    // This is the value that will be passed to the kernel
+    explicit operator int() const
+    {
+      return value_;
+    }
+  };
+
+  _CCCL_NODISCARD_FRIEND int_convertible
+  __cudax_launch_transform(::cuda::stream_ref stream, launch_transform_to_int_convertible self) noexcept
+  {
+    return int_convertible(stream.get(), self.value_);
+  }
+};
+
 // Needs a separate function for Windows extended lambda
 void launch_smoke_test()
 {
Expand All @@ -127,10 +171,14 @@ void launch_smoke_test()
check_kernel_run(stream);
cudax::launch(stream, dims_or_conf, kernel_int_argument, 1);
check_kernel_run(stream);
cudax::launch(stream, dims_or_conf, kernel_int_argument, launch_transform_to_int_convertible{1});
check_kernel_run(stream);
cudax::launch(stream, dims_or_conf, functor_int_argument(), dummy);
check_kernel_run(stream);
cudax::launch(stream, dims_or_conf, functor_int_argument(), 1);
check_kernel_run(stream);
cudax::launch(stream, dims_or_conf, functor_int_argument(), launch_transform_to_int_convertible{1});
check_kernel_run(stream);

cudax::launch(stream, dims_or_conf, kernel_int_argument, 1U);
check_kernel_run(stream);
@@ -150,11 +198,15 @@
   check_kernel_run(stream);
   cudax::launch(stream, config, functor_instance, ::cuda::std::move(grid_size));
   check_kernel_run(stream);
+  cudax::launch(stream, config, functor_instance, launch_transform_to_int_convertible{grid_size});
+  check_kernel_run(stream);
 
   cudax::launch(stream, config, kernel_instance, grid_size);
   check_kernel_run(stream);
   cudax::launch(stream, config, kernel_instance, ::cuda::std::move(grid_size));
   check_kernel_run(stream);
+  cudax::launch(stream, config, kernel_instance, launch_transform_to_int_convertible{grid_size});
+  check_kernel_run(stream);
 
   cudax::launch(stream, config, functor_instance, static_cast<unsigned int>(grid_size));
   check_kernel_run(stream);
@@ -171,11 +223,15 @@
   check_kernel_run(stream);
   cudax::launch(stream, dimensions, functor_instance, ::cuda::std::move(grid_size));
   check_kernel_run(stream);
+  cudax::launch(stream, dimensions, functor_instance, launch_transform_to_int_convertible{grid_size});
+  check_kernel_run(stream);
 
   cudax::launch(stream, dimensions, kernel_instance, grid_size);
   check_kernel_run(stream);
   cudax::launch(stream, dimensions, kernel_instance, ::cuda::std::move(grid_size));
   check_kernel_run(stream);
+  cudax::launch(stream, dimensions, kernel_instance, launch_transform_to_int_convertible{grid_size});
+  check_kernel_run(stream);
 
   cudax::launch(stream, dimensions, functor_instance, static_cast<unsigned int>(grid_size));
   check_kernel_run(stream);