From 47b8f5ccdf46358b27fbf156b5dab509fc6ebdac Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Wed, 7 Aug 2024 12:40:56 -0700 Subject: [PATCH] [CUDAX] add `__launch_transform` to transform arguments to `cudax::launch` prior to launching the kernel (#2202) * add `__launch_transform` to transform arguments to `cudax::launch` prior to launching the kernel --- .../cuda/experimental/__detail/utility.cuh | 20 ++++ .../cuda/experimental/__launch/launch.cuh | 94 +++++++++++++------ .../__launch/launch_transform.cuh | 83 ++++++++++++++++ .../__utility/ensure_current_device.cuh | 2 +- cudax/test/launch/launch_smoke.cu | 56 +++++++++++ 5 files changed, 223 insertions(+), 32 deletions(-) create mode 100644 cudax/include/cuda/experimental/__launch/launch_transform.cuh diff --git a/cudax/include/cuda/experimental/__detail/utility.cuh b/cudax/include/cuda/experimental/__detail/utility.cuh index 874075b1075..738a5d6244b 100644 --- a/cudax/include/cuda/experimental/__detail/utility.cuh +++ b/cudax/include/cuda/experimental/__detail/utility.cuh @@ -11,8 +11,28 @@ #ifndef __CUDAX_DETAIL_UTILITY_H #define __CUDAX_DETAIL_UTILITY_H +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + namespace cuda::experimental { +namespace detail +{ +struct __ignore +{ + template + _CCCL_HOST_DEVICE constexpr __ignore(Args&&...) noexcept + {} +}; +} // namespace detail + struct uninit_t { explicit uninit_t() = default; diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh index 1a49cafa405..f4aee8a173d 100644 --- a/cudax/include/cuda/experimental/__launch/launch.cuh +++ b/cudax/include/cuda/experimental/__launch/launch.cuh @@ -16,6 +16,7 @@ #include #include +#include #include #if _CCCL_STD_VER >= 2017 @@ -120,18 +121,25 @@ template & conf, const Kernel& kernel, Args... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); + __ensure_current_device __dev_setter(stream); cudaError_t status; - if constexpr (::cuda::std::is_invocable_v, Args...>) + if constexpr (::cuda::std::is_invocable_v, as_kernel_arg_t...>) { - auto launcher = detail::kernel_launcher, Kernel, Args...>; - status = detail::launch_impl(stream, conf, launcher, conf, kernel, args...); + auto launcher = detail::kernel_launcher, Kernel, as_kernel_arg_t...>; + status = detail::launch_impl( + stream, + conf, + launcher, + conf, + kernel, + static_cast>(detail::__launch_transform(stream, args))...); } else { - static_assert(::cuda::std::is_invocable_v); - auto launcher = detail::kernel_launcher_no_config; - status = detail::launch_impl(stream, conf, launcher, kernel, args...); + static_assert(::cuda::std::is_invocable_v...>); + auto launcher = detail::kernel_launcher_no_config...>; + status = detail::launch_impl( + stream, conf, launcher, kernel, static_cast>(detail::__launch_transform(stream, args))...); } if (status != cudaSuccess) { @@ -183,18 +191,29 @@ void launch( template void launch(::cuda::stream_ref stream, const hierarchy_dimensions& dims, const Kernel& kernel, Args... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); + __ensure_current_device __dev_setter(stream); cudaError_t status; - if constexpr (::cuda::std::is_invocable_v, Args...>) + if constexpr (::cuda::std::is_invocable_v, as_kernel_arg_t...>) { - auto launcher = detail::kernel_launcher, Kernel, Args...>; - status = detail::launch_impl(stream, kernel_config(dims), launcher, dims, kernel, args...); + auto launcher = detail::kernel_launcher, Kernel, as_kernel_arg_t...>; + status = detail::launch_impl( + stream, + kernel_config(dims), + launcher, + dims, + kernel, + static_cast>(detail::__launch_transform(stream, args))...); } else { - static_assert(::cuda::std::is_invocable_v); - auto launcher = detail::kernel_launcher_no_config; - status = detail::launch_impl(stream, kernel_config(dims), launcher, kernel, args...); + static_assert(::cuda::std::is_invocable_v...>); + auto launcher = detail::kernel_launcher_no_config...>; + status = detail::launch_impl( + stream, + kernel_config(dims), + launcher, + kernel, + static_cast>(detail::__launch_transform(stream, args))...); } if (status != cudaSuccess) { @@ -248,10 +267,14 @@ void launch(::cuda::stream_ref stream, void (*kernel)(kernel_config, ExpArgs...), ActArgs&&... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); - cudaError_t status = [&](ExpArgs... args) { - return detail::launch_impl(stream, conf, kernel, conf, args...); - }(std::forward(args)...); + __ensure_current_device __dev_setter(stream); + cudaError_t status = detail::launch_impl( + stream, // + conf, + kernel, + conf, + static_cast>(detail::__launch_transform(stream, std::forward(args)))...); + if (status != cudaSuccess) { ::cuda::__throw_cuda_error(status, "Failed to launch a kernel"); @@ -303,10 +326,14 @@ void launch(::cuda::stream_ref stream, void (*kernel)(hierarchy_dimensions, ExpArgs...), ActArgs&&... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); - cudaError_t status = [&](ExpArgs... args) { - return detail::launch_impl(stream, kernel_config(dims), kernel, dims, args...); - }(std::forward(args)...); + __ensure_current_device __dev_setter(stream); + cudaError_t status = detail::launch_impl( + stream, + kernel_config(dims), + kernel, + dims, + static_cast>(detail::__launch_transform(stream, std::forward(args)))...); + if (status != cudaSuccess) { ::cuda::__throw_cuda_error(status, "Failed to launch a kernel"); @@ -320,7 +347,6 @@ void launch(::cuda::stream_ref stream, * Kernel function is a function with __global__ annotation. * Function might or might not accept the configuration as its first argument. * - * * @par Snippet * @code * #include @@ -359,10 +385,13 @@ void launch(::cuda::stream_ref stream, void (*kernel)(ExpArgs...), ActArgs&&... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); - cudaError_t status = [&](ExpArgs... args) { - return detail::launch_impl(stream, conf, kernel, args...); - }(std::forward(args)...); + __ensure_current_device __dev_setter(stream); + cudaError_t status = detail::launch_impl( + stream, // + conf, + kernel, + static_cast>(detail::__launch_transform(stream, std::forward(args)))...); + if (status != cudaSuccess) { ::cuda::__throw_cuda_error(status, "Failed to launch a kernel"); @@ -412,10 +441,13 @@ template void launch( ::cuda::stream_ref stream, const hierarchy_dimensions& dims, void (*kernel)(ExpArgs...), ActArgs&&... args) { - [[maybe_unused]] __ensure_current_device __dev_setter(stream); - cudaError_t status = [&](ExpArgs... args) { - return detail::launch_impl(stream, kernel_config(dims), kernel, args...); - }(std::forward(args)...); + __ensure_current_device __dev_setter(stream); + cudaError_t status = detail::launch_impl( + stream, + kernel_config(dims), + kernel, + static_cast>(detail::__launch_transform(stream, std::forward(args)))...); + if (status != cudaSuccess) { ::cuda::__throw_cuda_error(status, "Failed to launch a kernel"); diff --git a/cudax/include/cuda/experimental/__launch/launch_transform.cuh b/cudax/include/cuda/experimental/__launch/launch_transform.cuh new file mode 100644 index 00000000000..4692cf93764 --- /dev/null +++ b/cudax/include/cuda/experimental/__launch/launch_transform.cuh @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__LAUNCH_LAUNCH_TRANSFORM +#define _CUDAX__LAUNCH_LAUNCH_TRANSFORM +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +#if _CCCL_STD_VER >= 2017 +namespace cuda::experimental +{ +namespace detail +{ +// Types should define overloads of __cudax_launch_transform that are find-able +// by ADL in order to customize how cudax::launch handles that type. The +// overload below, which simply returns the argument unmodified, is the overload +// that gets chosen if no other overload matches. It takes __ignore as the first +// argument to make this overload less preferred than other overloads that take +// a stream_ref as the first argument. +template +_CCCL_NODISCARD constexpr _Arg&& __cudax_launch_transform(__ignore, _Arg&& __arg) noexcept +{ + return _CUDA_VSTD::forward<_Arg>(__arg); +} + +template +using __launch_transform_direct_result_t = + decltype(__cudax_launch_transform(::cuda::stream_ref{}, _CUDA_VSTD::declval<_Arg>())); + +struct __fn +{ + template + _CCCL_NODISCARD __launch_transform_direct_result_t<_Arg> operator()(::cuda::stream_ref __stream, _Arg&& __arg) const + { + // This call is unqualified to allow ADL + return __cudax_launch_transform(__stream, _CUDA_VSTD::forward<_Arg>(__arg)); + } +}; + +template +struct __as_kernel_arg +{ + using type = _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>; +}; + +template +struct __as_kernel_arg< + _Arg, + _CUDA_VSTD::void_t>::__as_kernel_arg>> +{ + using type = typename _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>::__as_kernel_arg; +}; + +_CCCL_GLOBAL_CONSTANT __fn __launch_transform{}; +} // namespace detail + +template +using as_kernel_arg_t = typename detail::__as_kernel_arg<_Arg>::type; + +} // namespace cuda::experimental + +#endif // _CCCL_STD_VER >= 2017 +#endif // !_CUDAX__LAUNCH_LAUNCH_TRANSFORM diff --git a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh index 2431d028187..839adafb96c 100644 --- a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh +++ b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh @@ -33,7 +33,7 @@ namespace cuda::experimental //! @brief RAII helper which on construction sets the current device to the specified one or one a //! stream was created under. It sets the state back on destruction. //! -struct __ensure_current_device +struct [[maybe_unused]] __ensure_current_device { //! @brief Construct a new `__ensure_current_device` object and switch to the specified //! device. diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu index 810e65c3908..29d84d2e7c9 100644 --- a/cudax/test/launch/launch_smoke.cu +++ b/cudax/test/launch/launch_smoke.cu @@ -104,6 +104,50 @@ struct dynamic_smem_span } }; +struct launch_transform_to_int_convertible +{ + int value_; + + struct int_convertible + { + cudaStream_t stream_; + int value_; + + int_convertible(cudaStream_t stream, int value) noexcept + : stream_(stream) + , value_(value) + { + // Check that the constructor runs before the kernel is launched + CHECK_FALSE(kernel_run_proof); + } + + // Immovable to ensure that __launch_transform doesn't copy the returned + // object + int_convertible(int_convertible&&) = delete; + + ~int_convertible() noexcept + { + // Check that the destructor runs after the kernel is launched + CUDART(cudaStreamSynchronize(stream_)); + CHECK(kernel_run_proof); + } + + using __as_kernel_arg = int; + + // This is the value that will be passed to the kernel + explicit operator int() const + { + return value_; + } + }; + + _CCCL_NODISCARD_FRIEND int_convertible + __cudax_launch_transform(::cuda::stream_ref stream, launch_transform_to_int_convertible self) noexcept + { + return int_convertible(stream.get(), self.value_); + } +}; + // Needs a separe function for Windows extended lambda void launch_smoke_test() { @@ -127,10 +171,14 @@ void launch_smoke_test() check_kernel_run(stream); cudax::launch(stream, dims_or_conf, kernel_int_argument, 1); check_kernel_run(stream); + cudax::launch(stream, dims_or_conf, kernel_int_argument, launch_transform_to_int_convertible{1}); + check_kernel_run(stream); cudax::launch(stream, dims_or_conf, functor_int_argument(), dummy); check_kernel_run(stream); cudax::launch(stream, dims_or_conf, functor_int_argument(), 1); check_kernel_run(stream); + cudax::launch(stream, dims_or_conf, functor_int_argument(), launch_transform_to_int_convertible{1}); + check_kernel_run(stream); cudax::launch(stream, dims_or_conf, kernel_int_argument, 1U); check_kernel_run(stream); @@ -150,11 +198,15 @@ void launch_smoke_test() check_kernel_run(stream); cudax::launch(stream, config, functor_instance, ::cuda::std::move(grid_size)); check_kernel_run(stream); + cudax::launch(stream, config, functor_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(stream); cudax::launch(stream, config, kernel_instance, grid_size); check_kernel_run(stream); cudax::launch(stream, config, kernel_instance, ::cuda::std::move(grid_size)); check_kernel_run(stream); + cudax::launch(stream, config, kernel_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(stream); cudax::launch(stream, config, functor_instance, static_cast(grid_size)); check_kernel_run(stream); @@ -171,11 +223,15 @@ void launch_smoke_test() check_kernel_run(stream); cudax::launch(stream, dimensions, functor_instance, ::cuda::std::move(grid_size)); check_kernel_run(stream); + cudax::launch(stream, dimensions, functor_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(stream); cudax::launch(stream, dimensions, kernel_instance, grid_size); check_kernel_run(stream); cudax::launch(stream, dimensions, kernel_instance, ::cuda::std::move(grid_size)); check_kernel_run(stream); + cudax::launch(stream, dimensions, kernel_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(stream); cudax::launch(stream, dimensions, functor_instance, static_cast(grid_size)); check_kernel_run(stream);