From 47b8f5ccdf46358b27fbf156b5dab509fc6ebdac Mon Sep 17 00:00:00 2001
From: Eric Niebler <eniebler@nvidia.com>
Date: Wed, 7 Aug 2024 12:40:56 -0700
Subject: [PATCH] [CUDAX] add `__launch_transform` to transform arguments to
 `cudax::launch` prior to launching the kernel (#2202)

* add `__launch_transform` to transform arguments to `cudax::launch` prior to launching the kernel
---
 .../cuda/experimental/__detail/utility.cuh    | 20 ++++
 .../cuda/experimental/__launch/launch.cuh     | 94 +++++++++++++------
 .../__launch/launch_transform.cuh             | 83 ++++++++++++++++
 .../__utility/ensure_current_device.cuh       |  2 +-
 cudax/test/launch/launch_smoke.cu             | 56 +++++++++++
 5 files changed, 223 insertions(+), 32 deletions(-)
 create mode 100644 cudax/include/cuda/experimental/__launch/launch_transform.cuh
diff --git a/cudax/include/cuda/experimental/__detail/utility.cuh b/cudax/include/cuda/experimental/__detail/utility.cuh
index 874075b1075..738a5d6244b 100644
--- a/cudax/include/cuda/experimental/__detail/utility.cuh
+++ b/cudax/include/cuda/experimental/__detail/utility.cuh
@@ -11,8 +11,28 @@
 #ifndef __CUDAX_DETAIL_UTILITY_H
 #define __CUDAX_DETAIL_UTILITY_H
 
+#include <cuda/__cccl_config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
 namespace cuda::experimental
 {
+namespace detail
+{
+struct __ignore
+{
+  template <typename... Args>
+  _CCCL_HOST_DEVICE constexpr __ignore(Args&&...) noexcept
+  {}
+};
+} // namespace detail
+
 struct uninit_t
 {
   explicit uninit_t() = default;
diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh
index 1a49cafa405..f4aee8a173d 100644
--- a/cudax/include/cuda/experimental/__launch/launch.cuh
+++ b/cudax/include/cuda/experimental/__launch/launch.cuh
@@ -16,6 +16,7 @@
 #include <cuda/stream_ref>
 
 #include <cuda/experimental/__launch/configuration.cuh>
+#include <cuda/experimental/__launch/launch_transform.cuh>
 #include <cuda/experimental/__utility/ensure_current_device.cuh>
 
 #if _CCCL_STD_VER >= 2017
@@ -120,18 +121,25 @@ template <typename... Args, typename... Config, typename Dimensions, typename Ke
 void launch(
   ::cuda::stream_ref stream, const kernel_config<Dimensions, Config...>& conf, const Kernel& kernel, Args... args)
 {
-  [[maybe_unused]] __ensure_current_device __dev_setter(stream);
+  __ensure_current_device __dev_setter(stream);
   cudaError_t status;
-  if constexpr (::cuda::std::is_invocable_v<Kernel, kernel_config<Dimensions, Config...>, Args...>)
+  if constexpr (::cuda::std::is_invocable_v<Kernel, kernel_config<Dimensions, Config...>, as_kernel_arg_t<Args>...>)
   {
-    auto launcher = detail::kernel_launcher<kernel_config<Dimensions, Config...>, Kernel, Args...>;
-    status        = detail::launch_impl(stream, conf, launcher, conf, kernel, args...);
+    auto launcher = detail::kernel_launcher<kernel_config<Dimensions, Config...>, Kernel, as_kernel_arg_t<Args>...>;
+    status        = detail::launch_impl(
+      stream,
+      conf,
+      launcher,
+      conf,
+      kernel,
+      static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, args))...);
   }
   else
   {
-    static_assert(::cuda::std::is_invocable_v<Kernel, Args...>);
-    auto launcher = detail::kernel_launcher_no_config<Kernel, Args...>;
-    status        = detail::launch_impl(stream, conf, launcher, kernel, args...);
+    static_assert(::cuda::std::is_invocable_v<Kernel, as_kernel_arg_t<Args>...>);
+    auto launcher = detail::kernel_launcher_no_config<Kernel, as_kernel_arg_t<Args>...>;
+    status        = detail::launch_impl(
+      stream, conf, launcher, kernel, static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, args))...);
   }
   if (status != cudaSuccess)
   {
@@ -183,18 +191,29 @@ void launch(
 template <typename... Args, typename... Levels, typename Kernel>
 void launch(::cuda::stream_ref stream, const hierarchy_dimensions<Levels...>& dims, const Kernel& kernel, Args... args)
 {
-  [[maybe_unused]] __ensure_current_device __dev_setter(stream);
+  __ensure_current_device __dev_setter(stream);
   cudaError_t status;
-  if constexpr (::cuda::std::is_invocable_v<Kernel, hierarchy_dimensions<Levels...>, Args...>)
+  if constexpr (::cuda::std::is_invocable_v<Kernel, hierarchy_dimensions<Levels...>, as_kernel_arg_t<Args>...>)
   {
-    auto launcher = detail::kernel_launcher<hierarchy_dimensions<Levels...>, Kernel, Args...>;
-    status        = detail::launch_impl(stream, kernel_config(dims), launcher, dims, kernel, args...);
+    auto launcher = detail::kernel_launcher<hierarchy_dimensions<Levels...>, Kernel, as_kernel_arg_t<Args>...>;
+    status        = detail::launch_impl(
+      stream,
+      kernel_config(dims),
+      launcher,
+      dims,
+      kernel,
+      static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, args))...);
   }
   else
   {
-    static_assert(::cuda::std::is_invocable_v<Kernel, Args...>);
-    auto launcher = detail::kernel_launcher_no_config<Kernel, Args...>;
-    status        = detail::launch_impl(stream, kernel_config(dims), launcher, kernel, args...);
+    static_assert(::cuda::std::is_invocable_v<Kernel, as_kernel_arg_t<Args>...>);
+    auto launcher = detail::kernel_launcher_no_config<Kernel, as_kernel_arg_t<Args>...>;
+    status        = detail::launch_impl(
+      stream,
+      kernel_config(dims),
+      launcher,
+      kernel,
+      static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, args))...);
   }
   if (status != cudaSuccess)
   {
@@ -248,10 +267,14 @@ void launch(::cuda::stream_ref stream,
             void (*kernel)(kernel_config<Dimensions, Config...>, ExpArgs...),
             ActArgs&&... args)
 {
-  [[maybe_unused]] __ensure_current_device __dev_setter(stream);
-  cudaError_t status = [&](ExpArgs... args) {
-    return detail::launch_impl(stream, conf, kernel, conf, args...);
-  }(std::forward<ActArgs>(args)...);
+  __ensure_current_device __dev_setter(stream);
+  cudaError_t status = detail::launch_impl(
+    stream, //
+    conf,
+    kernel,
+    conf,
+    static_cast<as_kernel_arg_t<ActArgs>>(detail::__launch_transform(stream, std::forward<ActArgs>(args)))...);
+
   if (status != cudaSuccess)
   {
     ::cuda::__throw_cuda_error(status, "Failed to launch a kernel");
@@ -303,10 +326,14 @@ void launch(::cuda::stream_ref stream,
             void (*kernel)(hierarchy_dimensions<Levels...>, ExpArgs...),
             ActArgs&&... args)
 {
-  [[maybe_unused]] __ensure_current_device __dev_setter(stream);
-  cudaError_t status = [&](ExpArgs... args) {
-    return detail::launch_impl(stream, kernel_config(dims), kernel, dims, args...);
-  }(std::forward<ActArgs>(args)...);
+  __ensure_current_device __dev_setter(stream);
+  cudaError_t status = detail::launch_impl(
+    stream,
+    kernel_config(dims),
+    kernel,
+    dims,
+    static_cast<as_kernel_arg_t<ActArgs>>(detail::__launch_transform(stream, std::forward<ActArgs>(args)))...);
+
   if (status != cudaSuccess)
   {
     ::cuda::__throw_cuda_error(status, "Failed to launch a kernel");
@@ -320,7 +347,6 @@ void launch(::cuda::stream_ref stream,
  * Kernel function is a function with __global__ annotation.
  * Function might or might not accept the configuration as its first argument.
  *
- *
  * @par Snippet
  * @code
  * #include <cstdio>
@@ -359,10 +385,13 @@ void launch(::cuda::stream_ref stream,
             void (*kernel)(ExpArgs...),
             ActArgs&&... args)
 {
-  [[maybe_unused]] __ensure_current_device __dev_setter(stream);
-  cudaError_t status = [&](ExpArgs... args) {
-    return detail::launch_impl(stream, conf, kernel, args...);
-  }(std::forward<ActArgs>(args)...);
+  __ensure_current_device __dev_setter(stream);
+  cudaError_t status = detail::launch_impl(
+    stream, //
+    conf,
+    kernel,
+    static_cast<as_kernel_arg_t<ActArgs>>(detail::__launch_transform(stream, std::forward<ActArgs>(args)))...);
+
   if (status != cudaSuccess)
   {
     ::cuda::__throw_cuda_error(status, "Failed to launch a kernel");
@@ -412,10 +441,13 @@ template <typename... ExpArgs, typename... ActArgs, typename... Levels>
 void launch(
   ::cuda::stream_ref stream, const hierarchy_dimensions<Levels...>& dims, void (*kernel)(ExpArgs...), ActArgs&&... args)
 {
-  [[maybe_unused]] __ensure_current_device __dev_setter(stream);
-  cudaError_t status = [&](ExpArgs... args) {
-    return detail::launch_impl(stream, kernel_config(dims), kernel, args...);
-  }(std::forward<ActArgs>(args)...);
+  __ensure_current_device __dev_setter(stream);
+  cudaError_t status = detail::launch_impl(
+    stream,
+    kernel_config(dims),
+    kernel,
+    static_cast<as_kernel_arg_t<ActArgs>>(detail::__launch_transform(stream, std::forward<ActArgs>(args)))...);
+
   if (status != cudaSuccess)
   {
     ::cuda::__throw_cuda_error(status, "Failed to launch a kernel");
diff --git a/cudax/include/cuda/experimental/__launch/launch_transform.cuh b/cudax/include/cuda/experimental/__launch/launch_transform.cuh
new file mode 100644
index 00000000000..4692cf93764
--- /dev/null
+++ b/cudax/include/cuda/experimental/__launch/launch_transform.cuh
@@ -0,0 +1,83 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDAX__LAUNCH_LAUNCH_TRANSFORM
+#define _CUDAX__LAUNCH_LAUNCH_TRANSFORM
+#include <cuda/__cccl_config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/decay.h>
+#include <cuda/std/utility>
+#include <cuda/stream_ref>
+
+#include <cuda/experimental/__detail/utility.cuh>
+
+#if _CCCL_STD_VER >= 2017
+namespace cuda::experimental
+{
+namespace detail
+{
+// Types should define overloads of __cudax_launch_transform that are find-able
+// by ADL in order to customize how cudax::launch handles that type. The
+// overload below, which simply returns the argument unmodified, is the overload
+// that gets chosen if no other overload matches. It takes __ignore as the first
+// argument to make this overload less preferred than other overloads that take
+// a stream_ref as the first argument.
+template <typename _Arg>
+_CCCL_NODISCARD constexpr _Arg&& __cudax_launch_transform(__ignore, _Arg&& __arg) noexcept
+{
+  return _CUDA_VSTD::forward<_Arg>(__arg);
+}
+
+template <typename _Arg>
+using __launch_transform_direct_result_t =
+  decltype(__cudax_launch_transform(::cuda::stream_ref{}, _CUDA_VSTD::declval<_Arg>()));
+
+struct __fn
+{
+  template <typename _Arg>
+  _CCCL_NODISCARD __launch_transform_direct_result_t<_Arg> operator()(::cuda::stream_ref __stream, _Arg&& __arg) const
+  {
+    // This call is unqualified to allow ADL
+    return __cudax_launch_transform(__stream, _CUDA_VSTD::forward<_Arg>(__arg));
+  }
+};
+
+template <typename _Arg, typename _Enable = void>
+struct __as_kernel_arg
+{
+  using type = _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>;
+};
+
+template <typename _Arg>
+struct __as_kernel_arg<
+  _Arg,
+  _CUDA_VSTD::void_t<typename _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>::__as_kernel_arg>>
+{
+  using type = typename _CUDA_VSTD::decay_t<__launch_transform_direct_result_t<_Arg>>::__as_kernel_arg;
+};
+
+_CCCL_GLOBAL_CONSTANT __fn __launch_transform{};
+} // namespace detail
+
+template <typename _Arg>
+using as_kernel_arg_t = typename detail::__as_kernel_arg<_Arg>::type;
+
+} // namespace cuda::experimental
+
+#endif // _CCCL_STD_VER >= 2017
+#endif // !_CUDAX__LAUNCH_LAUNCH_TRANSFORM
diff --git a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh
index 2431d028187..839adafb96c 100644
--- a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh
+++ b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh
@@ -33,7 +33,7 @@ namespace cuda::experimental
 //! @brief RAII helper which on construction sets the current device to the specified one or one a
 //! stream was created under. It sets the state back on destruction.
 //!
-struct __ensure_current_device
+struct [[maybe_unused]] __ensure_current_device
 {
   //! @brief Construct a new `__ensure_current_device` object and switch to the specified
   //!        device.
diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu
index 810e65c3908..29d84d2e7c9 100644
--- a/cudax/test/launch/launch_smoke.cu
+++ b/cudax/test/launch/launch_smoke.cu
@@ -104,6 +104,50 @@ struct dynamic_smem_span
   }
 };
 
+struct launch_transform_to_int_convertible
+{
+  int value_;
+
+  struct int_convertible
+  {
+    cudaStream_t stream_;
+    int value_;
+
+    int_convertible(cudaStream_t stream, int value) noexcept
+        : stream_(stream)
+        , value_(value)
+    {
+      // Check that the constructor runs before the kernel is launched
+      CHECK_FALSE(kernel_run_proof);
+    }
+
+    // Immovable to ensure that __launch_transform doesn't copy the returned
+    // object
+    int_convertible(int_convertible&&) = delete;
+
+    ~int_convertible() noexcept
+    {
+      // Check that the destructor runs after the kernel is launched
+      CUDART(cudaStreamSynchronize(stream_));
+      CHECK(kernel_run_proof);
+    }
+
+    using __as_kernel_arg = int;
+
+    // This is the value that will be passed to the kernel
+    explicit operator int() const
+    {
+      return value_;
+    }
+  };
+
+  _CCCL_NODISCARD_FRIEND int_convertible
+  __cudax_launch_transform(::cuda::stream_ref stream, launch_transform_to_int_convertible self) noexcept
+  {
+    return int_convertible(stream.get(), self.value_);
+  }
+};
+
 // Needs a separe function for Windows extended lambda
 void launch_smoke_test()
 {
@@ -127,10 +171,14 @@ void launch_smoke_test()
         check_kernel_run(stream);
         cudax::launch(stream, dims_or_conf, kernel_int_argument, 1);
         check_kernel_run(stream);
+        cudax::launch(stream, dims_or_conf, kernel_int_argument, launch_transform_to_int_convertible{1});
+        check_kernel_run(stream);
         cudax::launch(stream, dims_or_conf, functor_int_argument(), dummy);
         check_kernel_run(stream);
         cudax::launch(stream, dims_or_conf, functor_int_argument(), 1);
         check_kernel_run(stream);
+        cudax::launch(stream, dims_or_conf, functor_int_argument(), launch_transform_to_int_convertible{1});
+        check_kernel_run(stream);
 
         cudax::launch(stream, dims_or_conf, kernel_int_argument, 1U);
         check_kernel_run(stream);
@@ -150,11 +198,15 @@ void launch_smoke_test()
       check_kernel_run(stream);
       cudax::launch(stream, config, functor_instance, ::cuda::std::move(grid_size));
       check_kernel_run(stream);
+      cudax::launch(stream, config, functor_instance, launch_transform_to_int_convertible{grid_size});
+      check_kernel_run(stream);
 
       cudax::launch(stream, config, kernel_instance, grid_size);
       check_kernel_run(stream);
       cudax::launch(stream, config, kernel_instance, ::cuda::std::move(grid_size));
       check_kernel_run(stream);
+      cudax::launch(stream, config, kernel_instance, launch_transform_to_int_convertible{grid_size});
+      check_kernel_run(stream);
 
       cudax::launch(stream, config, functor_instance, static_cast<unsigned int>(grid_size));
       check_kernel_run(stream);
@@ -171,11 +223,15 @@ void launch_smoke_test()
       check_kernel_run(stream);
       cudax::launch(stream, dimensions, functor_instance, ::cuda::std::move(grid_size));
       check_kernel_run(stream);
+      cudax::launch(stream, dimensions, functor_instance, launch_transform_to_int_convertible{grid_size});
+      check_kernel_run(stream);
 
       cudax::launch(stream, dimensions, kernel_instance, grid_size);
       check_kernel_run(stream);
       cudax::launch(stream, dimensions, kernel_instance, ::cuda::std::move(grid_size));
       check_kernel_run(stream);
+      cudax::launch(stream, dimensions, kernel_instance, launch_transform_to_int_convertible{grid_size});
+      check_kernel_run(stream);
 
       cudax::launch(stream, dimensions, functor_instance, static_cast<unsigned int>(grid_size));
       check_kernel_run(stream);