[CUDAX] Add combine API to kernel_config and allow adding default configuration to kernel functors (#3082)

* WIP

* Default kernel configuration

* Default kernel config

* Docs and one more test case

* Fix clang and format

* Make MSVC happy
pciolkosz authored Dec 11, 2024
1 parent 6e8bfc7 commit c6cc227
Showing 5 changed files with 263 additions and 41 deletions.
@@ -339,6 +339,18 @@ struct rank_helper
};
} // namespace detail

// Artificial empty hierarchy to make it possible for the config type to be empty;
// this seems easier than checking everywhere in the hierarchy APIs whether it's empty.
// Any usage of an empty hierarchy other than combine should lead to an error anyway.
struct __empty_hierarchy
{
template <typename _Other>
_CCCL_NODISCARD _Other combine(const _Other& __other) const
{
return __other;
}
};
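For illustration (a sketch only, not part of this commit), the intent described in the comment above is that combining the empty placeholder with any real hierarchy simply yields that hierarchy. The sketch assumes the `namespace cudax = cuda::experimental;` alias used by the cudax tests and the public launch header.

#include <cuda/experimental/launch.cuh> // assumed public header pulling in the hierarchy APIs

namespace cudax = cuda::experimental;

void empty_hierarchy_identity_sketch()
{
  auto hierarchy = cudax::make_hierarchy(cudax::grid_dims(2), cudax::block_dims<256>());
  // Combining the empty placeholder with a real hierarchy returns the real hierarchy unchanged.
  auto same = cudax::__empty_hierarchy{}.combine(hierarchy);
  static_assert(cuda::std::is_same_v<decltype(same), decltype(hierarchy)>);
}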

/**
* @brief Type representing a hierarchy of CUDA threads
*
@@ -731,7 +743,7 @@ public:
//!
//! @return Hierarchy holding the combined levels from both hierarchies
template <typename OtherUnit, typename... OtherLevels>
constexpr auto combine(const hierarchy_dimensions_fragment<OtherUnit, OtherLevels...>& other)
constexpr auto combine(const hierarchy_dimensions_fragment<OtherUnit, OtherLevels...>& other) const
{
using this_top_level = __level_type_of<::cuda::std::__type_index_c<0, Levels...>>;
using this_bottom_level = __level_type_of<::cuda::std::__type_index_c<sizeof...(Levels) - 1, Levels...>>;
@@ -776,6 +788,13 @@ public:
}
}
}

# ifndef _CCCL_DOXYGEN_INVOKED // Do not document
constexpr hierarchy_dimensions_fragment combine([[maybe_unused]] __empty_hierarchy __empty) const
{
return *this;
}
# endif // _CCCL_DOXYGEN_INVOKED
};

/**
170 changes: 135 additions & 35 deletions cudax/include/cuda/experimental/__launch/configuration.cuh
@@ -81,41 +81,15 @@ _CCCL_DEVICE auto& find_option_in_tuple(const ::cuda::std::tuple<Options...>& tu
return ::cuda::std::apply(find_option_in_tuple_impl<Kind>(), tuple);
}

template <typename _Option, typename... _OptionsList>
inline constexpr bool __option_present_in_list = ((_Option::kind == _OptionsList::kind) || ...);

template <typename...>
inline constexpr bool no_duplicate_options = true;

template <typename Option, typename... Rest>
inline constexpr bool no_duplicate_options<Option, Rest...> =
((Option::kind != Rest::kind) && ...) && no_duplicate_options<Rest...>;

template <typename... Prev>
_CCCL_NODISCARD constexpr auto process_config_args(const ::cuda::std::tuple<Prev...>& previous)
{
return kernel_config(::cuda::std::apply(make_hierarchy_fragment<void, const Prev&...>, previous));
}

template <typename... Prev, typename Arg, typename... Rest>
_CCCL_NODISCARD constexpr auto
process_config_args(const ::cuda::std::tuple<Prev...>& previous, const Arg& arg, const Rest&... rest)
{
if constexpr (::cuda::std::is_base_of_v<detail::launch_option, Arg>)
{
static_assert((::cuda::std::is_base_of_v<detail::launch_option, Rest> && ...),
"Hierarchy levels and launch options can't be mixed");
if constexpr (sizeof...(Prev) == 0)
{
return kernel_config(uninit_t{}, arg, rest...);
}
else
{
return kernel_config(::cuda::std::apply(make_hierarchy_fragment<void, const Prev&...>, previous), arg, rest...);
}
}
else
{
return process_config_args(::cuda::std::tuple_cat(previous, ::cuda::std::make_tuple(arg)), rest...);
}
}
!__option_present_in_list<Option, Rest...> && no_duplicate_options<Rest...>;

} // namespace detail

@@ -340,14 +314,51 @@ private:
}
};

template <typename... _OptionsToFilter>
struct __filter_options
{
template <bool _Pred, typename _Option>
_CCCL_NODISCARD auto __option_or_empty(const _Option& __option)
{
if constexpr (_Pred)
{
return ::cuda::std::tuple(__option);
}
else
{
return ::cuda::std::tuple();
}
}

template <typename... _Options>
_CCCL_NODISCARD auto operator()(const _Options&... __options)
{
return ::cuda::std::tuple_cat(
__option_or_empty<!detail::__option_present_in_list<_Options, _OptionsToFilter...>>(__options)...);
}
};
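A small sketch of how this filter behaves (illustration only, not part of this commit; assumes the `cudax` namespace alias from the tests): options whose `kind` already appears in `_OptionsToFilter` are dropped, everything else is forwarded in a tuple.

#include <cuda/experimental/launch.cuh> // assumed public header

namespace cudax = cuda::experimental;

void filter_options_sketch()
{
  using priority_option = decltype(cudax::launch_priority(0));
  // launch_priority(42) shares its kind with the filtered option type and is dropped;
  // cooperative_launch has a different kind and survives.
  auto filtered =
    cudax::__filter_options<priority_option>{}(cudax::launch_priority(42), cudax::cooperative_launch());
  static_assert(cuda::std::tuple_size_v<decltype(filtered)> == 1);
}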

template <typename _Dimensions, typename... _Options>
auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts);

template <typename _T>
inline constexpr bool __is_kernel_config = false;

template <typename _Dimensions, typename... _Options>
inline constexpr bool __is_kernel_config<kernel_config<_Dimensions, _Options...>> = true;

template <typename _Tp>
_CCCL_CONCEPT __kernel_has_default_config =
_CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)(requires(__is_kernel_config<decltype(__t.default_config())>));

/**
* @brief Type describing a kernel launch configuration
*
* This type should not be constructed directly; use the make_config helper function instead
*
* @tparam Dimensions
* cuda::experimental::hierarchy_dimensions instance that describes dimensions of thread hierarchy in this configuration
* object
* cuda::experimental::hierarchy_dimensions instance that describes dimensions of thread hierarchy in this
* configuration object
*
* @tparam Options
* Types of options that were added to this configuration object
@@ -358,7 +369,7 @@ struct kernel_config
Dimensions dims;
::cuda::std::tuple<Options...> options;

static_assert(::cuda::std::_Or<std::true_type, ::cuda::std::is_base_of<detail::launch_option, Options>...>::value);
static_assert(::cuda::std::_And<::cuda::std::is_base_of<detail::launch_option, Options>...>::value);
static_assert(detail::no_duplicate_options<Options...>);

constexpr kernel_config(const Dimensions& dims, const Options&... opts)
@@ -383,6 +394,54 @@ struct kernel_config
return kernel_config<Dimensions, Options..., NewOptions...>(
dims, ::cuda::std::tuple_cat(options, ::cuda::std::make_tuple(new_options...)));
}

/**
* @brief Combine this configuration with another configuration object
*
* Returns a new `kernel_config` that is a combination of this configuration and the configuration from the argument.
* It contains dimensions that are a combination of the dimensions in this object and the other configuration. The
* resulting hierarchy holds the levels present in both hierarchies. In case of overlapping levels, the hierarchy from
* this configuration is prioritized, so the result always holds all levels from this hierarchy and the non-overlapping
* levels from the other hierarchy. This behavior is the same as the `combine()` member function of the hierarchy type.
* The result also contains the configuration options from both configurations. If the same type of configuration
* option is present in both configurations, the option from this configuration is copied into the resulting configuration.
*
* @param __other_config
* Other configuration to combine with this configuration
*/
template <typename _OtherDimensions, typename... _OtherOptions>
_CCCL_NODISCARD auto combine(const kernel_config<_OtherDimensions, _OtherOptions...>& __other_config) const
{
// Can't use the fully qualified kernel_config name here because of an nvcc bug; TODO: remove __make_config_from_tuple
// once fixed
return __make_config_from_tuple(
dims.combine(__other_config.dims),
::cuda::std::tuple_cat(options, ::cuda::std::apply(__filter_options<Options...>{}, __other_config.options)));
}
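A usage sketch of the combination rules described above (illustration only, not part of this commit; assumes the `cudax` namespace alias from the tests):

#include <cuda/experimental/launch.cuh> // assumed public header
#include <cassert>

namespace cudax = cuda::experimental;

void combine_sketch()
{
  auto base  = cudax::make_config(cudax::grid_dims<2>, cudax::launch_priority(2));
  auto extra = cudax::make_config(cudax::block_dims(256), cudax::launch_priority(42));

  auto merged = base.combine(extra);
  // Levels from both hierarchies are kept: 2 blocks of 256 threads each.
  assert(merged.dims.count(cudax::thread) == 512);
  // For the overlapping launch_priority option, the one from `base` (the object combine() is called on) wins.
  assert(cuda::std::get<0>(merged.options).priority == 2);
}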

/**
* @brief Combine this configuration with default configuration of a kernel functor
*
* Returns a new `kernel_config` that is a combination of this configuration and a default configuration from the
* kernel argument. The default configuration is a `kernel_config` object returned from the `default_config()` member
* function of the kernel type. The configurations are combined using the `combine()` member function of this
* configuration. If the kernel has no default configuration, a copy of this configuration is returned without any
* changes.
*
* @param __kernel
* Kernel functor to search for the default configuration
*/
template <typename _Kernel>
_CCCL_NODISCARD auto combine_with_default(const _Kernel& __kernel) const
{
if constexpr (__kernel_has_default_config<_Kernel>)
{
return combine(__kernel.default_config());
}
else
{
return *this;
}
}
};
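A sketch of a kernel functor carrying a default configuration and how `combine_with_default` merges it with a user-supplied configuration (illustration only, not part of this commit; `my_kernel` is a hypothetical name, and the `cudax` alias matches the tests):

#include <cuda/experimental/launch.cuh> // assumed public header

namespace cudax = cuda::experimental;

// Hypothetical kernel functor providing a default configuration.
struct my_kernel
{
  // Detected through __kernel_has_default_config and picked up by combine_with_default().
  auto default_config() const
  {
    return cudax::make_config(cudax::block_dims<256>(), cudax::launch_priority(1));
  }

  template <typename Config>
  __device__ void operator()([[maybe_unused]] Config conf) const
  {
    // kernel body
  }
};

void combine_with_default_sketch()
{
  // The caller only picks the grid size; the block size and priority come from the kernel's default configuration.
  [[maybe_unused]] auto conf = cudax::make_config(cudax::grid_dims(128)).combine_with_default(my_kernel{});
  // conf now holds both the grid and block levels plus launch_priority(1).
}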

// We can consider removing the operator&, but it's convenient for in-line construction
@@ -407,6 +466,12 @@ operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims
return kernel_config(make_hierarchy_fragment(l1, l2));
}

template <typename _Dimensions, typename... _Options>
auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts)
{
return kernel_config(__dims, __opts);
}

template <typename Dimensions,
typename... Options,
typename Option,
@@ -470,11 +535,46 @@ constexpr auto distribute(int numElements) noexcept
return make_config(make_hierarchy(grid_dims(blocksPerGrid), block_dims<_ThreadsPerBlock>()));
}

template <typename... Prev>
_CCCL_NODISCARD constexpr auto __process_config_args(const ::cuda::std::tuple<Prev...>& previous)
{
if constexpr (sizeof...(Prev) == 0)
{
return kernel_config<__empty_hierarchy>(__empty_hierarchy());
}
else
{
return kernel_config(::cuda::std::apply(make_hierarchy_fragment<void, const Prev&...>, previous));
}
}

template <typename... Prev, typename Arg, typename... Rest>
_CCCL_NODISCARD constexpr auto
__process_config_args(const ::cuda::std::tuple<Prev...>& previous, const Arg& arg, const Rest&... rest)
{
if constexpr (::cuda::std::is_base_of_v<detail::launch_option, Arg>)
{
static_assert((::cuda::std::is_base_of_v<detail::launch_option, Rest> && ...),
"Hierarchy levels and launch options can't be mixed");
if constexpr (sizeof...(Prev) == 0)
{
return kernel_config(__empty_hierarchy(), arg, rest...);
}
else
{
return kernel_config(::cuda::std::apply(make_hierarchy_fragment<void, const Prev&...>, previous), arg, rest...);
}
}
else
{
return __process_config_args(::cuda::std::tuple_cat(previous, ::cuda::std::make_tuple(arg)), rest...);
}
}

template <typename... Args>
_CCCL_NODISCARD constexpr auto make_config(const Args&... args)
{
static_assert(sizeof...(Args) != 0, "Configuration can't be empty");
return detail::process_config_args(::cuda::std::make_tuple(), args...);
return __process_config_args(::cuda::std::make_tuple(), args...);
}
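A short sketch of the argument processing above (illustration only, not part of this commit; assumes the `cudax` alias from the tests): hierarchy levels are accumulated until the first launch option appears, and an options-only call falls back to the `__empty_hierarchy` placeholder.

#include <cuda/experimental/launch.cuh> // assumed public header

namespace cudax = cuda::experimental;

void make_config_sketch()
{
  // Levels come first, then options; interleaving them the other way round trips the static_assert above.
  [[maybe_unused]] auto full =
    cudax::make_config(cudax::grid_dims<2>, cudax::block_dims(256), cudax::launch_priority(2));

  // Options only: the dims member becomes the __empty_hierarchy placeholder.
  [[maybe_unused]] auto options_only = cudax::make_config(cudax::cooperative_launch());
  static_assert(cuda::std::is_same_v<decltype(options_only.dims), cudax::__empty_hierarchy>);
}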

namespace detail
9 changes: 5 additions & 4 deletions cudax/include/cuda/experimental/__launch/launch.cuh
@@ -125,14 +125,15 @@ void launch(
{
__ensure_current_device __dev_setter(stream);
cudaError_t status;
auto combined = conf.combine_with_default(kernel);
if constexpr (::cuda::std::is_invocable_v<Kernel, kernel_config<Dimensions, Config...>, as_kernel_arg_t<Args>...>)
{
auto launcher = detail::kernel_launcher<kernel_config<Dimensions, Config...>, Kernel, as_kernel_arg_t<Args>...>;
auto launcher = detail::kernel_launcher<decltype(combined), Kernel, as_kernel_arg_t<Args>...>;
status = detail::launch_impl(
stream,
conf,
combined,
launcher,
conf,
combined,
kernel,
static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, std::forward<Args>(args)))...);
}
Expand All @@ -142,7 +143,7 @@ void launch(
auto launcher = detail::kernel_launcher_no_config<Kernel, as_kernel_arg_t<Args>...>;
status = detail::launch_impl(
stream,
conf,
combined,
launcher,
kernel,
static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, std::forward<Args>(args)))...);
41 changes: 40 additions & 1 deletion cudax/test/launch/configuration.cu
@@ -201,5 +201,44 @@ TEST_CASE("Hierarchy construction in config", "[launch]")
CUDAX_REQUIRE(config_no_options.dims.count(cudax::thread) == 256);

[[maybe_unused]] auto config_no_dims = cudax::make_config(cudax::cooperative_launch());
static_assert(cuda::std::is_same_v<decltype(config_no_dims.dims), cudax::uninit_t>);
static_assert(cuda::std::is_same_v<decltype(config_no_dims.dims), cudax::__empty_hierarchy>);
}

TEST_CASE("Configuration combine", "[launch]")
{
auto grid = cudax::grid_dims<2>;
auto cluster = cudax::cluster_dims<2, 2>;
auto block = cudax::block_dims(256);
SECTION("Combine with no overlap")
{
auto config_part1 = make_config(grid);
auto config_part2 = make_config(block, cudax::launch_priority(2));
auto combined = config_part1.combine(config_part2);
[[maybe_unused]] auto combined_other_way = config_part2.combine(config_part1);
[[maybe_unused]] auto combined_with_empty = combined.combine(cudax::make_config());
[[maybe_unused]] auto empty_with_combined = cudax::make_config().combine(combined);
static_assert(
cuda::std::is_same_v<decltype(combined), decltype(make_config(grid, block, cudax::launch_priority(2)))>);
static_assert(cuda::std::is_same_v<decltype(combined), decltype(combined_other_way)>);
static_assert(cuda::std::is_same_v<decltype(combined), decltype(combined_with_empty)>);
static_assert(cuda::std::is_same_v<decltype(combined), decltype(empty_with_combined)>);
CUDAX_REQUIRE(combined.dims.count(cudax::thread) == 512);
}
SECTION("Combine with overlap")
{
auto config_part1 = make_config(grid, cluster, cudax::launch_priority(2));
auto config_part2 = make_config(cudax::cluster_dims<256>, block, cudax::launch_priority(42));
auto combined = config_part1.combine(config_part2);
CUDAX_REQUIRE(combined.dims.count(cudax::thread) == 2048);
CUDAX_REQUIRE(cuda::std::get<0>(combined.options).priority == 2);

auto replaced_one_option = cudax::make_config(cudax::launch_priority(3)).combine(combined);
CUDAX_REQUIRE(replaced_one_option.dims.count(cudax::thread) == 2048);
CUDAX_REQUIRE(cuda::std::get<0>(replaced_one_option.options).priority == 3);

[[maybe_unused]] auto combined_with_extra_option =
combined.combine(cudax::make_config(cudax::cooperative_launch()));
static_assert(cuda::std::is_same_v<decltype(combined.dims), decltype(combined_with_extra_option.dims)>);
static_assert(cuda::std::tuple_size_v<decltype(combined_with_extra_option.options)> == 2);
}
}
