[CUDAX] Add combine API to kernel_config and allow adding default configuration to kernel functors (#3082)

* WIP

* Default kernel configuration

* Default kernel config

* Docs and one more test case

* Fix clang and format

* Make MSVC happy
pciolkosz authored Dec 11, 2024
1 parent 6e8bfc7 commit c6cc227
Showing 5 changed files with 263 additions and 41 deletions.
@@ -339,6 +339,18 @@ struct rank_helper
};
} // namespace detail

// Artificial empty hierarchy to make it possible for the config type to be empty;
// this seems easier than checking everywhere in the hierarchy APIs whether it's empty.
// Any usage of an empty hierarchy other than combine should lead to an error anyway.
struct __empty_hierarchy
{
template <typename _Other>
_CCCL_NODISCARD _Other combine(const _Other& __other) const
{
return __other;
}
};
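For illustration (a sketch only, not part of this commit), the intent described in the comment above is that combining the empty placeholder with any real hierarchy simply yields that hierarchy. The sketch assumes the `namespace cudax = cuda::experimental;` alias used by the cudax tests and the public launch header.

#include <cuda/experimental/launch.cuh> // assumed public header pulling in the hierarchy APIs

namespace cudax = cuda::experimental;

void empty_hierarchy_identity_sketch()
{
  auto hierarchy = cudax::make_hierarchy(cudax::grid_dims(2), cudax::block_dims<256>());
  // Combining the empty placeholder with a real hierarchy returns the real hierarchy unchanged.
  auto same = cudax::__empty_hierarchy{}.combine(hierarchy);
  static_assert(cuda::std::is_same_v<decltype(same), decltype(hierarchy)>);
}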

/**
* @brief Type representing a hierarchy of CUDA threads
*
@@ -731,7 +743,7 @@ public:
//!
//! @return Hierarchy holding the combined levels from both hierarchies
template <typename OtherUnit, typename... OtherLevels>
constexpr auto combine(const hierarchy_dimensions_fragment<OtherUnit, OtherLevels...>& other)
constexpr auto combine(const hierarchy_dimensions_fragment<OtherUnit, OtherLevels...>& other) const
{
using this_top_level = __level_type_of<::cuda::std::__type_index_c<0, Levels...>>;
using this_bottom_level = __level_type_of<::cuda::std::__type_index_c<sizeof...(Levels) - 1, Levels...>>;
@@ -776,6 +788,13 @@ public:
}
}
}

# ifndef _CCCL_DOXYGEN_INVOKED // Do not document
constexpr hierarchy_dimensions_fragment combine([[maybe_unused]] __empty_hierarchy __empty) const
{
return *this;
}
# endif // _CCCL_DOXYGEN_INVOKED
};

/**
170 changes: 135 additions & 35 deletions cudax/include/cuda/experimental/__launch/configuration.cuh
@@ -81,41 +81,15 @@ _CCCL_DEVICE auto& find_option_in_tuple(const ::cuda::std::tuple<Options...>& tu
return ::cuda::std::apply(find_option_in_tuple_impl<Kind>(), tuple);
}

template <typename _Option, typename... _OptionsList>
inline constexpr bool __option_present_in_list = ((_Option::kind == _OptionsList::kind) || ...);

template <typename...>
inline constexpr bool no_duplicate_options = true;

template <typename Option, typename... Rest>
inline constexpr bool no_duplicate_options<Option, Rest...> =
((Option::kind != Rest::kind) && ...) && no_duplicate_options<Rest...>;

template <typename... Prev>
_CCCL_NODISCARD constexpr auto process_config_args(const ::cuda::std::tuple<Prev...>& previous)
{
return kernel_config(::cuda::std::apply(make_hierarchy_fragment<void, const Prev&...>, previous));
}

template <typename... Prev, typename Arg, typename... Rest>
_CCCL_NODISCARD constexpr auto
process_config_args(const ::cuda::std::tuple<Prev...>& previous, const Arg& arg, const Rest&... rest)
{
if constexpr (::cuda::std::is_base_of_v<detail::launch_option, Arg>)
{
static_assert((::cuda::std::is_base_of_v<detail::launch_option, Rest> && ...),
"Hierarchy levels and launch options can't be mixed");
if constexpr (sizeof...(Prev) == 0)
{
return kernel_config(uninit_t{}, arg, rest...);
}
else
{
return kernel_config(::cuda::std::apply(make_hierarchy_fragment<void, const Prev&...>, previous), arg, rest...);
}
}
else
{
return process_config_args(::cuda::std::tuple_cat(previous, ::cuda::std::make_tuple(arg)), rest...);
}
}
!__option_present_in_list<Option, Rest...> && no_duplicate_options<Rest...>;

} // namespace detail

@@ -340,14 +314,51 @@ private:
}
};

template <typename... _OptionsToFilter>
struct __filter_options
{
template <bool _Pred, typename _Option>
_CCCL_NODISCARD auto __option_or_empty(const _Option& __option)
{
if constexpr (_Pred)
{
return ::cuda::std::tuple(__option);
}
else
{
return ::cuda::std::tuple();
}
}

template <typename... _Options>
_CCCL_NODISCARD auto operator()(const _Options&... __options)
{
return ::cuda::std::tuple_cat(
__option_or_empty<!detail::__option_present_in_list<_Options, _OptionsToFilter...>>(__options)...);
}
};
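A small sketch of how this filter behaves (illustration only, not part of this commit; assumes the `cudax` namespace alias from the tests): options whose `kind` already appears in `_OptionsToFilter` are dropped, everything else is forwarded in a tuple.

#include <cuda/experimental/launch.cuh> // assumed public header

namespace cudax = cuda::experimental;

void filter_options_sketch()
{
  using priority_option = decltype(cudax::launch_priority(0));
  // launch_priority(42) shares its kind with the filtered option type and is dropped;
  // cooperative_launch has a different kind and survives.
  auto filtered =
    cudax::__filter_options<priority_option>{}(cudax::launch_priority(42), cudax::cooperative_launch());
  static_assert(cuda::std::tuple_size_v<decltype(filtered)> == 1);
}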

template <typename _Dimensions, typename... _Options>
auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts);

template <typename _T>
inline constexpr bool __is_kernel_config = false;

template <typename _Dimensions, typename... _Options>
inline constexpr bool __is_kernel_config<kernel_config<_Dimensions, _Options...>> = true;

template <typename _Tp>
_CCCL_CONCEPT __kernel_has_default_config =
_CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)(requires(__is_kernel_config<decltype(__t.default_config())>));

/**
* @brief Type describing a kernel launch configuration
*
* This type should not be constructed directly; use the make_config helper function instead
*
* @tparam Dimensions
* cuda::experimental::hierarchy_dimensions instance that describes dimensions of thread hierarchy in this configuration
* object
* cuda::experimental::hierarchy_dimensions instance that describes dimensions of thread hierarchy in this
* configuration object
*
* @tparam Options
* Types of options that were added to this configuration object
@@ -358,7 +369,7 @@ struct kernel_config
Dimensions dims;
::cuda::std::tuple<Options...> options;

static_assert(::cuda::std::_Or<std::true_type, ::cuda::std::is_base_of<detail::launch_option, Options>...>::value);
static_assert(::cuda::std::_And<::cuda::std::is_base_of<detail::launch_option, Options>...>::value);
static_assert(detail::no_duplicate_options<Options...>);

constexpr kernel_config(const Dimensions& dims, const Options&... opts)
@@ -383,6 +394,54 @@ struct kernel_config
return kernel_config<Dimensions, Options..., NewOptions...>(
dims, ::cuda::std::tuple_cat(options, ::cuda::std::make_tuple(new_options...)));
}

/**
* @brief Combine this configuration with another configuration object
*
* Returns a new `kernel_config` that is a combination of this configuration and the configuration from the argument.
* It contains dimensions that are a combination of the dimensions in this object and the other configuration. The
* resulting hierarchy holds the levels present in both hierarchies. In case of overlapping levels, the hierarchy from
* this configuration is prioritized, so the result always holds all levels from this hierarchy and the non-overlapping
* levels from the other hierarchy. This behavior is the same as the `combine()` member function of the hierarchy type.
* The result also contains the configuration options from both configurations. If the same type of configuration
* option is present in both configurations, the option from this configuration is copied into the resulting configuration.
*
* @param __other_config
* Other configuration to combine with this configuration
*/
template <typename _OtherDimensions, typename... _OtherOptions>
_CCCL_NODISCARD auto combine(const kernel_config<_OtherDimensions, _OtherOptions...>& __other_config) const
{
// Can't use the fully qualified kernel_config name here because of an nvcc bug; TODO: remove __make_config_from_tuple
// once fixed
return __make_config_from_tuple(
dims.combine(__other_config.dims),
::cuda::std::tuple_cat(options, ::cuda::std::apply(__filter_options<Options...>{}, __other_config.options)));
}
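A usage sketch of the combination rules described above (illustration only, not part of this commit; assumes the `cudax` namespace alias from the tests):

#include <cuda/experimental/launch.cuh> // assumed public header
#include <cassert>

namespace cudax = cuda::experimental;

void combine_sketch()
{
  auto base  = cudax::make_config(cudax::grid_dims<2>, cudax::launch_priority(2));
  auto extra = cudax::make_config(cudax::block_dims(256), cudax::launch_priority(42));

  auto merged = base.combine(extra);
  // Levels from both hierarchies are kept: 2 blocks of 256 threads each.
  assert(merged.dims.count(cudax::thread) == 512);
  // For the overlapping launch_priority option, the one from `base` (the object combine() is called on) wins.
  assert(cuda::std::get<0>(merged.options).priority == 2);
}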

/**
* @brief Combine this configuration with default configuration of a kernel functor
*
* Returns a new `kernel_config` that is a combination of this configuration and a default configuration from the
* kernel argument. The default configuration is a `kernel_config` object returned from the `default_config()` member
* function of the kernel type. The configurations are combined using the `combine()` member function of this
* configuration. If the kernel has no default configuration, a copy of this configuration is returned without any
* changes.
*
* @param __kernel
* Kernel functor to search for the default configuration
*/
template <typename _Kernel>
_CCCL_NODISCARD auto combine_with_default(const _Kernel& __kernel) const
{
if constexpr (__kernel_has_default_config<_Kernel>)
{
return combine(__kernel.default_config());
}
else
{
return *this;
}
}
};
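A sketch of a kernel functor carrying a default configuration and how `combine_with_default` merges it with a user-supplied configuration (illustration only, not part of this commit; `my_kernel` is a hypothetical name, and the `cudax` alias matches the tests):

#include <cuda/experimental/launch.cuh> // assumed public header

namespace cudax = cuda::experimental;

// Hypothetical kernel functor providing a default configuration.
struct my_kernel
{
  // Detected through __kernel_has_default_config and picked up by combine_with_default().
  auto default_config() const
  {
    return cudax::make_config(cudax::block_dims<256>(), cudax::launch_priority(1));
  }

  template <typename Config>
  __device__ void operator()([[maybe_unused]] Config conf) const
  {
    // kernel body
  }
};

void combine_with_default_sketch()
{
  // The caller only picks the grid size; the block size and priority come from the kernel's default configuration.
  [[maybe_unused]] auto conf = cudax::make_config(cudax::grid_dims(128)).combine_with_default(my_kernel{});
  // conf now holds both the grid and block levels plus launch_priority(1).
}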

// We can consider removing the operator&, but it's convenient for in-line construction
@@ -407,6 +466,12 @@ operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims
return kernel_config(make_hierarchy_fragment(l1, l2));
}

template <typename _Dimensions, typename... _Options>
auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts)
{
return kernel_config(__dims, __opts);
}

template <typename Dimensions,
typename... Options,
typename Option,
@@ -470,11 +535,46 @@ constexpr auto distribute(int numElements) noexcept
return make_config(make_hierarchy(grid_dims(blocksPerGrid), block_dims<_ThreadsPerBlock>()));
}

template <typename... Prev>
_CCCL_NODISCARD constexpr auto __process_config_args(const ::cuda::std::tuple<Prev...>& previous)
{
if constexpr (sizeof...(Prev) == 0)
{
return kernel_config<__empty_hierarchy>(__empty_hierarchy());
}
else
{
return kernel_config(::cuda::std::apply(make_hierarchy_fragment<void, const Prev&...>, previous));
}
}

template <typename... Prev, typename Arg, typename... Rest>
_CCCL_NODISCARD constexpr auto
__process_config_args(const ::cuda::std::tuple<Prev...>& previous, const Arg& arg, const Rest&... rest)
{
if constexpr (::cuda::std::is_base_of_v<detail::launch_option, Arg>)
{
static_assert((::cuda::std::is_base_of_v<detail::launch_option, Rest> && ...),
"Hierarchy levels and launch options can't be mixed");
if constexpr (sizeof...(Prev) == 0)
{
return kernel_config(__empty_hierarchy(), arg, rest...);
}
else
{
return kernel_config(::cuda::std::apply(make_hierarchy_fragment<void, const Prev&...>, previous), arg, rest...);
}
}
else
{
return __process_config_args(::cuda::std::tuple_cat(previous, ::cuda::std::make_tuple(arg)), rest...);
}
}

template <typename... Args>
_CCCL_NODISCARD constexpr auto make_config(const Args&... args)
{
static_assert(sizeof...(Args) != 0, "Configuration can't be empty");
return detail::process_config_args(::cuda::std::make_tuple(), args...);
return __process_config_args(::cuda::std::make_tuple(), args...);
}
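A short sketch of the argument processing above (illustration only, not part of this commit; assumes the `cudax` alias from the tests): hierarchy levels are accumulated until the first launch option appears, and an options-only call falls back to the `__empty_hierarchy` placeholder.

#include <cuda/experimental/launch.cuh> // assumed public header

namespace cudax = cuda::experimental;

void make_config_sketch()
{
  // Levels come first, then options; interleaving them the other way round trips the static_assert above.
  [[maybe_unused]] auto full =
    cudax::make_config(cudax::grid_dims<2>, cudax::block_dims(256), cudax::launch_priority(2));

  // Options only: the dims member becomes the __empty_hierarchy placeholder.
  [[maybe_unused]] auto options_only = cudax::make_config(cudax::cooperative_launch());
  static_assert(cuda::std::is_same_v<decltype(options_only.dims), cudax::__empty_hierarchy>);
}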

namespace detail
9 changes: 5 additions & 4 deletions cudax/include/cuda/experimental/__launch/launch.cuh
@@ -125,14 +125,15 @@ void launch(
{
__ensure_current_device __dev_setter(stream);
cudaError_t status;
auto combined = conf.combine_with_default(kernel);
if constexpr (::cuda::std::is_invocable_v<Kernel, kernel_config<Dimensions, Config...>, as_kernel_arg_t<Args>...>)
{
auto launcher = detail::kernel_launcher<kernel_config<Dimensions, Config...>, Kernel, as_kernel_arg_t<Args>...>;
auto launcher = detail::kernel_launcher<decltype(combined), Kernel, as_kernel_arg_t<Args>...>;
status = detail::launch_impl(
stream,
conf,
combined,
launcher,
conf,
combined,
kernel,
static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, std::forward<Args>(args)))...);
}
Expand All @@ -142,7 +143,7 @@ void launch(
auto launcher = detail::kernel_launcher_no_config<Kernel, as_kernel_arg_t<Args>...>;
status = detail::launch_impl(
stream,
conf,
combined,
launcher,
kernel,
static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, std::forward<Args>(args)))...);
41 changes: 40 additions & 1 deletion cudax/test/launch/configuration.cu
@@ -201,5 +201,44 @@ TEST_CASE("Hierarchy construction in config", "[launch]")
CUDAX_REQUIRE(config_no_options.dims.count(cudax::thread) == 256);

[[maybe_unused]] auto config_no_dims = cudax::make_config(cudax::cooperative_launch());
static_assert(cuda::std::is_same_v<decltype(config_no_dims.dims), cudax::uninit_t>);
static_assert(cuda::std::is_same_v<decltype(config_no_dims.dims), cudax::__empty_hierarchy>);
}

TEST_CASE("Configuration combine", "[launch]")
{
auto grid = cudax::grid_dims<2>;
auto cluster = cudax::cluster_dims<2, 2>;
auto block = cudax::block_dims(256);
SECTION("Combine with no overlap")
{
auto config_part1 = make_config(grid);
auto config_part2 = make_config(block, cudax::launch_priority(2));
auto combined = config_part1.combine(config_part2);
[[maybe_unused]] auto combined_other_way = config_part2.combine(config_part1);
[[maybe_unused]] auto combined_with_empty = combined.combine(cudax::make_config());
[[maybe_unused]] auto empty_with_combined = cudax::make_config().combine(combined);
static_assert(
cuda::std::is_same_v<decltype(combined), decltype(make_config(grid, block, cudax::launch_priority(2)))>);
static_assert(cuda::std::is_same_v<decltype(combined), decltype(combined_other_way)>);
static_assert(cuda::std::is_same_v<decltype(combined), decltype(combined_with_empty)>);
static_assert(cuda::std::is_same_v<decltype(combined), decltype(empty_with_combined)>);
CUDAX_REQUIRE(combined.dims.count(cudax::thread) == 512);
}
SECTION("Combine with overlap")
{
auto config_part1 = make_config(grid, cluster, cudax::launch_priority(2));
auto config_part2 = make_config(cudax::cluster_dims<256>, block, cudax::launch_priority(42));
auto combined = config_part1.combine(config_part2);
CUDAX_REQUIRE(combined.dims.count(cudax::thread) == 2048);
CUDAX_REQUIRE(cuda::std::get<0>(combined.options).priority == 2);

auto replaced_one_option = cudax::make_config(cudax::launch_priority(3)).combine(combined);
CUDAX_REQUIRE(replaced_one_option.dims.count(cudax::thread) == 2048);
CUDAX_REQUIRE(cuda::std::get<0>(replaced_one_option.options).priority == 3);

[[maybe_unused]] auto combined_with_extra_option =
combined.combine(cudax::make_config(cudax::cooperative_launch()));
static_assert(cuda::std::is_same_v<decltype(combined.dims), decltype(combined_with_extra_option.dims)>);
static_assert(cuda::std::tuple_size_v<decltype(combined_with_extra_option.options)> == 2);
}
}
