diff --git a/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh b/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh
index dab60982e51..ebd73b33360 100644
--- a/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh
+++ b/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh
@@ -339,6 +339,18 @@ struct rank_helper
 };
 } // namespace detail
 
+// Artificial empty hierarchy that allows the config type to be empty;
+// this seems easier than checking in every hierarchy API whether the hierarchy is empty.
+// Any usage of an empty hierarchy other than combine should lead to an error anyway.
+struct __empty_hierarchy
+{
+  template <typename _Other>
+  _CCCL_NODISCARD _Other combine(const _Other& __other) const
+  {
+    return __other;
+  }
+};
+
 /**
  * @brief Type representing a hierarchy of CUDA threads
  *
@@ -731,7 +743,7 @@ public:
   //!
   //! @return Hierarchy holding the combined levels from both hierarchies
   template <typename OtherUnit, typename... OtherLevels>
-  constexpr auto combine(const hierarchy_dimensions_fragment<OtherUnit, OtherLevels...>& other)
+  constexpr auto combine(const hierarchy_dimensions_fragment<OtherUnit, OtherLevels...>& other) const
   {
     using this_top_level    = __level_type_of<::cuda::std::__type_index_c<0, Levels...>>;
     using this_bottom_level = __level_type_of<::cuda::std::__type_index_c<sizeof...(Levels) - 1, Levels...>>;
@@ -776,6 +788,13 @@ public:
       }
     }
   }
+
+#  ifndef _CCCL_DOXYGEN_INVOKED // Do not document
+  constexpr hierarchy_dimensions_fragment combine([[maybe_unused]] __empty_hierarchy __empty) const
+  {
+    return *this;
+  }
+#  endif // _CCCL_DOXYGEN_INVOKED
 };
 
 /**
diff --git a/cudax/include/cuda/experimental/__launch/configuration.cuh b/cudax/include/cuda/experimental/__launch/configuration.cuh
index 2a1b3f2cb0a..546c8a228a4 100644
--- a/cudax/include/cuda/experimental/__launch/configuration.cuh
+++ b/cudax/include/cuda/experimental/__launch/configuration.cuh
@@ -81,41 +81,15 @@ _CCCL_DEVICE auto& find_option_in_tuple(const ::cuda::std::tuple<Options...>& tu
   return ::cuda::std::apply(find_option_in_tuple_impl<Option>(), tuple);
 }
 
+template <typename _Option, typename... _OptionsList>
+inline constexpr bool __option_present_in_list = ((_Option::kind == _OptionsList::kind) || ...);
+
 template <typename...>
 inline constexpr bool no_duplicate_options = true;
 
 template <typename Option, typename... Rest>
 inline constexpr bool no_duplicate_options<Option, Rest...> =
-  ((Option::kind != Rest::kind) && ...) && no_duplicate_options<Rest...>;
-
-template <typename... Prev>
-_CCCL_NODISCARD constexpr auto process_config_args(const ::cuda::std::tuple<Prev...>& previous)
-{
-  return kernel_config(::cuda::std::apply(make_hierarchy_fragment, previous));
-}
-
-template <typename... Prev, typename Arg, typename... Rest>
-_CCCL_NODISCARD constexpr auto
-process_config_args(const ::cuda::std::tuple<Prev...>& previous, const Arg& arg, const Rest&... rest)
-{
-  if constexpr (::cuda::std::is_base_of_v<launch_option, Arg>)
-  {
-    static_assert((::cuda::std::is_base_of_v<launch_option, Rest> && ...),
-                  "Hierarchy levels and launch options can't be mixed");
-    if constexpr (sizeof...(Prev) == 0)
-    {
-      return kernel_config(uninit_t{}, arg, rest...);
-    }
-    else
-    {
-      return kernel_config(::cuda::std::apply(make_hierarchy_fragment, previous), arg, rest...);
-    }
-  }
-  else
-  {
-    return process_config_args(::cuda::std::tuple_cat(previous, ::cuda::std::make_tuple(arg)), rest...);
-  }
-}
+  !__option_present_in_list<Option, Rest...> && no_duplicate_options<Rest...>;
 
 } // namespace detail
 
@@ -340,14 +314,51 @@ private:
   }
 };
 
+template <typename... _OptionsToFilter>
+struct __filter_options
+{
+  template <bool _Pred, typename _Option>
+  _CCCL_NODISCARD auto __option_or_empty(const _Option& __option)
+  {
+    if constexpr (_Pred)
+    {
+      return ::cuda::std::tuple(__option);
+    }
+    else
+    {
+      return ::cuda::std::tuple();
+    }
+  }
+
+  template <typename... _Options>
+  _CCCL_NODISCARD auto operator()(const _Options&... __options)
+  {
+    return ::cuda::std::tuple_cat(
+      __option_or_empty<!detail::__option_present_in_list<_Options, _OptionsToFilter...>>(__options)...);
+  }
+};
+
+template <typename _Dimensions, typename... _Options>
+auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts);
+
+template <typename _Tp>
+inline constexpr bool __is_kernel_config = false;
+
+template <typename _Dimensions, typename... _Options>
+inline constexpr bool __is_kernel_config<kernel_config<_Dimensions, _Options...>> = true;
+
+template <typename _Tp>
+_CCCL_CONCEPT __kernel_has_default_config =
+  _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)(requires(__is_kernel_config<decltype(__t.default_config())>));
+
 /**
  * @brief Type describing a kernel launch configuration
  *
  * This type should not be constructed directly and make_config helper function should be used instead
  *
  * @tparam Dimensions
- *   cuda::experimetnal::hierarchy_dimensions instance that describes dimensions of thread hierarchy in this configuration
- *   object
+ *   cuda::experimental::hierarchy_dimensions instance that describes dimensions of thread hierarchy in this
+ *   configuration object
  *
  * @tparam Options
  *   Types of options that were added to this configuration object
@@ -358,7 +369,7 @@ struct kernel_config
   Dimensions dims;
   ::cuda::std::tuple<Options...> options;
 
-  static_assert(::cuda::std::_Or...>::value);
+  static_assert(::cuda::std::_And<::cuda::std::is_base_of<detail::launch_option, Options>...>::value);
   static_assert(detail::no_duplicate_options<Options...>);
 
   constexpr kernel_config(const Dimensions& dims, const Options&... opts)
@@ -383,6 +394,54 @@ struct kernel_config
     return kernel_config<Dimensions, Options..., NewOptions...>(
       dims, ::cuda::std::tuple_cat(options, ::cuda::std::make_tuple(new_options...)));
   }
+
+  /**
+   * @brief Combine this configuration with another configuration object
+   *
+   * Returns a new `kernel_config` that is a combination of this configuration and the configuration passed as the
+   * argument. Its dimensions are a combination of the dimensions in this object and in the other configuration. The
+   * resulting hierarchy holds the levels present in both hierarchies. When levels overlap, the hierarchy from this
+   * configuration takes precedence, so the result always holds all levels from this hierarchy and the
+   * non-overlapping levels from the other hierarchy. This matches the behavior of the hierarchy type's `combine()`
+   * member function. The result also contains the configuration options from both configurations. If the same option
+   * type is present in both configurations, the option from this configuration is copied into the result.
+   *
+   * @param __other_config
+   *   Other configuration to combine with this configuration
+   */
+  template <typename _OtherDimensions, typename... _OtherOptions>
+  _CCCL_NODISCARD auto combine(const kernel_config<_OtherDimensions, _OtherOptions...>& __other_config) const
+  {
+    // Can't use the fully qualified kernel_config name here because of an nvcc bug;
+    // TODO: remove __make_config_from_tuple once it is fixed
+    return __make_config_from_tuple(
+      dims.combine(__other_config.dims),
+      ::cuda::std::tuple_cat(options, ::cuda::std::apply(__filter_options<Options...>{}, __other_config.options)));
+  }
+
+  /**
+   * @brief Combine this configuration with the default configuration of a kernel functor
+   *
+   * Returns a new `kernel_config` that is a combination of this configuration and the default configuration of the
+   * kernel argument. The default configuration is the `kernel_config` object returned from the `default_config()`
+   * member function of the kernel type. The configurations are combined using the `combine()` member function of
+   * this configuration. If the kernel has no default configuration, a copy of this configuration is returned unchanged.
+   *
+   * @param __kernel
+   *   Kernel functor to search for the default configuration
+   */
+  template <typename _Kernel>
+  _CCCL_NODISCARD auto combine_with_default(const _Kernel& __kernel) const
+  {
+    if constexpr (__kernel_has_default_config<_Kernel>)
+    {
+      return combine(__kernel.default_config());
+    }
+    else
+    {
+      return *this;
+    }
+  }
 };
 
 // We can consider removing the operator&, but its convenient for in-line construction
@@ -407,6 +466,12 @@ operator&(const level_dimensions& l1, const level_dimensions
+template <typename _Dimensions, typename... _Options>
+auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts)
+{
+  return kernel_config(__dims, __opts);
+}
+
 template
 ()));
 }
+template <typename... Prev>
+_CCCL_NODISCARD constexpr auto __process_config_args(const ::cuda::std::tuple<Prev...>& previous)
+{
+  if constexpr (sizeof...(Prev) == 0)
+  {
+    return kernel_config<__empty_hierarchy>(__empty_hierarchy());
+  }
+  else
+  {
+    return kernel_config(::cuda::std::apply(make_hierarchy_fragment, previous));
+  }
+}
+
+template <typename... Prev, typename Arg, typename... Rest>
+_CCCL_NODISCARD constexpr auto
+__process_config_args(const ::cuda::std::tuple<Prev...>& previous, const Arg& arg, const Rest&... rest)
+{
+  if constexpr (::cuda::std::is_base_of_v<detail::launch_option, Arg>)
+  {
+    static_assert((::cuda::std::is_base_of_v<detail::launch_option, Rest> && ...),
+                  "Hierarchy levels and launch options can't be mixed");
+    if constexpr (sizeof...(Prev) == 0)
+    {
+      return kernel_config(__empty_hierarchy(), arg, rest...);
+    }
+    else
+    {
+      return kernel_config(::cuda::std::apply(make_hierarchy_fragment, previous), arg, rest...);
+    }
+  }
+  else
+  {
+    return __process_config_args(::cuda::std::tuple_cat(previous, ::cuda::std::make_tuple(arg)), rest...);
+  }
+}
+
 template <typename... Args>
 _CCCL_NODISCARD constexpr auto make_config(const Args&... args)
 {
-  static_assert(sizeof...(Args) != 0, "Configuration can't be empty");
-  return detail::process_config_args(::cuda::std::make_tuple(), args...);
+  return __process_config_args(::cuda::std::make_tuple(), args...);
 }
 
 namespace detail
diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh
index 689645a7919..612db35000d 100644
--- a/cudax/include/cuda/experimental/__launch/launch.cuh
+++ b/cudax/include/cuda/experimental/__launch/launch.cuh
@@ -125,14 +125,15 @@ void launch(
 {
   __ensure_current_device __dev_setter(stream);
   cudaError_t status;
+  auto combined = conf.combine_with_default(kernel);
   if constexpr (::cuda::std::is_invocable_v<Kernel, kernel_config<Dimensions, Config...>, as_kernel_arg_t<Args>...>)
   {
-    auto launcher = detail::kernel_launcher<kernel_config<Dimensions, Config...>, Kernel, as_kernel_arg_t<Args>...>;
+    auto launcher = detail::kernel_launcher<decltype(combined), Kernel, as_kernel_arg_t<Args>...>;
     status = detail::launch_impl(
       stream,
-      conf,
+      combined,
       launcher,
-      conf,
+      combined,
       kernel,
       static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, std::forward<Args>(args)))...);
   }
@@ -142,7 +143,7 @@ void launch(
     auto launcher = detail::kernel_launcher_no_config<Kernel, as_kernel_arg_t<Args>...>;
     status = detail::launch_impl(
       stream,
-      conf,
+      combined,
       launcher,
       kernel,
       static_cast<as_kernel_arg_t<Args>>(detail::__launch_transform(stream, std::forward<Args>(args)))...);
diff --git a/cudax/test/launch/configuration.cu b/cudax/test/launch/configuration.cu
index 61a0288bfc4..b69d3aa2527 100644
--- a/cudax/test/launch/configuration.cu
+++ b/cudax/test/launch/configuration.cu
@@ -201,5 +201,44 @@ TEST_CASE("Hierarchy construction in config", "[launch]")
   CUDAX_REQUIRE(config_no_options.dims.count(cudax::thread) == 256);
 
   [[maybe_unused]] auto config_no_dims = cudax::make_config(cudax::cooperative_launch());
-  static_assert(cuda::std::is_same_v);
+  static_assert(cuda::std::is_same_v<decltype(config_no_dims.dims), cudax::__empty_hierarchy>);
 }
+
+TEST_CASE("Configuration combine", "[launch]")
+{
+  auto grid    = cudax::grid_dims<2>;
+  auto cluster = cudax::cluster_dims<2, 2>;
+  auto block   = cudax::block_dims(256);
+  SECTION("Combine with no overlap")
+  {
+    auto config_part1 = make_config(grid);
+    auto config_part2 = make_config(block, cudax::launch_priority(2));
+    auto combined     = config_part1.combine(config_part2);
+    [[maybe_unused]] auto combined_other_way  = config_part2.combine(config_part1);
+    [[maybe_unused]] auto combined_with_empty = combined.combine(cudax::make_config());
+    [[maybe_unused]] auto empty_with_combined = cudax::make_config().combine(combined);
+    static_assert(
+      cuda::std::is_same_v);
+    static_assert(cuda::std::is_same_v);
+    static_assert(cuda::std::is_same_v);
+    static_assert(cuda::std::is_same_v);
+    CUDAX_REQUIRE(combined.dims.count(cudax::thread) == 512);
+  }
+  SECTION("Combine with overlap")
+  {
+    auto config_part1 = make_config(grid, cluster, cudax::launch_priority(2));
+    auto config_part2 = make_config(cudax::cluster_dims<256>, block, cudax::launch_priority(42));
+    auto combined     = config_part1.combine(config_part2);
+    CUDAX_REQUIRE(combined.dims.count(cudax::thread) == 2048);
+    CUDAX_REQUIRE(cuda::std::get<0>(combined.options).priority == 2);
+
+    auto replaced_one_option = cudax::make_config(cudax::launch_priority(3)).combine(combined);
+    CUDAX_REQUIRE(replaced_one_option.dims.count(cudax::thread) == 2048);
+    CUDAX_REQUIRE(cuda::std::get<0>(replaced_one_option.options).priority == 3);
+
+    [[maybe_unused]] auto combined_with_extra_option =
+      combined.combine(cudax::make_config(cudax::cooperative_launch()));
+    static_assert(cuda::std::is_same_v);
+    static_assert(cuda::std::tuple_size_v == 2);
+  }
+}
diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu
index eef75c0cad4..fb19321b772 100644
--- a/cudax/test/launch/launch_smoke.cu
+++ b/cudax/test/launch/launch_smoke.cu
@@ -10,7 +10,9 @@
 #include
 #include
+#include <cooperative_groups.h>
+#include
 #include
 
 __managed__ bool kernel_run_proof = false;
@@ -247,3 +249,64 @@ TEST_CASE("Smoke", "[launch]")
 {
   launch_smoke_test();
 }
+
+template <typename DefaultConfig>
+struct kernel_with_default_config
+{
+  DefaultConfig config;
+
+  kernel_with_default_config(DefaultConfig c)
+      : config(c)
+  {}
+
+  DefaultConfig default_config() const
+  {
+    return config;
+  }
+
+  template <typename Config, typename ConfigCheckFn>
+  __device__ void operator()(Config config, ConfigCheckFn check_fn)
+  {
+    check_fn(config);
+  }
+};
+
+void test_default_config()
+{
+  cudax::stream stream;
+  auto grid  = cudax::grid_dims(4);
+  auto block = cudax::block_dims<256>;
+
+  auto verify_lambda = [] __device__(auto config) {
+    static_assert(config.dims.count(cudax::thread, cudax::block) == 256);
+    CUDAX_REQUIRE(config.dims.count(cudax::block) == 4);
+    cooperative_groups::this_grid().sync();
+  };
+
+  SECTION("Combine with empty")
+  {
+    kernel_with_default_config kernel{cudax::make_config(block, grid, cudax::cooperative_launch())};
+    static_assert(cudax::__is_kernel_config<decltype(kernel.default_config())>);
+    static_assert(cudax::__kernel_has_default_config<decltype(kernel)>);
+
+    cudax::launch(stream, cudax::make_config(), kernel, verify_lambda);
+    stream.wait();
+  }
+  SECTION("Combine with no overlap")
+  {
+    kernel_with_default_config kernel{cudax::make_config(block)};
+    cudax::launch(stream, cudax::make_config(grid, cudax::cooperative_launch()), kernel, verify_lambda);
+    stream.wait();
+  }
+  SECTION("Combine with overlap")
+  {
+    kernel_with_default_config kernel{cudax::make_config(cudax::block_dims<1>, cudax::cooperative_launch())};
+    cudax::launch(stream, cudax::make_config(block, grid, cudax::cooperative_launch()), kernel, verify_lambda);
+    stream.wait();
+  }
+}
+
config") +{ + test_default_config(); +}