From 6bb30c623ba52ad90ac16e710e2e3271b14bd450 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Sun, 26 Jan 2025 11:36:05 -0800
Subject: [PATCH] adds benchmarks for merge keys

---
 cub/benchmarks/bench/merge/keys.cu          | 165 ++++++++++++++++++++
 cub/benchmarks/bench/merge/merge_common.cuh |  63 ++++++++
 cub/benchmarks/bench/merge/pairs.cu         | 107 +++----------
 3 files changed, 249 insertions(+), 86 deletions(-)
 create mode 100644 cub/benchmarks/bench/merge/keys.cu
 create mode 100644 cub/benchmarks/bench/merge/merge_common.cuh

diff --git a/cub/benchmarks/bench/merge/keys.cu b/cub/benchmarks/bench/merge/keys.cu
new file mode 100644
index 00000000000..e8d9bd7a5a4
--- /dev/null
+++ b/cub/benchmarks/bench/merge/keys.cu
@@ -0,0 +1,165 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <cub/device/dispatch/dispatch_merge.cuh>
+
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/tabulate_output_iterator.h>
+#include <thrust/sort.h>
+
+#include <cuda/std/utility>
+
+#include <cstdint>
+
+#include "merge_common.cuh"
+#include <nvbench_helper.cuh>
+
+// %RANGE% TUNE_TRANSPOSE trp 0:1:1
+// %RANGE% TUNE_LOAD ld 0:2:1
+// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
+// %RANGE% TUNE_THREADS_PER_BLOCK_POW2 tpb 6:10:1
+
+template <typename KeyT, typename OffsetT>
+void keys(nvbench::state& state, nvbench::type_list<KeyT, OffsetT>)
+{
+  using key_t            = KeyT;
+  using value_t          = cub::NullType;
+  using key_input_it_t   = key_t*;
+  using value_input_it_t = value_t*;
+  using key_it_t         = key_t*;
+  using value_it_t       = value_t*;
+  using offset_t         = OffsetT;
+  using compare_op_t     = less_t;
+
+#if !TUNE_BASE
+  using policy_t   = policy_hub_t<key_t>;
+  using dispatch_t = cub::detail::merge::dispatch_t<
+    key_input_it_t,
+    value_input_it_t,
+    key_input_it_t,
+    value_input_it_t,
+    key_it_t,
+    value_it_t,
+    offset_t,
+    compare_op_t,
+    policy_t>;
+#else // TUNE_BASE
+  using dispatch_t = cub::detail::merge::dispatch_t<
+    key_input_it_t,
+    value_input_it_t,
+    key_input_it_t,
+    value_input_it_t,
+    key_it_t,
+    value_it_t,
+    offset_t,
+    compare_op_t>;
+#endif // TUNE_BASE
+
+  // Retrieve axis parameters
+  const auto elements       = static_cast<offset_t>(state.get_int64("Elements{io}"));
+  const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));
+
+  // We generate data distributions in the range [0, 255] that, with lower entropy, get skewed towards 0.
+  // We use this to generate increasingly large *consecutive* segments of data that are getting selected from the lhs
+  thrust::device_vector<uint8_t> rnd_selector_val = generate(elements, entropy);
+  uint8_t threshold = 128;
+  select_if_less_than_t select_lhs_op{false, threshold};
+  select_if_less_than_t select_rhs_op{true, threshold};
+
+  // The following algorithm only works under the precondition that there's at least 50% of the data in the lhs.
+  // If that's not the case, we simply swap the logic for selecting into lhs and rhs.
+  const auto num_items_selected_into_lhs =
+    static_cast<offset_t>(thrust::count_if(rnd_selector_val.begin(), rnd_selector_val.end(), select_lhs_op));
+  if (num_items_selected_into_lhs < elements / 2)
+  {
+    using ::cuda::std::swap;
+    swap(select_lhs_op, select_rhs_op);
+  }
+
+  // We want lhs and rhs to be of equal size. We also want to have skewed distributions, such that we put different
+  // workloads on the binary search part. For this reason, we identify the index from the input, referred to as pivot
+  // point, after which the lhs is "full". We compose the rhs by selecting all items up to the pivot point that were
+  // not selected for lhs and *all* items after the pivot point.
+  constexpr std::size_t num_pivot_points = 1;
+  thrust::device_vector<offset_t> pivot_point(num_pivot_points);
+  const auto num_items_lhs = elements / 2;
+  const auto num_items_rhs = elements - num_items_lhs;
+  auto counting_it         = thrust::make_counting_iterator(offset_t{0});
+  thrust::copy_if(
+    counting_it,
+    counting_it + elements,
+    rnd_selector_val.begin(),
+    thrust::make_tabulate_output_iterator(write_pivot_point_t<offset_t>{
+      static_cast<offset_t>(num_items_lhs), thrust::raw_pointer_cast(pivot_point.data())}),
+    select_lhs_op);
+
+  thrust::device_vector<key_t> keys_lhs(num_items_lhs);
+  thrust::device_vector<key_t> keys_rhs(num_items_rhs);
+  thrust::device_vector<key_t> keys_out(elements);
+
+  // Generate increasing input range to sample from
+  thrust::device_vector<key_t> increasing_input = generate(elements);
+  thrust::sort(increasing_input.begin(), increasing_input.end());
+
+  // Select lhs from input up to pivot point
+  offset_t pivot_point_val = pivot_point[0];
+  thrust::copy_if(
+    increasing_input.cbegin(),
+    increasing_input.cbegin() + pivot_point_val,
+    rnd_selector_val.cbegin(),
+    keys_lhs.begin(),
+    select_lhs_op);
+
+  // Select rhs items from input up to pivot point
+  auto const end_rhs = thrust::copy_if(
+    increasing_input.cbegin(),
+    increasing_input.cbegin() + pivot_point_val,
+    rnd_selector_val.cbegin(),
+    keys_rhs.begin(),
+    select_rhs_op);
+  // From pivot point copy all remaining items to rhs
+  thrust::copy(increasing_input.cbegin() + pivot_point_val, increasing_input.cbegin() + elements, end_rhs);
+
+  key_t* d_keys_lhs = thrust::raw_pointer_cast(keys_lhs.data());
+  key_t* d_keys_rhs = thrust::raw_pointer_cast(keys_rhs.data());
+  key_t* d_keys_out = thrust::raw_pointer_cast(keys_out.data());
+
+  // Enable throughput calculations and add "Size" column to results.
+  state.add_element_count(elements);
+  state.add_global_memory_reads<key_t>(elements);
+  state.add_global_memory_writes<key_t>(elements);
+
+  // Allocate temporary storage:
+  std::size_t temp_size{};
+  dispatch_t::dispatch(
+    nullptr,
+    temp_size,
+    d_keys_lhs,
+    nullptr,
+    num_items_lhs,
+    d_keys_rhs,
+    nullptr,
+    num_items_rhs,
+    d_keys_out,
+    nullptr,
+    compare_op_t{},
+    cudaStream_t{});
+
+  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
+  auto* temp_storage = thrust::raw_pointer_cast(temp.data());
+
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
+    dispatch_t::dispatch(
+      temp_storage,
+      temp_size,
+      d_keys_lhs,
+      nullptr,
+      num_items_lhs,
+      d_keys_rhs,
+      nullptr,
+      num_items_rhs,
+      d_keys_out,
+      nullptr,
+      compare_op_t{},
+      launch.get_stream());
+  });
+}
+
+#ifdef TUNE_KeyT
+using key_types = nvbench::type_list<TUNE_KeyT>;
+#else // !defined(TUNE_KeyT)
+using key_types = fundamental_types;
+#endif // TUNE_KeyT
+
+NVBENCH_BENCH_TYPES(keys, NVBENCH_TYPE_AXES(key_types, offset_types))
+  .set_name("base")
+  .set_type_axes_names({"KeyT{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
+  .add_string_axis("Entropy", {"1.000", "0.201"});
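To make the lhs/rhs construction above easier to follow outside the Thrust pipeline, here is a minimal host-side sketch of the same pivot-point scheme in plain C++. It is illustrative only; the helper name build_lhs_rhs, the fixed threshold of 128, and the int key type are stand-ins and not part of the patch:

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Given per-element selector values `sel` and a sorted input `in`, build the
// lhs/rhs inputs the same way the benchmark does on the device.
std::pair<std::vector<int>, std::vector<int>>
build_lhs_rhs(const std::vector<int>& in, const std::vector<std::uint8_t>& sel)
{
  const std::size_t n    = in.size();
  const std::size_t half = n / 2;
  // Plays the role of select_if_less_than_t{false, 128}.
  const auto selected = [](std::uint8_t v) { return v < 128; };

  // Pivot point: the smallest index such that exactly n/2 of the items
  // before it are selected (the benchmark records this via copy_if into a
  // tabulate_output_iterator wrapping write_pivot_point_t).
  std::size_t pivot = n;
  for (std::size_t i = 0, taken = 0; i < n; ++i)
  {
    if (selected(sel[i]) && ++taken == half)
    {
      pivot = i + 1;
      break;
    }
  }

  // Up to the pivot point, selected items go to lhs and the rest to rhs;
  // everything after the pivot point goes to rhs. This mirrors the two
  // copy_if calls and the trailing thrust::copy above.
  std::vector<int> lhs, rhs;
  for (std::size_t i = 0; i < pivot; ++i)
  {
    (selected(sel[i]) ? lhs : rhs).push_back(in[i]);
  }
  rhs.insert(rhs.end(), in.begin() + pivot, in.end());
  return {lhs, rhs};
}

Given the precondition that at least half of the selector values pass the filter, lhs receives exactly n/2 items, while the entropy of the selector distribution controls how far the pivot point sits from n/2, and thus how lopsided the key ranges of the two merge inputs are; that is what puts different workloads on the binary search part.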
diff --git a/cub/benchmarks/bench/merge/merge_common.cuh b/cub/benchmarks/bench/merge/merge_common.cuh
new file mode 100644
index 00000000000..6afbc587e6e
--- /dev/null
+++ b/cub/benchmarks/bench/merge/merge_common.cuh
@@ -0,0 +1,63 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <cstdint>
+
+#if !TUNE_BASE
+# define TUNE_THREADS_PER_BLOCK (1 << TUNE_THREADS_PER_BLOCK_POW2)
+# if TUNE_TRANSPOSE == 0
+#  define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
+#  define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT
+# else // TUNE_TRANSPOSE == 1
+#  define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
+#  define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE
+# endif // TUNE_TRANSPOSE
+
+# if TUNE_LOAD == 0
+#  define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
+# elif TUNE_LOAD == 1
+#  define TUNE_LOAD_MODIFIER cub::LOAD_LDG
+# else // TUNE_LOAD == 2
+#  define TUNE_LOAD_MODIFIER cub::LOAD_CA
+# endif // TUNE_LOAD
+
+template <typename KeyT>
+struct policy_hub_t
+{
+  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
+  {
+    using merge_policy =
+      cub::agent_policy_t<TUNE_THREADS_PER_BLOCK,
+                          static_cast<int>(TUNE_ITEMS_PER_THREAD),
+                          TUNE_LOAD_ALGORITHM,
+                          TUNE_LOAD_MODIFIER,
+                          TUNE_STORE_ALGORITHM>;
+  };
+
+  using MaxPolicy = policy_t;
+};
+#endif // TUNE_BASE
+
+struct select_if_less_than_t
+{
+  bool negate;
+  uint8_t threshold;
+
+  __device__ __forceinline__ bool operator()(uint8_t val) const
+  {
+    return negate ? !(val < threshold) : val < threshold;
+  }
+};
+
+template <typename OffsetT>
+struct write_pivot_point_t
+{
+  OffsetT threshold;
+  OffsetT* pivot_point;
+
+  __device__ void operator()(OffsetT output_index, OffsetT input_index) const
+  {
+    if (output_index == threshold)
+    {
+      *pivot_point = input_index;
+    }
+  }
+};
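For orientation: the // %RANGE% comments in keys.cu and pairs.cu declare the axes that the CUB tuning harness sweeps, and each point of that space is compiled with matching -D definitions, which the block above translates into a concrete agent policy. As a sketch, one arbitrarily chosen point (trp=1, ld=0, ipt=16, tpb=8, with TUNE_BASE left undefined) resolves to:

#define TUNE_THREADS_PER_BLOCK 256                             // 1 << 8
#define TUNE_LOAD_ALGORITHM    cub::BLOCK_LOAD_WARP_TRANSPOSE  // trp=1
#define TUNE_STORE_ALGORITHM   cub::BLOCK_STORE_WARP_TRANSPOSE // trp=1
#define TUNE_LOAD_MODIFIER     cub::LOAD_DEFAULT               // ld=0

so policy_hub_t's merge_policy becomes a 256-thread, 16-items-per-thread policy using warp-transposed block loads and stores with the default cache modifier.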
diff --git a/cub/benchmarks/bench/merge/pairs.cu b/cub/benchmarks/bench/merge/pairs.cu
index 277dd526006..a2ddcfd7a9b 100644
--- a/cub/benchmarks/bench/merge/pairs.cu
+++ b/cub/benchmarks/bench/merge/pairs.cu
@@ -5,94 +5,22 @@
 #include <cub/device/dispatch/dispatch_merge.cuh>
 
 #include <thrust/copy.h>
+#include <thrust/count.h>
 #include <thrust/device_vector.h>
+#include <thrust/sort.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/tabulate_output_iterator.h>
 
-#include "thrust/detail/raw_pointer_cast.h"
+#include "merge_common.cuh"
 #include <nvbench_helper.cuh>
 
-template <typename T>
-struct to_key_t
-{
-  template <typename IndexT>
-  __host__ __device__ __forceinline__ T operator()(IndexT index) const
-  {
-    return static_cast<T>(index);
-  }
-};
-
-struct select_if_less_than_t
-{
-  bool negate;
-  uint8_t threshold;
-
-  __host__ __device__ __forceinline__ bool operator()(uint8_t val) const
-  {
-    return negate ? !(val < threshold) : val < threshold;
-  }
-};
-
-template <typename OffsetT>
-struct write_pivot_point_t
-{
-  OffsetT threshold;
-  OffsetT* pivot_point;
-
-  __host__ __device__ __forceinline__ void operator()(OffsetT output_index, OffsetT input_index) const
-  {
-    if (output_index == threshold)
-    {
-      *pivot_point = input_index;
-    }
-  }
-};
-
 // %RANGE% TUNE_TRANSPOSE trp 0:1:1
 // %RANGE% TUNE_LOAD ld 0:2:1
 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
 // %RANGE% TUNE_THREADS_PER_BLOCK_POW2 tpb 6:10:1
 
-#ifndef TUNE_BASE
-# define TUNE_THREADS_PER_BLOCK (1 << TUNE_THREADS_PER_BLOCK_POW2)
-#endif
-
-#if !TUNE_BASE
-# if TUNE_TRANSPOSE == 0
-#  define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
-#  define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT
-# else // TUNE_TRANSPOSE == 1
-#  define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
-#  define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE
-# endif // TUNE_TRANSPOSE
-
-# if TUNE_LOAD == 0
-#  define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
-# elif TUNE_LOAD == 1
-#  define TUNE_LOAD_MODIFIER cub::LOAD_LDG
-# else // TUNE_LOAD == 2
-#  define TUNE_LOAD_MODIFIER cub::LOAD_CA
-# endif // TUNE_LOAD
-
-template <typename KeyT>
-struct policy_hub_t
-{
-  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
-  {
-    using merge_policy =
-      cub::agent_policy_t<TUNE_THREADS_PER_BLOCK,
-                          static_cast<int>(TUNE_ITEMS_PER_THREAD),
-                          TUNE_LOAD_ALGORITHM,
-                          TUNE_LOAD_MODIFIER,
-                          TUNE_STORE_ALGORITHM>;
-  };
-
-  using MaxPolicy = policy_t;
-};
-#endif // TUNE_BASE
-
 template <typename KeyT, typename ValueT, typename OffsetT>
 void pairs(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT>)
 {
@@ -118,7 +46,7 @@ void pairs(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT>)
   const auto elements       = static_cast<offset_t>(state.get_int64("Elements{io}"));
   const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));
 
-  // We generate data distributions in the range [0, 255] that, with lower entropy, get skewed towards 0
+  // We generate data distributions in the range [0, 255] that, with lower entropy, get skewed towards 0.
   // We use this to generate increasingly large *consecutive* segments of data that are getting selected from the lhs
   thrust::device_vector<uint8_t> rnd_selector_val = generate(elements, entropy);
   uint8_t threshold = 128;
@@ -137,8 +65,8 @@ void pairs(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT>)
 
   // We want lhs and rhs to be of equal size. We also want to have skewed distributions, such that we put different
   // workloads on the binary search part. For this reason, we identify the index from the input, referred to as pivot
-  // point, after which the lhs is "full". We compose the rhs by selecting all unselected items up to the pivot point
-  // and *all* items after the pivot point.
+  // point, after which the lhs is "full". We compose the rhs by selecting all items up to the pivot point that were
+  // not selected for lhs and *all* items after the pivot point.
 constexpr std::size_t num_pivot_points = 1;
 thrust::device_vector<offset_t> pivot_point(num_pivot_points);
 const auto num_items_lhs = elements / 2;
@@ -159,18 +87,28 @@ void pairs(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT>)
   thrust::device_vector<value_t> values_rhs(num_items_rhs);
   thrust::device_vector<value_t> values_out(elements);
 
-  // Fancy iterator to generate key_t in strictly increasing order
-  auto data_gen_it = thrust::make_transform_iterator(counting_it, to_key_t<key_t>{});
+  // Generate increasing input range to sample from
+  thrust::device_vector<key_t> increasing_input = generate(elements);
+  thrust::sort(increasing_input.begin(), increasing_input.end());
 
   // Select lhs from input up to pivot point
   offset_t pivot_point_val = pivot_point[0];
   auto const end_lhs = thrust::copy_if(
-    data_gen_it, data_gen_it + pivot_point_val, rnd_selector_val.cbegin(), keys_lhs.begin(), select_lhs_op);
+    increasing_input.cbegin(),
+    increasing_input.cbegin() + pivot_point_val,
+    rnd_selector_val.cbegin(),
+    keys_lhs.begin(),
+    select_lhs_op);
+
   // Select rhs items from input up to pivot point
   auto const end_rhs = thrust::copy_if(
-    data_gen_it, data_gen_it + pivot_point_val, rnd_selector_val.cbegin(), keys_rhs.begin(), select_rhs_op);
+    increasing_input.cbegin(),
+    increasing_input.cbegin() + pivot_point_val,
+    rnd_selector_val.cbegin(),
+    keys_rhs.begin(),
+    select_rhs_op);
   // From pivot point copy all remaining items to rhs
-  thrust::copy(data_gen_it + pivot_point_val, data_gen_it + elements, end_rhs);
+  thrust::copy(increasing_input.cbegin() + pivot_point_val, increasing_input.cbegin() + elements, end_rhs);
 
   key_t* d_keys_lhs = thrust::raw_pointer_cast(keys_lhs.data());
   key_t* d_keys_rhs = thrust::raw_pointer_cast(keys_rhs.data());
@@ -240,9 +178,6 @@
 using value_types = nvbench::type_list<TUNE_ValueT>;
 #endif // TUNE_ValueT
 
-// using key_types = nvbench::type_list<...>;
-// using value_types = nvbench::type_list<...>;
-
 NVBENCH_BENCH_TYPES(pairs, NVBENCH_TYPE_AXES(key_types, value_types, offset_types))
   .set_name("base")
   .set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"})
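The dispatch layer exercised by these benchmarks backs the public cub::DeviceMerge interface. For readers who want to run the same merge outside the benchmark harness, here is a minimal sketch using the usual two-phase CUB calling convention; the exact signature (offset type, default comparator) is assumed here and should be checked against <cub/device/device_merge.cuh>:

#include <cub/device/device_merge.cuh>

#include <thrust/device_vector.h>

#include <cstddef>

int main()
{
  // Two already-sorted inputs, standing in for the benchmark's lhs/rhs.
  thrust::device_vector<int> lhs{1, 3, 5, 7};
  thrust::device_vector<int> rhs{2, 4, 6, 8};
  thrust::device_vector<int> out(lhs.size() + rhs.size());

  // Phase 1: query the required temporary storage size.
  std::size_t temp_bytes{};
  cub::DeviceMerge::MergeKeys(
    nullptr,
    temp_bytes,
    thrust::raw_pointer_cast(lhs.data()),
    static_cast<int>(lhs.size()),
    thrust::raw_pointer_cast(rhs.data()),
    static_cast<int>(rhs.size()),
    thrust::raw_pointer_cast(out.data()));

  // Phase 2: run the merge; out becomes {1, 2, 3, 4, 5, 6, 7, 8}.
  thrust::device_vector<unsigned char> temp(temp_bytes);
  cub::DeviceMerge::MergeKeys(
    thrust::raw_pointer_cast(temp.data()),
    temp_bytes,
    thrust::raw_pointer_cast(lhs.data()),
    static_cast<int>(lhs.size()),
    thrust::raw_pointer_cast(rhs.data()),
    static_cast<int>(rhs.size()),
    thrust::raw_pointer_cast(out.data()));
}

The benchmarks call the dispatch layer directly rather than this public API so that the tuned policy_hub_t can be injected as the policy argument.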