Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace bool template parameters on Dispatch* class templates with enum classes #3643

Merged
merged 15 commits
Feb 4, 2025
Merged
22 changes: 15 additions & 7 deletions cub/benchmarks/bench/adjacent_difference/subtract_left.cu
Original file line number Diff line number Diff line change
Expand Up @@ -52,20 +52,28 @@ struct policy_hub_t
template <class T, class OffsetT>
void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
constexpr bool may_alias = false;
constexpr bool read_left = true;

using input_it_t = const T*;
using output_it_t = T*;
using difference_op_t = ::cuda::std::minus<>;
using offset_t = cub::detail::choose_offset_t<OffsetT>;

#if !TUNE_BASE
using dispatch_t = cub::
DispatchAdjacentDifference<input_it_t, output_it_t, difference_op_t, offset_t, may_alias, read_left, policy_hub_t>;
using dispatch_t = cub::DispatchAdjacentDifference<
input_it_t,
output_it_t,
difference_op_t,
offset_t,
cub::MayAlias::No,
cub::ReadOption::Left,
policy_hub_t>;
#else
using dispatch_t =
cub::DispatchAdjacentDifference<input_it_t, output_it_t, difference_op_t, offset_t, may_alias, read_left>;
using dispatch_t = cub::DispatchAdjacentDifference<
input_it_t,
output_it_t,
difference_op_t,
offset_t,
cub::MayAlias::No,
cub::ReadOption::Left>;
#endif // TUNE_BASE

const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
Expand Down
6 changes: 2 additions & 4 deletions cub/benchmarks/bench/copy/memcpy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
*
******************************************************************************/

#include <cub/device/device_copy.cuh>
#include <cub/device/device_memcpy.cuh>

// %RANGE% TUNE_THREADS tpb 128:1024:32
// %RANGE% TUNE_BUFFERS_PER_THREAD bpt 1:18:1
Expand Down Expand Up @@ -184,8 +184,6 @@ void copy(nvbench::state& state,
using buffer_offset_t = std::uint32_t;
using block_offset_t = std::uint32_t;

constexpr bool is_memcpy = true;

#if !TUNE_BASE
using policy_t = policy_hub_t;
#else
Expand All @@ -199,7 +197,7 @@ void copy(nvbench::state& state,
buffer_offset_t,
block_offset_t,
policy_t,
is_memcpy>;
cub::CopyAlg::Memcpy>;

thrust::device_vector<T> input_buffer = generate(elements);
thrust::device_vector<T> output_buffer(elements);
Expand Down
6 changes: 1 addition & 5 deletions cub/benchmarks/bench/partition/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,6 @@
// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1
// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5

constexpr bool keep_rejects = true;
constexpr bool may_alias = false;

#if !TUNE_BASE
# if TUNE_TRANSPOSE == 0
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
Expand Down Expand Up @@ -119,8 +116,7 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
select_op_t,
equality_op_t,
offset_t,
keep_rejects,
may_alias
cub::SelectImpl::Partition
#if !TUNE_BASE
,
policy_hub_t<T>
Expand Down
6 changes: 1 addition & 5 deletions cub/benchmarks/bench/partition/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,6 @@
// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1
// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5

constexpr bool keep_rejects = true;
constexpr bool may_alias = false;

#if !TUNE_BASE
# if TUNE_TRANSPOSE == 0
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
Expand Down Expand Up @@ -145,8 +142,7 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
select_op_t,
equality_op_t,
offset_t,
keep_rejects,
may_alias
cub::SelectImpl::Partition
#if !TUNE_BASE
,
policy_hub_t<T>
Expand Down
14 changes: 7 additions & 7 deletions cub/benchmarks/bench/radix_sort/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@

using value_t = cub::NullType;

constexpr bool is_descending = false;
constexpr bool is_overwrite_ok = false;
constexpr cub::SortOrder sort_order = cub::SortOrder::Ascending;
constexpr bool is_overwrite_ok = false;

#if !TUNE_BASE
template <typename KeyT, typename ValueT, typename OffsetT>
Expand Down Expand Up @@ -105,10 +105,10 @@ constexpr std::size_t max_onesweep_temp_storage_size()
using portion_offset = int;
using onesweep_policy = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t::OnesweepPolicy;
using agent_radix_sort_onesweep_t =
cub::AgentRadixSortOnesweep<onesweep_policy, is_descending, KeyT, ValueT, OffsetT, portion_offset>;
cub::AgentRadixSortOnesweep<onesweep_policy, sort_order, KeyT, ValueT, OffsetT, portion_offset>;

using hist_policy = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t::HistogramPolicy;
using hist_agent = cub::AgentRadixSortHistogram<hist_policy, is_descending, KeyT, OffsetT>;
using hist_agent = cub::AgentRadixSortHistogram<hist_policy, sort_order, KeyT, OffsetT>;

return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage),
sizeof(typename hist_agent::TempStorage));
Expand Down Expand Up @@ -144,9 +144,9 @@ void radix_sort_keys(std::integral_constant<bool, true>, nvbench::state& state,
using key_t = T;
#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, value_t, offset_t>;
using dispatch_t = cub::DispatchRadixSort<is_descending, key_t, value_t, offset_t, policy_t>;
using dispatch_t = cub::DispatchRadixSort<sort_order, key_t, value_t, offset_t, policy_t>;
#else // TUNE_BASE
using dispatch_t = cub::DispatchRadixSort<is_descending, key_t, value_t, offset_t>;
using dispatch_t = cub::DispatchRadixSort<sort_order, key_t, value_t, offset_t>;
#endif // TUNE_BASE

constexpr int begin_bit = 0;
Expand Down Expand Up @@ -207,7 +207,7 @@ void radix_sort_keys(std::integral_constant<bool, true>, nvbench::state& state,
template <typename T, typename OffsetT>
void radix_sort_keys(std::integral_constant<bool, false>, nvbench::state&, nvbench::type_list<T, OffsetT>)
{
(void) is_descending;
(void) sort_order;
(void) is_overwrite_ok;
}

Expand Down
14 changes: 7 additions & 7 deletions cub/benchmarks/bench/radix_sort/pairs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@
// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32

constexpr bool is_descending = false;
constexpr bool is_overwrite_ok = false;
constexpr cub::SortOrder sort_order = cub::SortOrder::Ascending;
constexpr bool is_overwrite_ok = false;

#if !TUNE_BASE
template <typename KeyT, typename ValueT, typename OffsetT>
Expand Down Expand Up @@ -103,10 +103,10 @@ constexpr std::size_t max_onesweep_temp_storage_size()
using portion_offset = int;
using onesweep_policy = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t::OnesweepPolicy;
using agent_radix_sort_onesweep_t =
cub::AgentRadixSortOnesweep<onesweep_policy, is_descending, KeyT, ValueT, OffsetT, portion_offset>;
cub::AgentRadixSortOnesweep<onesweep_policy, sort_order, KeyT, ValueT, OffsetT, portion_offset>;

using hist_policy = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t::HistogramPolicy;
using hist_agent = cub::AgentRadixSortHistogram<hist_policy, is_descending, KeyT, OffsetT>;
using hist_agent = cub::AgentRadixSortHistogram<hist_policy, sort_order, KeyT, OffsetT>;

return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage),
sizeof(typename hist_agent::TempStorage));
Expand Down Expand Up @@ -144,9 +144,9 @@ void radix_sort_values(
using value_t = ValueT;
#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, value_t, offset_t>;
using dispatch_t = cub::DispatchRadixSort<is_descending, key_t, value_t, offset_t, policy_t>;
using dispatch_t = cub::DispatchRadixSort<sort_order, key_t, value_t, offset_t, policy_t>;
#else // TUNE_BASE
using dispatch_t = cub::DispatchRadixSort<is_descending, key_t, value_t, offset_t>;
using dispatch_t = cub::DispatchRadixSort<sort_order, key_t, value_t, offset_t>;
#endif // TUNE_BASE

constexpr int begin_bit = 0;
Expand Down Expand Up @@ -212,7 +212,7 @@ void radix_sort_values(
template <typename KeyT, typename ValueT, typename OffsetT>
void radix_sort_values(std::integral_constant<bool, false>, nvbench::state&, nvbench::type_list<KeyT, ValueT, OffsetT>)
{
(void) is_descending;
(void) sort_order;
(void) is_overwrite_ok;
}

Expand Down
8 changes: 4 additions & 4 deletions cub/benchmarks/bench/segmented_sort/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ void seg_sort(nvbench::state& state,
const thrust::device_vector<OffsetT>& offsets,
bit_entropy entropy)
{
constexpr bool is_descending = false;
constexpr bool is_overwrite_ok = false;
constexpr cub::SortOrder sort_order = cub::SortOrder::Ascending;
constexpr bool is_overwrite_ok = false;

using offset_t = OffsetT;
using begin_offset_it_t = const offset_t*;
Expand All @@ -165,10 +165,10 @@ void seg_sort(nvbench::state& state,
#if !TUNE_BASE
using policy_t = device_seg_sort_policy_hub<key_t>;
using dispatch_t = //
cub::DispatchSegmentedSort<is_descending, key_t, value_t, offset_t, begin_offset_it_t, end_offset_it_t, policy_t>;
cub::DispatchSegmentedSort<sort_order, key_t, value_t, offset_t, begin_offset_it_t, end_offset_it_t, policy_t>;
#else
using dispatch_t = //
cub::DispatchSegmentedSort<is_descending, key_t, value_t, offset_t, begin_offset_it_t, end_offset_it_t>;
cub::DispatchSegmentedSort<sort_order, key_t, value_t, offset_t, begin_offset_it_t, end_offset_it_t>;
#endif

const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
Expand Down
59 changes: 28 additions & 31 deletions cub/benchmarks/bench/select/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@
// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1
// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5

constexpr bool keep_rejects = false;

#if !TUNE_BASE
# if TUNE_TRANSPOSE == 0
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
Expand Down Expand Up @@ -78,17 +76,18 @@ struct policy_hub_t
};
#endif // !TUNE_BASE

template <typename T, typename OffsetT, typename MayAlias>
void select(nvbench::state& state, nvbench::type_list<T, OffsetT, MayAlias>)
template <typename T, typename OffsetT, typename InPlace>
void select(nvbench::state& state, nvbench::type_list<T, OffsetT, InPlace>)
{
using input_it_t = const T*;
using flag_it_t = const bool*;
using output_it_t = T*;
using num_selected_it_t = OffsetT*;
using select_op_t = cub::NullType;
using equality_op_t = cub::NullType;
using offset_t = OffsetT;
constexpr bool may_alias = MayAlias::value;
using input_it_t = const T*;
using flag_it_t = const bool*;
using output_it_t = T*;
using num_selected_it_t = OffsetT*;
using select_op_t = cub::NullType;
using equality_op_t = cub::NullType;
using offset_t = OffsetT;
constexpr cub::SelectImpl selection_option =
InPlace::value ? cub::SelectImpl::SelectPotentiallyInPlace : cub::SelectImpl::Select;

#if !TUNE_BASE
using policy_t = policy_hub_t<T>;
Expand All @@ -100,20 +99,18 @@ void select(nvbench::state& state, nvbench::type_list<T, OffsetT, MayAlias>)
select_op_t,
equality_op_t,
offset_t,
keep_rejects,
may_alias,
selection_option,
policy_t>;
#else // TUNE_BASE
using dispatch_t = cub::DispatchSelectIf<
input_it_t,
flag_it_t,
output_it_t,
num_selected_it_t,
select_op_t,
equality_op_t,
offset_t,
keep_rejects,
may_alias>;
using dispatch_t =
cub::DispatchSelectIf<input_it_t,
flag_it_t,
output_it_t,
num_selected_it_t,
select_op_t,
equality_op_t,
offset_t,
selection_option>;
#endif // !TUNE_BASE

// Retrieve axis parameters
Expand Down Expand Up @@ -165,18 +162,18 @@ void select(nvbench::state& state, nvbench::type_list<T, OffsetT, MayAlias>)

using ::cuda::std::false_type;
using ::cuda::std::true_type;
#ifdef TUNE_MayAlias
using may_alias = nvbench::type_list<TUNE_MayAlias>; // expands to "false_type" or "true_type"
#else // !defined(TUNE_MayAlias)
using may_alias = nvbench::type_list<false_type, true_type>;
#endif // TUNE_MayAlias
#ifdef TUNE_InPlace
using is_in_place = nvbench::type_list<TUNE_InPlace>; // expands to "false_type" or "true_type"
#else // !defined(TUNE_InPlace)
using is_in_place = nvbench::type_list<false_type, true_type>;
#endif // TUNE_InPlace

// The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes
// using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI).
using select_offset_types = nvbench::type_list<int64_t>;

NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, may_alias))
NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, is_in_place))
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}", "MayAlias{ct}"})
.set_type_axes_names({"T{ct}", "OffsetT{ct}", "InPlace{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
Loading
Loading