From 7286b539340582bcb7f83dfff72addb44cf22975 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 24 Feb 2025 20:15:28 +0100 Subject: [PATCH] Replace CUB macros in tunings and benchmarks --- c/parallel/src/reduce.cu | 5 +++-- cub/benchmarks/bench/partition/flagged.cu | 2 +- cub/benchmarks/bench/partition/if.cu | 2 +- cub/benchmarks/bench/select/flagged.cu | 2 +- cub/benchmarks/bench/select/if.cu | 2 +- cub/benchmarks/bench/select/unique.cu | 4 ++-- .../device/dispatch/tuning/tuning_run_length_encode.cuh | 9 +++++---- cub/cub/device/dispatch/tuning/tuning_select_if.cuh | 5 +++-- 8 files changed, 17 insertions(+), 14 deletions(-) diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index 383cfc0a895..575a5748ed6 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -14,6 +14,7 @@ #include #include +#include #include #include // ::cuda::std::identity #include @@ -97,8 +98,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type) auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain); // Implement part of MemBoundScaling - items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2)); - block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32); + items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2); + block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32); return {block_size, items_per_thread, vector_load_length}; } diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index 7217ee32e6e..30102f34a48 100644 --- a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -63,7 +63,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy #include +#include +#include #include CUB_NAMESPACE_BEGIN @@ -315,9 +317,8 @@ struct policy_hub static constexpr int items = (max_input_bytes <= 8) ? 6 - // TODO(bgruber): use clamp() and ceil_div in C++14 - : CUB_MIN(nominal_4B_items_per_thread, - CUB_MAX(1, ((nominal_4B_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes)); + : ::cuda::std::clamp( + ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread); using ReduceByKeyPolicyT = AgentReduceByKeyPolicy<128, items, @@ -603,7 +604,7 @@ struct policy_hub static constexpr int nominal_4B_items_per_thread = 15; // TODO(bgruber): use clamp() in C++14 static constexpr int ITEMS_PER_THREAD = - CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(KeyT)))); + _CUDA_VSTD::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(KeyT)}, 1, nominal_4B_items_per_thread); using RleSweepPolicyT = AgentRlePolicy<96, ITEMS_PER_THREAD, diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index d3f99ee8ee8..620de4692d2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -45,6 +45,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -1498,9 +1500,8 @@ struct policy_hub struct DefaultPolicy { static constexpr int nominal_4B_items_per_thread = 10; - // TODO(bgruber): use cuda::std::clamp() in C++14 static constexpr int items_per_thread = - CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(InputT)))); + ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(InputT)}, 1, nominal_4B_items_per_thread); using SelectIfPolicyT = AgentSelectIfPolicy<128, items_per_thread,