From a52e6263d39bee5828c48a6c9d1de484c60bc382 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 26 Feb 2025 15:17:59 +0100 Subject: [PATCH] Replace CUB macros in tunings and benchmarks (#3931) --- c/parallel/src/reduce.cu | 5 +-- cub/benchmarks/bench/partition/flagged.cu | 3 +- cub/benchmarks/bench/partition/if.cu | 3 +- cub/benchmarks/bench/select/flagged.cu | 4 ++- cub/benchmarks/bench/select/if.cu | 4 ++- cub/benchmarks/bench/select/unique.cu | 6 ++-- .../tuning/tuning_run_length_encode.cuh | 10 +++--- .../dispatch/tuning/tuning_select_if.cuh | 5 +-- thrust/testing/zip_iterator.cu | 32 +++++++++---------- 9 files changed, 41 insertions(+), 31 deletions(-) diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index 383cfc0a895..4b458022c33 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -14,6 +14,7 @@ #include #include +#include #include #include // ::cuda::std::identity #include @@ -97,8 +98,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type) auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain); // Implement part of MemBoundScaling - items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2)); - block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32); + items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2); + block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32); return {block_size, items_per_thread, vector_load_length}; } diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index 7217ee32e6e..0a41f88f1a4 100644 --- a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -29,6 +29,7 @@ #include +#include #include #include @@ -63,7 +64,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy +#include #include #include @@ -63,7 +64,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy +#include + #include #include @@ -61,7 +63,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy +#include + #include #include @@ -63,7 +65,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy +#include + #include #include @@ -36,8 +38,8 @@ struct policy_hub_t { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; - static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + static constexpr int ITEMS_PER_THREAD = _CUDA_VSTD::min( + NOMINAL_4B_ITEMS_PER_THREAD, _CUDA_VSTD::max(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); using SelectIfPolicyT = cub::AgentSelectIfPolicy #include -#include +#include +#include CUB_NAMESPACE_BEGIN @@ -315,9 +316,8 @@ struct policy_hub static constexpr int items = (max_input_bytes <= 8) ? 6 - // TODO(bgruber): use clamp() and ceil_div in C++14 - : CUB_MIN(nominal_4B_items_per_thread, - CUB_MAX(1, ((nominal_4B_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes)); + : ::cuda::std::clamp( + ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread); using ReduceByKeyPolicyT = AgentReduceByKeyPolicy<128, items, @@ -603,7 +603,7 @@ struct policy_hub static constexpr int nominal_4B_items_per_thread = 15; // TODO(bgruber): use clamp() in C++14 static constexpr int ITEMS_PER_THREAD = - CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(KeyT)))); + _CUDA_VSTD::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(KeyT)}, 1, nominal_4B_items_per_thread); using RleSweepPolicyT = AgentRlePolicy<96, ITEMS_PER_THREAD, diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index d3f99ee8ee8..ea2971dd3d2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -45,6 +45,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -1498,9 +1500,8 @@ struct policy_hub struct DefaultPolicy { static constexpr int nominal_4B_items_per_thread = 10; - // TODO(bgruber): use cuda::std::clamp() in C++14 static constexpr int items_per_thread = - CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(InputT)))); + ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(InputT)}, 1, nominal_4B_items_per_thread); using SelectIfPolicyT = AgentSelectIfPolicy<128, items_per_thread, diff --git a/thrust/testing/zip_iterator.cu b/thrust/testing/zip_iterator.cu index 786b839099b..47b8033bfb1 100644 --- a/thrust/testing/zip_iterator.cu +++ b/thrust/testing/zip_iterator.cu @@ -332,25 +332,25 @@ struct TestZipIteratorTransform device_vector d_result(n); // Tuples with 2 elements - transform(make_zip_iterator(h_data0.begin(), h_data1.begin()), - make_zip_iterator(h_data0.end(), h_data1.end()), - h_result.begin(), - SumTwoTuple()); - transform(make_zip_iterator(d_data0.begin(), d_data1.begin()), - make_zip_iterator(d_data0.end(), d_data1.end()), - d_result.begin(), - SumTwoTuple()); + thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin()), + make_zip_iterator(h_data0.end(), h_data1.end()), + h_result.begin(), + SumTwoTuple()); + thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin()), + make_zip_iterator(d_data0.end(), d_data1.end()), + d_result.begin(), + SumTwoTuple()); ASSERT_EQUAL(h_result, d_result); // Tuples with 3 elements - transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), - make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), - h_result.begin(), - SumThreeTuple()); - transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()), - make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()), - d_result.begin(), - SumThreeTuple()); + thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), + make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), + h_result.begin(), + SumThreeTuple()); + thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()), + make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()), + d_result.begin(), + SumThreeTuple()); ASSERT_EQUAL(h_result, d_result); } };