From 32c246b9edc83003a73ecbf5b0396e5ea209f4d2 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 25 Feb 2025 10:38:32 +0100 Subject: [PATCH 1/3] Qualifiers --- thrust/testing/tuple_reduce.cu | 2 +- thrust/testing/tuple_scan.cu | 2 +- thrust/testing/tuple_sort.cu | 16 ++++++++-------- thrust/testing/tuple_transform.cu | 6 +++--- thrust/testing/zip_function.cu | 25 ++++++++++++------------ thrust/testing/zip_iterator.cu | 32 +++++++++++++++---------------- 6 files changed, 42 insertions(+), 41 deletions(-) diff --git a/thrust/testing/tuple_reduce.cu b/thrust/testing/tuple_reduce.cu index c7f7a42f7dc..2bcfab751a7 100644 --- a/thrust/testing/tuple_reduce.cu +++ b/thrust/testing/tuple_reduce.cu @@ -38,7 +38,7 @@ struct TestTupleReduce // zip up the data host_vector> h_tuples(n); - transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor()); + thrust::transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor()); // copy to device device_vector> d_tuples = h_tuples; diff --git a/thrust/testing/tuple_scan.cu b/thrust/testing/tuple_scan.cu index 298c2d87389..2a22e452734 100644 --- a/thrust/testing/tuple_scan.cu +++ b/thrust/testing/tuple_scan.cu @@ -42,7 +42,7 @@ struct TestTupleScan // initialize input host_vector> h_input(n); - transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_input.begin(), MakeTupleFunctor()); + thrust::transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_input.begin(), MakeTupleFunctor()); device_vector> d_input = h_input; // allocate output diff --git a/thrust/testing/tuple_sort.cu b/thrust/testing/tuple_sort.cu index 0263ebbd8d5..14156235f0d 100644 --- a/thrust/testing/tuple_sort.cu +++ b/thrust/testing/tuple_sort.cu @@ -37,30 +37,30 @@ struct TestTupleStableSort // zip up the data host_vector> h_tuples(n); - transform(h_keys.begin(), h_keys.end(), h_values.begin(), h_tuples.begin(), MakeTupleFunctor()); + thrust::transform(h_keys.begin(), h_keys.end(), h_values.begin(), h_tuples.begin(), MakeTupleFunctor()); // copy to device device_vector> d_tuples = h_tuples; // sort on host - stable_sort(h_tuples.begin(), h_tuples.end()); + thrust::stable_sort(h_tuples.begin(), h_tuples.end()); // sort on device - stable_sort(d_tuples.begin(), d_tuples.end()); + thrust::stable_sort(d_tuples.begin(), d_tuples.end()); - ASSERT_EQUAL(true, is_sorted(d_tuples.begin(), d_tuples.end())); + ASSERT_EQUAL(true, thrust::is_sorted(d_tuples.begin(), d_tuples.end())); // select keys - transform(h_tuples.begin(), h_tuples.end(), h_keys.begin(), GetFunctor<0>()); + thrust::transform(h_tuples.begin(), h_tuples.end(), h_keys.begin(), GetFunctor<0>()); device_vector d_keys(h_keys.size()); - transform(d_tuples.begin(), d_tuples.end(), d_keys.begin(), GetFunctor<0>()); + thrust::transform(d_tuples.begin(), d_tuples.end(), d_keys.begin(), GetFunctor<0>()); // select values - transform(h_tuples.begin(), h_tuples.end(), h_values.begin(), GetFunctor<1>()); + thrust::transform(h_tuples.begin(), h_tuples.end(), h_values.begin(), GetFunctor<1>()); device_vector d_values(h_values.size()); - transform(d_tuples.begin(), d_tuples.end(), d_values.begin(), GetFunctor<1>()); + thrust::transform(d_tuples.begin(), d_tuples.end(), d_values.begin(), GetFunctor<1>()); ASSERT_ALMOST_EQUAL(h_keys, d_keys); ASSERT_ALMOST_EQUAL(h_values, d_values); diff --git a/thrust/testing/tuple_transform.cu b/thrust/testing/tuple_transform.cu index 2b6dcc96c3f..a77bf2ad647 100644 --- a/thrust/testing/tuple_transform.cu +++ b/thrust/testing/tuple_transform.cu @@ -36,7 +36,7 @@ struct TestTupleTransform // zip up the data host_vector> h_tuples(n); - transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor()); + thrust::transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor()); // copy to device device_vector> d_tuples = h_tuples; @@ -44,10 +44,10 @@ struct TestTupleTransform device_vector d_t1(n), d_t2(n); // select 0th - transform(d_tuples.begin(), d_tuples.end(), d_t1.begin(), GetFunctor<0>()); + thrust::transform(d_tuples.begin(), d_tuples.end(), d_t1.begin(), GetFunctor<0>()); // select 1st - transform(d_tuples.begin(), d_tuples.end(), d_t2.begin(), GetFunctor<1>()); + thrust::transform(d_tuples.begin(), d_tuples.end(), d_t2.begin(), GetFunctor<1>()); ASSERT_ALMOST_EQUAL(h_t1, d_t1); ASSERT_ALMOST_EQUAL(h_t2, d_t2); diff --git a/thrust/testing/zip_function.cu b/thrust/testing/zip_function.cu index 279960592b5..2b2276b9c93 100644 --- a/thrust/testing/zip_function.cu +++ b/thrust/testing/zip_function.cu @@ -60,19 +60,20 @@ struct TestZipFunctionTransform device_vector d_result_zip(n); // Tuple base case - transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), - make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), - h_result_tuple.begin(), - SumThreeTuple{}); + + thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), + make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), + h_result_tuple.begin(), + SumThreeTuple{}); // Zip Function - transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), - make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), - h_result_zip.begin(), - make_zip_function(SumThree{})); - transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()), - make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()), - d_result_zip.begin(), - make_zip_function(SumThree{})); + thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), + make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), + h_result_zip.begin(), + make_zip_function(SumThree{})); + thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()), + make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()), + d_result_zip.begin(), + make_zip_function(SumThree{})); ASSERT_EQUAL(h_result_tuple, h_result_zip); ASSERT_EQUAL(h_result_tuple, d_result_zip); diff --git a/thrust/testing/zip_iterator.cu b/thrust/testing/zip_iterator.cu index 786b839099b..47b8033bfb1 100644 --- a/thrust/testing/zip_iterator.cu +++ b/thrust/testing/zip_iterator.cu @@ -332,25 +332,25 @@ struct TestZipIteratorTransform device_vector d_result(n); // Tuples with 2 elements - transform(make_zip_iterator(h_data0.begin(), h_data1.begin()), - make_zip_iterator(h_data0.end(), h_data1.end()), - h_result.begin(), - SumTwoTuple()); - transform(make_zip_iterator(d_data0.begin(), d_data1.begin()), - make_zip_iterator(d_data0.end(), d_data1.end()), - d_result.begin(), - SumTwoTuple()); + thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin()), + make_zip_iterator(h_data0.end(), h_data1.end()), + h_result.begin(), + SumTwoTuple()); + thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin()), + make_zip_iterator(d_data0.end(), d_data1.end()), + d_result.begin(), + SumTwoTuple()); ASSERT_EQUAL(h_result, d_result); // Tuples with 3 elements - transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), - make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), - h_result.begin(), - SumThreeTuple()); - transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()), - make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()), - d_result.begin(), - SumThreeTuple()); + thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), + make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), + h_result.begin(), + SumThreeTuple()); + thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()), + make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()), + d_result.begin(), + SumThreeTuple()); ASSERT_EQUAL(h_result, d_result); } }; From a7d093a2c6c4de56cbab7a819eba0514d234c40b Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 24 Feb 2025 20:15:28 +0100 Subject: [PATCH 2/3] Replace CUB macros in tunings and benchmarks --- c/parallel/src/reduce.cu | 5 +++-- cub/benchmarks/bench/partition/flagged.cu | 2 +- cub/benchmarks/bench/partition/if.cu | 2 +- cub/benchmarks/bench/select/flagged.cu | 2 +- cub/benchmarks/bench/select/if.cu | 2 +- cub/benchmarks/bench/select/unique.cu | 4 ++-- .../device/dispatch/tuning/tuning_run_length_encode.cuh | 9 +++++---- cub/cub/device/dispatch/tuning/tuning_select_if.cuh | 5 +++-- 8 files changed, 17 insertions(+), 14 deletions(-) diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index 383cfc0a895..575a5748ed6 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -14,6 +14,7 @@ #include #include +#include #include #include // ::cuda::std::identity #include @@ -97,8 +98,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type) auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain); // Implement part of MemBoundScaling - items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2)); - block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32); + items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2); + block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32); return {block_size, items_per_thread, vector_load_length}; } diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index 7217ee32e6e..30102f34a48 100644 --- a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -63,7 +63,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy #include +#include +#include #include CUB_NAMESPACE_BEGIN @@ -315,9 +317,8 @@ struct policy_hub static constexpr int items = (max_input_bytes <= 8) ? 6 - // TODO(bgruber): use clamp() and ceil_div in C++14 - : CUB_MIN(nominal_4B_items_per_thread, - CUB_MAX(1, ((nominal_4B_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes)); + : ::cuda::std::clamp( + ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread); using ReduceByKeyPolicyT = AgentReduceByKeyPolicy<128, items, @@ -603,7 +604,7 @@ struct policy_hub static constexpr int nominal_4B_items_per_thread = 15; // TODO(bgruber): use clamp() in C++14 static constexpr int ITEMS_PER_THREAD = - CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(KeyT)))); + _CUDA_VSTD::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(KeyT)}, 1, nominal_4B_items_per_thread); using RleSweepPolicyT = AgentRlePolicy<96, ITEMS_PER_THREAD, diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index d3f99ee8ee8..620de4692d2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -45,6 +45,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -1498,9 +1500,8 @@ struct policy_hub struct DefaultPolicy { static constexpr int nominal_4B_items_per_thread = 10; - // TODO(bgruber): use cuda::std::clamp() in C++14 static constexpr int items_per_thread = - CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(InputT)))); + ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(InputT)}, 1, nominal_4B_items_per_thread); using SelectIfPolicyT = AgentSelectIfPolicy<128, items_per_thread, From 576c6566b05561137d66b1b70651466ac74441a7 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 25 Feb 2025 09:52:16 +0100 Subject: [PATCH 3/3] Use __algorithm_ --- c/parallel/src/reduce.cu | 2 +- cub/benchmarks/bench/partition/flagged.cu | 1 + cub/benchmarks/bench/partition/if.cu | 1 + cub/benchmarks/bench/select/flagged.cu | 2 ++ cub/benchmarks/bench/select/if.cu | 2 ++ cub/benchmarks/bench/select/unique.cu | 2 ++ cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh | 3 +-- cub/cub/device/dispatch/tuning/tuning_select_if.cuh | 2 +- 8 files changed, 11 insertions(+), 4 deletions(-) diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index 575a5748ed6..4b458022c33 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include // ::cuda::std::identity #include diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index 30102f34a48..0a41f88f1a4 100644 --- a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -29,6 +29,7 @@ #include +#include #include #include diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu index f9a8bc7af4b..5dc240fb2e7 100644 --- a/cub/benchmarks/bench/partition/if.cu +++ b/cub/benchmarks/bench/partition/if.cu @@ -29,6 +29,7 @@ #include +#include #include #include diff --git a/cub/benchmarks/bench/select/flagged.cu b/cub/benchmarks/bench/select/flagged.cu index 189e8ea132d..168e2bc0cc5 100644 --- a/cub/benchmarks/bench/select/flagged.cu +++ b/cub/benchmarks/bench/select/flagged.cu @@ -29,6 +29,8 @@ #include +#include + #include #include diff --git a/cub/benchmarks/bench/select/if.cu b/cub/benchmarks/bench/select/if.cu index 4cdd04b01e2..98bdec915a7 100644 --- a/cub/benchmarks/bench/select/if.cu +++ b/cub/benchmarks/bench/select/if.cu @@ -29,6 +29,8 @@ #include +#include + #include #include diff --git a/cub/benchmarks/bench/select/unique.cu b/cub/benchmarks/bench/select/unique.cu index 5f08518abb9..75c0c3771e0 100644 --- a/cub/benchmarks/bench/select/unique.cu +++ b/cub/benchmarks/bench/select/unique.cu @@ -3,6 +3,8 @@ #include +#include + #include #include diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index 3886181703f..9f272961ef2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -47,8 +47,7 @@ #include #include -#include -#include +#include CUB_NAMESPACE_BEGIN diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index 620de4692d2..ea2971dd3d2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -45,7 +45,7 @@ #include #include -#include +#include CUB_NAMESPACE_BEGIN