diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index 383cfc0a895..4b458022c33 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -14,6 +14,7 @@ #include #include +#include #include #include // ::cuda::std::identity #include @@ -97,8 +98,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type) auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain); // Implement part of MemBoundScaling - items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2)); - block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32); + items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2); + block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32); return {block_size, items_per_thread, vector_load_length}; } diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index 7217ee32e6e..0a41f88f1a4 100644 --- a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -29,6 +29,7 @@ #include +#include #include #include @@ -63,7 +64,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy +#include #include #include @@ -63,7 +64,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy +#include + #include #include @@ -61,7 +63,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy +#include + #include #include @@ -63,7 +65,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy +#include + #include #include @@ -36,8 +38,8 @@ struct policy_hub_t { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; - static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + static constexpr int ITEMS_PER_THREAD = _CUDA_VSTD::min( + NOMINAL_4B_ITEMS_PER_THREAD, _CUDA_VSTD::max(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); using SelectIfPolicyT = cub::AgentSelectIfPolicy #include -#include +#include +#include CUB_NAMESPACE_BEGIN @@ -315,9 +316,8 @@ struct policy_hub static constexpr int items = (max_input_bytes <= 8) ? 6 - // TODO(bgruber): use clamp() and ceil_div in C++14 - : CUB_MIN(nominal_4B_items_per_thread, - CUB_MAX(1, ((nominal_4B_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes)); + : ::cuda::std::clamp( + ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread); using ReduceByKeyPolicyT = AgentReduceByKeyPolicy<128, items, @@ -603,7 +603,7 @@ struct policy_hub static constexpr int nominal_4B_items_per_thread = 15; // TODO(bgruber): use clamp() in C++14 static constexpr int ITEMS_PER_THREAD = - CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(KeyT)))); + _CUDA_VSTD::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(KeyT)}, 1, nominal_4B_items_per_thread); using RleSweepPolicyT = AgentRlePolicy<96, ITEMS_PER_THREAD, diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index d3f99ee8ee8..ea2971dd3d2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -45,6 +45,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -1498,9 +1500,8 @@ struct policy_hub struct DefaultPolicy { static constexpr int nominal_4B_items_per_thread = 10; - // TODO(bgruber): use cuda::std::clamp() in C++14 static constexpr int items_per_thread = - CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(InputT)))); + ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(InputT)}, 1, nominal_4B_items_per_thread); using SelectIfPolicyT = AgentSelectIfPolicy<128, items_per_thread, diff --git a/thrust/testing/tuple_reduce.cu b/thrust/testing/tuple_reduce.cu index c7f7a42f7dc..2bcfab751a7 100644 --- a/thrust/testing/tuple_reduce.cu +++ b/thrust/testing/tuple_reduce.cu @@ -38,7 +38,7 @@ struct TestTupleReduce // zip up the data host_vector> h_tuples(n); - transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor()); + thrust::transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor()); // copy to device device_vector> d_tuples = h_tuples; diff --git a/thrust/testing/tuple_scan.cu b/thrust/testing/tuple_scan.cu index 298c2d87389..2a22e452734 100644 --- a/thrust/testing/tuple_scan.cu +++ b/thrust/testing/tuple_scan.cu @@ -42,7 +42,7 @@ struct TestTupleScan // initialize input host_vector> h_input(n); - transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_input.begin(), MakeTupleFunctor()); + thrust::transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_input.begin(), MakeTupleFunctor()); device_vector> d_input = h_input; // allocate output diff --git a/thrust/testing/tuple_sort.cu b/thrust/testing/tuple_sort.cu index 0263ebbd8d5..14156235f0d 100644 --- a/thrust/testing/tuple_sort.cu +++ b/thrust/testing/tuple_sort.cu @@ -37,30 +37,30 @@ struct TestTupleStableSort // zip up the data host_vector> h_tuples(n); - transform(h_keys.begin(), h_keys.end(), h_values.begin(), h_tuples.begin(), MakeTupleFunctor()); + thrust::transform(h_keys.begin(), h_keys.end(), h_values.begin(), h_tuples.begin(), MakeTupleFunctor()); // copy to device device_vector> d_tuples = h_tuples; // sort on host - stable_sort(h_tuples.begin(), h_tuples.end()); + thrust::stable_sort(h_tuples.begin(), h_tuples.end()); // sort on device - stable_sort(d_tuples.begin(), d_tuples.end()); + thrust::stable_sort(d_tuples.begin(), d_tuples.end()); - ASSERT_EQUAL(true, is_sorted(d_tuples.begin(), d_tuples.end())); + ASSERT_EQUAL(true, thrust::is_sorted(d_tuples.begin(), d_tuples.end())); // select keys - transform(h_tuples.begin(), h_tuples.end(), h_keys.begin(), GetFunctor<0>()); + thrust::transform(h_tuples.begin(), h_tuples.end(), h_keys.begin(), GetFunctor<0>()); device_vector d_keys(h_keys.size()); - transform(d_tuples.begin(), d_tuples.end(), d_keys.begin(), GetFunctor<0>()); + thrust::transform(d_tuples.begin(), d_tuples.end(), d_keys.begin(), GetFunctor<0>()); // select values - transform(h_tuples.begin(), h_tuples.end(), h_values.begin(), GetFunctor<1>()); + thrust::transform(h_tuples.begin(), h_tuples.end(), h_values.begin(), GetFunctor<1>()); device_vector d_values(h_values.size()); - transform(d_tuples.begin(), d_tuples.end(), d_values.begin(), GetFunctor<1>()); + thrust::transform(d_tuples.begin(), d_tuples.end(), d_values.begin(), GetFunctor<1>()); ASSERT_ALMOST_EQUAL(h_keys, d_keys); ASSERT_ALMOST_EQUAL(h_values, d_values); diff --git a/thrust/testing/tuple_transform.cu b/thrust/testing/tuple_transform.cu index 2b6dcc96c3f..a77bf2ad647 100644 --- a/thrust/testing/tuple_transform.cu +++ b/thrust/testing/tuple_transform.cu @@ -36,7 +36,7 @@ struct TestTupleTransform // zip up the data host_vector> h_tuples(n); - transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor()); + thrust::transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor()); // copy to device device_vector> d_tuples = h_tuples; @@ -44,10 +44,10 @@ struct TestTupleTransform device_vector d_t1(n), d_t2(n); // select 0th - transform(d_tuples.begin(), d_tuples.end(), d_t1.begin(), GetFunctor<0>()); + thrust::transform(d_tuples.begin(), d_tuples.end(), d_t1.begin(), GetFunctor<0>()); // select 1st - transform(d_tuples.begin(), d_tuples.end(), d_t2.begin(), GetFunctor<1>()); + thrust::transform(d_tuples.begin(), d_tuples.end(), d_t2.begin(), GetFunctor<1>()); ASSERT_ALMOST_EQUAL(h_t1, d_t1); ASSERT_ALMOST_EQUAL(h_t2, d_t2); diff --git a/thrust/testing/zip_function.cu b/thrust/testing/zip_function.cu index 279960592b5..2b2276b9c93 100644 --- a/thrust/testing/zip_function.cu +++ b/thrust/testing/zip_function.cu @@ -60,19 +60,20 @@ struct TestZipFunctionTransform device_vector d_result_zip(n); // Tuple base case - transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), - make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), - h_result_tuple.begin(), - SumThreeTuple{}); + + thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), + make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), + h_result_tuple.begin(), + SumThreeTuple{}); // Zip Function - transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), - make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), - h_result_zip.begin(), - make_zip_function(SumThree{})); - transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()), - make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()), - d_result_zip.begin(), - make_zip_function(SumThree{})); + thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), + make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), + h_result_zip.begin(), + make_zip_function(SumThree{})); + thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()), + make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()), + d_result_zip.begin(), + make_zip_function(SumThree{})); ASSERT_EQUAL(h_result_tuple, h_result_zip); ASSERT_EQUAL(h_result_tuple, d_result_zip); diff --git a/thrust/testing/zip_iterator.cu b/thrust/testing/zip_iterator.cu index 786b839099b..47b8033bfb1 100644 --- a/thrust/testing/zip_iterator.cu +++ b/thrust/testing/zip_iterator.cu @@ -332,25 +332,25 @@ struct TestZipIteratorTransform device_vector d_result(n); // Tuples with 2 elements - transform(make_zip_iterator(h_data0.begin(), h_data1.begin()), - make_zip_iterator(h_data0.end(), h_data1.end()), - h_result.begin(), - SumTwoTuple()); - transform(make_zip_iterator(d_data0.begin(), d_data1.begin()), - make_zip_iterator(d_data0.end(), d_data1.end()), - d_result.begin(), - SumTwoTuple()); + thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin()), + make_zip_iterator(h_data0.end(), h_data1.end()), + h_result.begin(), + SumTwoTuple()); + thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin()), + make_zip_iterator(d_data0.end(), d_data1.end()), + d_result.begin(), + SumTwoTuple()); ASSERT_EQUAL(h_result, d_result); // Tuples with 3 elements - transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), - make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), - h_result.begin(), - SumThreeTuple()); - transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()), - make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()), - d_result.begin(), - SumThreeTuple()); + thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()), + make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()), + h_result.begin(), + SumThreeTuple()); + thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()), + make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()), + d_result.begin(), + SumThreeTuple()); ASSERT_EQUAL(h_result, d_result); } };