NVIDIA · bernhardmgruber · Feb 26, 2025 · Feb 25, 2025 · Feb 24, 2025 · Feb 25, 2025
@@ -14,6 +14,7 @@
 #include <cub/grid/grid_even_share.cuh>
 #include <cub/util_device.cuh>
 
+#include <cuda/std/__algorithm_>
 #include <cuda/std/cstdint>
 #include <cuda/std/functional> // ::cuda::std::identity
 #include <cuda/std/variant>
@@ -97,8 +98,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type)
   auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain);
 
   // Implement part of MemBoundScaling
-  items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2));
-  block_size       = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
+  items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2);
+  block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
 
   return {block_size, items_per_thread, vector_load_length};
 }

@@ -29,6 +29,7 @@
 
 #include <thrust/count.h>
 
+#include <cuda/std/__algorithm_>
 #include <cuda/std/type_traits>
 
 #include <look_back_helper.cuh>
@@ -63,7 +64,7 @@ struct policy_hub_t
     static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
 
     static constexpr int ITEMS_PER_THREAD =
-      CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+      _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);
 
     using SelectIfPolicyT =
       cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,

@@ -29,6 +29,7 @@
 
 #include <thrust/count.h>
 
+#include <cuda/std/__algorithm_>
 #include <cuda/std/type_traits>
 
 #include <look_back_helper.cuh>
@@ -63,7 +64,7 @@ struct policy_hub_t
     static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
 
     static constexpr int ITEMS_PER_THREAD =
-      CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+      _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);
 
     using SelectIfPolicyT =
       cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,

@@ -29,6 +29,8 @@
 
 #include <thrust/count.h>
 
+#include <cuda/std/__algorithm_>
+
 #include <look_back_helper.cuh>
 #include <nvbench_helper.cuh>
 
@@ -61,7 +63,7 @@ struct policy_hub_t
     static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
 
     static constexpr int ITEMS_PER_THREAD =
-      CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+      _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);
 
     using SelectIfPolicyT =
       cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,

@@ -29,6 +29,8 @@
 
 #include <thrust/count.h>
 
+#include <cuda/std/__algorithm_>
+
 #include <limits>
 
 #include <look_back_helper.cuh>
@@ -63,7 +65,7 @@ struct policy_hub_t
     static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
 
     static constexpr int ITEMS_PER_THREAD =
-      CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+      _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);
 
     using SelectIfPolicyT =
       cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,

@@ -3,6 +3,8 @@
 
 #include <cub/device/device_select.cuh>
 
+#include <cuda/std/__algorithm_>
+
 #include <limits>
 
 #include <look_back_helper.cuh>
@@ -36,8 +38,8 @@ struct policy_hub_t
   {
     static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
 
-    static constexpr int ITEMS_PER_THREAD =
-      CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+    static constexpr int ITEMS_PER_THREAD = _CUDA_VSTD::min(
+      NOMINAL_4B_ITEMS_PER_THREAD, _CUDA_VSTD::max(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
 
     using SelectIfPolicyT =
       cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,

@@ -46,7 +46,8 @@
 #include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 
-#include <cuda/std/__algorithm/max.h>
+#include <cuda/cmath>
+#include <cuda/std/__algorithm_>
 
 CUB_NAMESPACE_BEGIN
 
@@ -315,9 +316,8 @@ struct policy_hub
     static constexpr int items =
       (max_input_bytes <= 8)
         ? 6
-        // TODO(bgruber): use clamp() and ceil_div in C++14
-        : CUB_MIN(nominal_4B_items_per_thread,
-                  CUB_MAX(1, ((nominal_4B_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes));
+        : ::cuda::std::clamp(
+            ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread);
     using ReduceByKeyPolicyT =
       AgentReduceByKeyPolicy<128,
                              items,
@@ -603,7 +603,7 @@ struct policy_hub
     static constexpr int nominal_4B_items_per_thread = 15;
     // TODO(bgruber): use clamp() in C++14
     static constexpr int ITEMS_PER_THREAD =
-      CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(KeyT))));
+      _CUDA_VSTD::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(KeyT)}, 1, nominal_4B_items_per_thread);
     using RleSweepPolicyT =
       AgentRlePolicy<96,
                      ITEMS_PER_THREAD,

@@ -45,6 +45,8 @@
 #include <cub/util_math.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/__algorithm_>
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail
@@ -1498,9 +1500,8 @@ struct policy_hub
   struct DefaultPolicy
   {
     static constexpr int nominal_4B_items_per_thread = 10;
-    // TODO(bgruber): use cuda::std::clamp() in C++14
     static constexpr int items_per_thread =
-      CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(InputT))));
+      ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(InputT)}, 1, nominal_4B_items_per_thread);
     using SelectIfPolicyT =
       AgentSelectIfPolicy<128,
                           items_per_thread,

@@ -38,7 +38,7 @@ struct TestTupleReduce
 
     // zip up the data
     host_vector<tuple<T, T>> h_tuples(n);
-    transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor());
+    thrust::transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor());
 
     // copy to device
     device_vector<tuple<T, T>> d_tuples = h_tuples;

@@ -42,7 +42,7 @@ struct TestTupleScan
 
     // initialize input
     host_vector<tuple<T, T>> h_input(n);
-    transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_input.begin(), MakeTupleFunctor());
+    thrust::transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_input.begin(), MakeTupleFunctor());
     device_vector<tuple<T, T>> d_input = h_input;
 
     // allocate output

@@ -37,30 +37,30 @@ struct TestTupleStableSort
 
     // zip up the data
     host_vector<tuple<T, T>> h_tuples(n);
-    transform(h_keys.begin(), h_keys.end(), h_values.begin(), h_tuples.begin(), MakeTupleFunctor());
+    thrust::transform(h_keys.begin(), h_keys.end(), h_values.begin(), h_tuples.begin(), MakeTupleFunctor());
 
     // copy to device
     device_vector<tuple<T, T>> d_tuples = h_tuples;
 
     // sort on host
-    stable_sort(h_tuples.begin(), h_tuples.end());
+    thrust::stable_sort(h_tuples.begin(), h_tuples.end());
 
     // sort on device
-    stable_sort(d_tuples.begin(), d_tuples.end());
+    thrust::stable_sort(d_tuples.begin(), d_tuples.end());
 
-    ASSERT_EQUAL(true, is_sorted(d_tuples.begin(), d_tuples.end()));
+    ASSERT_EQUAL(true, thrust::is_sorted(d_tuples.begin(), d_tuples.end()));
 
     // select keys
-    transform(h_tuples.begin(), h_tuples.end(), h_keys.begin(), GetFunctor<0>());
+    thrust::transform(h_tuples.begin(), h_tuples.end(), h_keys.begin(), GetFunctor<0>());
 
     device_vector<T> d_keys(h_keys.size());
-    transform(d_tuples.begin(), d_tuples.end(), d_keys.begin(), GetFunctor<0>());
+    thrust::transform(d_tuples.begin(), d_tuples.end(), d_keys.begin(), GetFunctor<0>());
 
     // select values
-    transform(h_tuples.begin(), h_tuples.end(), h_values.begin(), GetFunctor<1>());
+    thrust::transform(h_tuples.begin(), h_tuples.end(), h_values.begin(), GetFunctor<1>());
 
     device_vector<T> d_values(h_values.size());
-    transform(d_tuples.begin(), d_tuples.end(), d_values.begin(), GetFunctor<1>());
+    thrust::transform(d_tuples.begin(), d_tuples.end(), d_values.begin(), GetFunctor<1>());
 
     ASSERT_ALMOST_EQUAL(h_keys, d_keys);
     ASSERT_ALMOST_EQUAL(h_values, d_values);

@@ -36,18 +36,18 @@ struct TestTupleTransform
 
     // zip up the data
     host_vector<tuple<T, T>> h_tuples(n);
-    transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor());
+    thrust::transform(h_t1.begin(), h_t1.end(), h_t2.begin(), h_tuples.begin(), MakeTupleFunctor());
 
     // copy to device
     device_vector<tuple<T, T>> d_tuples = h_tuples;
 
     device_vector<T> d_t1(n), d_t2(n);
 
     // select 0th
-    transform(d_tuples.begin(), d_tuples.end(), d_t1.begin(), GetFunctor<0>());
+    thrust::transform(d_tuples.begin(), d_tuples.end(), d_t1.begin(), GetFunctor<0>());
 
     // select 1st
-    transform(d_tuples.begin(), d_tuples.end(), d_t2.begin(), GetFunctor<1>());
+    thrust::transform(d_tuples.begin(), d_tuples.end(), d_t2.begin(), GetFunctor<1>());
 
     ASSERT_ALMOST_EQUAL(h_t1, d_t1);
     ASSERT_ALMOST_EQUAL(h_t2, d_t2);

@@ -60,19 +60,20 @@ struct TestZipFunctionTransform
     device_vector<T> d_result_zip(n);
 
     // Tuple base case
-    transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()),
-              make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()),
-              h_result_tuple.begin(),
-              SumThreeTuple{});
+
+    thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()),
+                      make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()),
+                      h_result_tuple.begin(),
+                      SumThreeTuple{});
     // Zip Function
-    transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()),
-              make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()),
-              h_result_zip.begin(),
-              make_zip_function(SumThree{}));
-    transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()),
-              make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()),
-              d_result_zip.begin(),
-              make_zip_function(SumThree{}));
+    thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()),
+                      make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()),
+                      h_result_zip.begin(),
+                      make_zip_function(SumThree{}));
+    thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()),
+                      make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()),
+                      d_result_zip.begin(),
+                      make_zip_function(SumThree{}));
 
     ASSERT_EQUAL(h_result_tuple, h_result_zip);
     ASSERT_EQUAL(h_result_tuple, d_result_zip);

@@ -332,25 +332,25 @@ struct TestZipIteratorTransform
     device_vector<T> d_result(n);
 
     // Tuples with 2 elements
-    transform(make_zip_iterator(h_data0.begin(), h_data1.begin()),
-              make_zip_iterator(h_data0.end(), h_data1.end()),
-              h_result.begin(),
-              SumTwoTuple());
-    transform(make_zip_iterator(d_data0.begin(), d_data1.begin()),
-              make_zip_iterator(d_data0.end(), d_data1.end()),
-              d_result.begin(),
-              SumTwoTuple());
+    thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin()),
+                      make_zip_iterator(h_data0.end(), h_data1.end()),
+                      h_result.begin(),
+                      SumTwoTuple());
+    thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin()),
+                      make_zip_iterator(d_data0.end(), d_data1.end()),
+                      d_result.begin(),
+                      SumTwoTuple());
     ASSERT_EQUAL(h_result, d_result);
 
     // Tuples with 3 elements
-    transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()),
-              make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()),
-              h_result.begin(),
-              SumThreeTuple());
-    transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()),
-              make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()),
-              d_result.begin(),
-              SumThreeTuple());
+    thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()),
+                      make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()),
+                      h_result.begin(),
+                      SumThreeTuple());
+    thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()),
+                      make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()),
+                      d_result.begin(),
+                      SumThreeTuple());
     ASSERT_EQUAL(h_result, d_result);
   }
 };