From a52e6263d39bee5828c48a6c9d1de484c60bc382 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 26 Feb 2025 15:17:59 +0100
Subject: [PATCH] Replace CUB macros in tunings and benchmarks (#3931)

---
 c/parallel/src/reduce.cu                      |  5 +--
 cub/benchmarks/bench/partition/flagged.cu     |  3 +-
 cub/benchmarks/bench/partition/if.cu          |  3 +-
 cub/benchmarks/bench/select/flagged.cu        |  4 ++-
 cub/benchmarks/bench/select/if.cu             |  4 ++-
 cub/benchmarks/bench/select/unique.cu         |  6 ++--
 .../tuning/tuning_run_length_encode.cuh       | 10 +++---
 .../dispatch/tuning/tuning_select_if.cuh      |  5 +--
 thrust/testing/zip_iterator.cu                | 32 +++++++++----------
 9 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu
index 383cfc0a895..4b458022c33 100644
--- a/c/parallel/src/reduce.cu
+++ b/c/parallel/src/reduce.cu
@@ -14,6 +14,7 @@
 #include <cub/grid/grid_even_share.cuh>
 #include <cub/util_device.cuh>
 
+#include <cuda/std/__algorithm_>
 #include <cuda/std/cstdint>
 #include <cuda/std/functional> // ::cuda::std::identity
 #include <cuda/std/variant>
@@ -97,8 +98,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type)
   auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain);
 
   // Implement part of MemBoundScaling
-  items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2));
-  block_size       = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
+  items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2);
+  block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
 
   return {block_size, items_per_thread, vector_load_length};
 }
diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu
index 7217ee32e6e..0a41f88f1a4 100644
--- a/cub/benchmarks/bench/partition/flagged.cu
+++ b/cub/benchmarks/bench/partition/flagged.cu
@@ -29,6 +29,7 @@
 
 #include <thrust/count.h>
 
+#include <cuda/std/__algorithm_>
 #include <cuda/std/type_traits>
 
 #include <look_back_helper.cuh>
@@ -63,7 +64,7 @@ struct policy_hub_t
     static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
 
     static constexpr int ITEMS_PER_THREAD =
-      CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+      _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);
 
     using SelectIfPolicyT =
       cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu
index b5ed0b41b1a..5dc240fb2e7 100644
--- a/cub/benchmarks/bench/partition/if.cu
+++ b/cub/benchmarks/bench/partition/if.cu
@@ -29,6 +29,7 @@
 
 #include <thrust/count.h>
 
+#include <cuda/std/__algorithm_>
 #include <cuda/std/type_traits>
 
 #include <look_back_helper.cuh>
@@ -63,7 +64,7 @@ struct policy_hub_t
     static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
 
     static constexpr int ITEMS_PER_THREAD =
-      CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+      _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);
 
     using SelectIfPolicyT =
       cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
diff --git a/cub/benchmarks/bench/select/flagged.cu b/cub/benchmarks/bench/select/flagged.cu
index 2562cec6c76..168e2bc0cc5 100644
--- a/cub/benchmarks/bench/select/flagged.cu
+++ b/cub/benchmarks/bench/select/flagged.cu
@@ -29,6 +29,8 @@
 
 #include <thrust/count.h>
 
+#include <cuda/std/__algorithm_>
+
 #include <look_back_helper.cuh>
 #include <nvbench_helper.cuh>
 
@@ -61,7 +63,7 @@ struct policy_hub_t
     static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
 
     static constexpr int ITEMS_PER_THREAD =
-      CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+      _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);
 
     using SelectIfPolicyT =
       cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
diff --git a/cub/benchmarks/bench/select/if.cu b/cub/benchmarks/bench/select/if.cu
index a36cf256434..98bdec915a7 100644
--- a/cub/benchmarks/bench/select/if.cu
+++ b/cub/benchmarks/bench/select/if.cu
@@ -29,6 +29,8 @@
 
 #include <thrust/count.h>
 
+#include <cuda/std/__algorithm_>
+
 #include <limits>
 
 #include <look_back_helper.cuh>
@@ -63,7 +65,7 @@ struct policy_hub_t
     static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
 
     static constexpr int ITEMS_PER_THREAD =
-      CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+      _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);
 
     using SelectIfPolicyT =
       cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
diff --git a/cub/benchmarks/bench/select/unique.cu b/cub/benchmarks/bench/select/unique.cu
index 88f469ae953..75c0c3771e0 100644
--- a/cub/benchmarks/bench/select/unique.cu
+++ b/cub/benchmarks/bench/select/unique.cu
@@ -3,6 +3,8 @@
 
 #include <cub/device/device_select.cuh>
 
+#include <cuda/std/__algorithm_>
+
 #include <limits>
 
 #include <look_back_helper.cuh>
@@ -36,8 +38,8 @@ struct policy_hub_t
   {
     static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
 
-    static constexpr int ITEMS_PER_THREAD =
-      CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+    static constexpr int ITEMS_PER_THREAD = _CUDA_VSTD::min(
+      NOMINAL_4B_ITEMS_PER_THREAD, _CUDA_VSTD::max(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
 
     using SelectIfPolicyT =
       cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh
index 0e46a47d49d..9f272961ef2 100644
--- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh
@@ -46,7 +46,8 @@
 #include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 
-#include <cuda/std/__algorithm/max.h>
+#include <cuda/cmath>
+#include <cuda/std/__algorithm_>
 
 CUB_NAMESPACE_BEGIN
 
@@ -315,9 +316,8 @@ struct policy_hub
     static constexpr int items =
       (max_input_bytes <= 8)
         ? 6
-        // TODO(bgruber): use clamp() and ceil_div in C++14
-        : CUB_MIN(nominal_4B_items_per_thread,
-                  CUB_MAX(1, ((nominal_4B_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes));
+        : ::cuda::std::clamp(
+            ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread);
     using ReduceByKeyPolicyT =
       AgentReduceByKeyPolicy<128,
                              items,
@@ -603,7 +603,7 @@ struct policy_hub
     static constexpr int nominal_4B_items_per_thread = 15;
     // TODO(bgruber): use clamp() in C++14
     static constexpr int ITEMS_PER_THREAD =
-      CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(KeyT))));
+      _CUDA_VSTD::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(KeyT)}, 1, nominal_4B_items_per_thread);
     using RleSweepPolicyT =
       AgentRlePolicy<96,
                      ITEMS_PER_THREAD,
diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh
index d3f99ee8ee8..ea2971dd3d2 100644
--- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh
@@ -45,6 +45,8 @@
 #include <cub/util_math.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/__algorithm_>
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail
@@ -1498,9 +1500,8 @@ struct policy_hub
   struct DefaultPolicy
   {
     static constexpr int nominal_4B_items_per_thread = 10;
-    // TODO(bgruber): use cuda::std::clamp() in C++14
     static constexpr int items_per_thread =
-      CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(InputT))));
+      ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(InputT)}, 1, nominal_4B_items_per_thread);
     using SelectIfPolicyT =
       AgentSelectIfPolicy<128,
                           items_per_thread,
diff --git a/thrust/testing/zip_iterator.cu b/thrust/testing/zip_iterator.cu
index 786b839099b..47b8033bfb1 100644
--- a/thrust/testing/zip_iterator.cu
+++ b/thrust/testing/zip_iterator.cu
@@ -332,25 +332,25 @@ struct TestZipIteratorTransform
     device_vector<T> d_result(n);
 
     // Tuples with 2 elements
-    transform(make_zip_iterator(h_data0.begin(), h_data1.begin()),
-              make_zip_iterator(h_data0.end(), h_data1.end()),
-              h_result.begin(),
-              SumTwoTuple());
-    transform(make_zip_iterator(d_data0.begin(), d_data1.begin()),
-              make_zip_iterator(d_data0.end(), d_data1.end()),
-              d_result.begin(),
-              SumTwoTuple());
+    thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin()),
+                      make_zip_iterator(h_data0.end(), h_data1.end()),
+                      h_result.begin(),
+                      SumTwoTuple());
+    thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin()),
+                      make_zip_iterator(d_data0.end(), d_data1.end()),
+                      d_result.begin(),
+                      SumTwoTuple());
     ASSERT_EQUAL(h_result, d_result);
 
     // Tuples with 3 elements
-    transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()),
-              make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()),
-              h_result.begin(),
-              SumThreeTuple());
-    transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()),
-              make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()),
-              d_result.begin(),
-              SumThreeTuple());
+    thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()),
+                      make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()),
+                      h_result.begin(),
+                      SumThreeTuple());
+    thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()),
+                      make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()),
+                      d_result.begin(),
+                      SumThreeTuple());
     ASSERT_EQUAL(h_result, d_result);
   }
 };