From 21ff0a07ab369dc1b7604a8ee3c0e0d74ee3d058 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 21 Jan 2025 11:13:14 +0100 Subject: [PATCH] Deprecate `cub::{min, max}` and replace internal uses with those from libcu++ (#3419) (#3447) Fixes #3404 --- cub/benchmarks/bench/radix_sort/keys.cu | 4 ++- cub/benchmarks/bench/radix_sort/pairs.cu | 4 ++- cub/cub/agent/agent_merge.cuh | 9 ++++--- cub/cub/agent/agent_merge_sort.cuh | 27 ++++++++++--------- cub/cub/agent/agent_spmv_orig.cuh | 10 ++++--- cub/cub/block/block_merge_sort.cuh | 12 +++++---- cub/cub/block/block_run_length_decode.cuh | 5 +++- cub/cub/block/radix_rank_sort_operations.cuh | 4 ++- cub/cub/detail/temporary_storage.cuh | 4 ++- .../device/dispatch/dispatch_batch_memcpy.cuh | 4 ++- cub/cub/device/dispatch/dispatch_merge.cuh | 5 +++- .../device/dispatch/dispatch_merge_sort.cuh | 9 ++++--- .../dispatch/dispatch_segmented_sort.cuh | 8 +++--- .../device/dispatch/dispatch_transform.cuh | 6 ++--- .../dispatch/tuning/tuning_histogram.cuh | 2 +- .../dispatch/tuning/tuning_reduce_by_key.cuh | 2 +- .../tuning/tuning_run_length_encode.cuh | 2 +- .../dispatch/tuning/tuning_scan_by_key.cuh | 2 +- cub/cub/util_macro.cuh | 2 ++ cub/cub/util_math.cuh | 20 ++++++++------ 20 files changed, 87 insertions(+), 54 deletions(-) diff --git a/cub/benchmarks/bench/radix_sort/keys.cu b/cub/benchmarks/bench/radix_sort/keys.cu index b6b9e4fd537..bd04bcf3d43 100644 --- a/cub/benchmarks/bench/radix_sort/keys.cu +++ b/cub/benchmarks/bench/radix_sort/keys.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -109,7 +110,8 @@ constexpr std::size_t max_onesweep_temp_storage_size() using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; using hist_agent = cub::AgentRadixSortHistogram; - return cub::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); + return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), + sizeof(typename hist_agent::TempStorage)); } template diff --git a/cub/benchmarks/bench/radix_sort/pairs.cu b/cub/benchmarks/bench/radix_sort/pairs.cu index 4a9f229bca4..35d589f453e 100644 --- a/cub/benchmarks/bench/radix_sort/pairs.cu +++ b/cub/benchmarks/bench/radix_sort/pairs.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -107,7 +108,8 @@ constexpr std::size_t max_onesweep_temp_storage_size() using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; using hist_agent = cub::AgentRadixSortHistogram; - return cub::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); + return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), + sizeof(typename hist_agent::TempStorage)); } template diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh index e1649b812ff..5e01932fbc8 100644 --- a/cub/cub/agent/agent_merge.cuh +++ b/cub/cub/agent/agent_merge.cuh @@ -22,7 +22,8 @@ #include -#include +#include +#include CUB_NAMESPACE_BEGIN namespace detail @@ -116,7 +117,7 @@ struct agent_t const Offset partition_end = merge_partitions[tile_idx + 1]; const Offset diag0 = items_per_tile * tile_idx; - const Offset diag1 = (cub::min)(keys1_count + keys2_count, diag0 + items_per_tile); + const Offset diag1 = (::cuda::std::min)(keys1_count + keys2_count, diag0 + items_per_tile); // compute bounding box for keys1 & keys2 const Offset keys1_beg = partition_beg; @@ -136,7 +137,7 @@ struct agent_t // use binary search in shared 
memory to find merge path for each of thread. // we can use int type here, because the number of items in shared memory is limited - const int diag0_loc = min(num_keys1 + num_keys2, items_per_thread * threadIdx.x); + const int diag0_loc = (::cuda::std::min)(num_keys1 + num_keys2, static_cast(items_per_thread * threadIdx.x)); const int keys1_beg_loc = MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op); @@ -215,7 +216,7 @@ struct agent_t const Offset tile_base = tile_idx * items_per_tile; // TODO(bgruber): random mixing of int and Offset const int items_in_tile = - static_cast(cub::min(static_cast(items_per_tile), keys1_count + keys2_count - tile_base)); + static_cast((::cuda::std::min)(static_cast(items_per_tile), keys1_count + keys2_count - tile_base)); if (items_in_tile == items_per_tile) { consume_tile(tile_idx, tile_base, items_per_tile); // full tile diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index 9c0c54fbeb2..8026c03d56d 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -45,6 +45,9 @@ #include +#include +#include + CUB_NAMESPACE_BEGIN template (blockIdx.x); auto num_tiles = static_cast(gridDim.x); auto tile_base = tile_idx * ITEMS_PER_TILE; - int items_in_tile = (cub::min)(keys_count - tile_base, int{ITEMS_PER_TILE}); + int items_in_tile = (::cuda::std::min)(static_cast(keys_count - tile_base), int{ITEMS_PER_TILE}); if (tile_idx < num_tiles - 1) { @@ -335,10 +338,10 @@ struct AgentPartition // partition_idx / target_merged_tiles_number const OffsetT local_tile_idx = mask & partition_idx; - const OffsetT keys1_beg = (cub::min)(keys_count, start); - const OffsetT keys1_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(start, size)); + const OffsetT keys1_beg = (::cuda::std::min)(keys_count, start); + const OffsetT keys1_end = (::cuda::std::min)(keys_count, detail::safe_add_bound_to_max(start, size)); const OffsetT keys2_beg = keys1_end; - const OffsetT keys2_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); + const OffsetT keys2_end = (::cuda::std::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); _CCCL_PDL_GRID_DEPENDENCY_SYNC(); @@ -349,7 +352,7 @@ struct AgentPartition } else { - const OffsetT partition_at = (cub::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); + const OffsetT partition_at = (::cuda::std::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); OffsetT partition_diag = ping @@ -526,15 +529,15 @@ struct AgentMerge // diag >= keys1_beg, because diag is the distance of the total merge path so far (keys1 + keys2) // diag+ITEMS_PER_TILE >= keys1_end, because diag+ITEMS_PER_TILE is the distance of the merge path for the next tile // and keys1_end is key1's component of that path - const OffsetT keys2_beg = (cub::min)(max_keys2, diag - keys1_beg); - OffsetT keys2_end = - (cub::min)(max_keys2, detail::safe_add_bound_to_max(diag, static_cast(ITEMS_PER_TILE)) - keys1_end); + const OffsetT keys2_beg = (::cuda::std::min)(max_keys2, diag - keys1_beg); + OffsetT keys2_end = (::cuda::std::min)( + max_keys2, detail::safe_add_bound_to_max(diag, static_cast(ITEMS_PER_TILE)) - keys1_end); // Check if it's the last tile in the tile group being merged if (mask == (mask & tile_idx)) { - keys1_end = (cub::min)(keys_count - start, size); - keys2_end = (cub::min)(max_keys2, size); + keys1_end = (::cuda::std::min)(keys_count - start, size); + keys2_end = 
(::cuda::std::min)(max_keys2, size); } // number of keys per tile @@ -591,7 +594,7 @@ struct AgentMerge // we can use int type here, because the number of // items in shared memory is limited // - const int diag0_local = (cub::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid); + const int diag0_local = (::cuda::std::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid); const int keys1_beg_local = MergePath( &storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_local, compare_op); @@ -731,7 +734,7 @@ struct AgentMerge const OffsetT tile_base = OffsetT(tile_idx) * ITEMS_PER_TILE; const int tid = static_cast(threadIdx.x); const int items_in_tile = - static_cast((cub::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); + static_cast((::cuda::std::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); if (tile_idx < num_tiles - 1) { diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh index 12ba663770f..2ad0bee84a6 100644 --- a/cub/cub/agent/agent_spmv_orig.cuh +++ b/cub/cub/agent/agent_spmv_orig.cuh @@ -52,6 +52,8 @@ #include #include +#include +#include #include #include @@ -367,8 +369,8 @@ struct AgentSpmv // Gather the row end-offsets for the merge tile into shared memory for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) { - const OffsetT offset = - (cub::min)(static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); + const OffsetT offset = (::cuda::std::min)( + static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } @@ -548,8 +550,8 @@ struct AgentSpmv #pragma unroll 1 for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) { - const OffsetT offset = - (cub::min)(static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); + const OffsetT offset = (::cuda::std::min)( + static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index 7f931b0d42d..3ade5eb1609 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -43,6 +43,8 @@ #include #include +#include +#include #include CUB_NAMESPACE_BEGIN @@ -58,7 +60,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT MergePath(KeyIt1 keys1, KeyIt2 keys2, OffsetT keys1_count, OffsetT keys2_count, OffsetT diag, BinaryPred binary_pred) { OffsetT keys1_begin = diag < keys2_count ? 
0 : diag - keys2_count; - OffsetT keys1_end = (cub::min)(diag, keys1_count); + OffsetT keys1_end = (::cuda::std::min)(diag, keys1_count); while (keys1_begin < keys1_end) { @@ -425,12 +427,12 @@ public: int thread_idx_in_thread_group_being_merged = mask & linear_tid; - int diag = (cub::min)(valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged); + int diag = (::cuda::std::min)(valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged); - int keys1_beg = (cub::min)(valid_items, start); - int keys1_end = (cub::min)(valid_items, keys1_beg + size); + int keys1_beg = (::cuda::std::min)(valid_items, start); + int keys1_end = (::cuda::std::min)(valid_items, keys1_beg + size); int keys2_beg = keys1_end; - int keys2_end = (cub::min)(valid_items, keys2_beg + size); + int keys2_end = (::cuda::std::min)(valid_items, keys2_beg + size); int keys1_count = keys1_end - keys1_beg; int keys2_count = keys2_end - keys2_beg; diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index 2138ed31d7e..467d9141dc3 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -44,6 +44,9 @@ #include #include +#include +#include + #include #include @@ -284,7 +287,7 @@ private: for (int i = 0; i <= Log2::VALUE; i++) { OffsetT mid = cub::MidPoint(lower_bound, upper_bound); - mid = (cub::min)(mid, num_items - 1); + mid = (::cuda::std::min)(mid, num_items - 1); if (val < input[mid]) { diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh index d4fdd9c405f..35bdfe8ee02 100644 --- a/cub/cub/block/radix_rank_sort_operations.cuh +++ b/cub/cub/block/radix_rank_sort_operations.cuh @@ -49,6 +49,8 @@ #include +#include +#include #include #include #include @@ -437,7 +439,7 @@ struct digit_f using traits = traits_t::type>; using bit_ordered_type = typename traits::bit_ordered_type; - const ::cuda::std::uint32_t bits_to_copy = min(src_size - src_bit_start, num_bits); + const ::cuda::std::uint32_t bits_to_copy = (::cuda::std::min)(src_size - src_bit_start, num_bits); if (bits_to_copy) { diff --git a/cub/cub/detail/temporary_storage.cuh b/cub/cub/detail/temporary_storage.cuh index cf5f98e775a..f271ce804a9 100644 --- a/cub/cub/detail/temporary_storage.cuh +++ b/cub/cub/detail/temporary_storage.cuh @@ -29,6 +29,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -96,7 +98,7 @@ public: private: _CCCL_HOST_DEVICE void set_bytes_required(std::size_t new_size) { - m_size = (max) (m_size, new_size); + m_size = (::cuda::std::max)(m_size, new_size); } _CCCL_HOST_DEVICE std::size_t get_bytes_required() const diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 287f702b095..de9e35bd859 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -54,6 +54,8 @@ #include +#include +#include #include #include @@ -173,7 +175,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO copy_items( input_buffer_it[buffer_id], output_buffer_it[buffer_id], - (cub::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE), + (::cuda::std::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE), tile_offset_within_buffer); } diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index ff43656c5c5..fa467cdd5fe 100644 --- 
a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -21,6 +21,9 @@ #include +#include +#include + CUB_NAMESPACE_BEGIN namespace detail { @@ -80,7 +83,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel( const Offset partition_idx = blockDim.x * blockIdx.x + threadIdx.x; if (partition_idx < num_partitions) { - const Offset partition_at = (cub::min)(partition_idx * items_per_tile, keys1_count + keys2_count); + const Offset partition_at = (::cuda::std::min)(partition_idx * items_per_tile, keys1_count + keys2_count); merge_partitions[partition_idx] = cub::MergePath(keys1, keys2, keys1_count, keys2_count, partition_at, compare_op); } } diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 507b7776de6..4a83eeb19c4 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -47,6 +47,8 @@ #include #include +#include +#include #include CUB_NAMESPACE_BEGIN @@ -125,9 +127,10 @@ private: // Use fallback if either (a) the default block sort or (b) the block merge agent exceed the maximum shared memory // available per block and both (1) the fallback block sort and (2) the fallback merge agent would not exceed the // available shared memory - static constexpr auto max_default_size = (cub::max)(block_sort_helper_t::default_size, merge_helper_t::default_size); + static constexpr auto max_default_size = + (::cuda::std::max)(block_sort_helper_t::default_size, merge_helper_t::default_size); static constexpr auto max_fallback_size = - (cub::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size); + (::cuda::std::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size); static constexpr bool uses_fallback_policy = (max_default_size > max_smem_per_block) && (max_fallback_size <= max_smem_per_block); @@ -472,7 +475,7 @@ struct DispatchMergeSort */ const std::size_t block_sort_smem_size = num_tiles * BlockSortVSmemHelperT::vsmem_per_block; const std::size_t merge_smem_size = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block; - const std::size_t virtual_shared_memory_size = (cub::max)(block_sort_smem_size, merge_smem_size); + const std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size); void* allocations[4] = {nullptr, nullptr, nullptr, nullptr}; std::size_t allocation_sizes[4] = { diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index e96511ffa41..15ea587390b 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -187,7 +187,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { // Sort by a CTA with multiple reads from global memory int current_bit = begin_bit; - int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + int pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); d_keys_double_buffer = cub::detail::device_double_buffer( d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin); @@ -210,7 +210,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD #pragma unroll 1 while (current_bit < end_bit) { - pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + pass_bits = 
(::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); __syncthreads(); agent.ProcessIterative( @@ -461,7 +461,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { // Sort reading global memory multiple times int current_bit = begin_bit; - int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + int pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); d_keys_double_buffer = cub::detail::device_double_buffer( d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin); @@ -484,7 +484,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD #pragma unroll 1 while (current_bit < end_bit) { - pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); __syncthreads(); agent.ProcessIterative( diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index 386a6276dfa..ed2e104746e 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -118,7 +118,7 @@ _CCCL_DEVICE void transform_kernel_impl( constexpr int block_dim = PrefetchPolicy::block_threads; const int tile_stride = block_dim * num_elem_per_thread; const Offset offset = static_cast(blockIdx.x) * tile_stride; - const int tile_size = static_cast(::cuda::std::min(num_items - offset, Offset{tile_stride})); + const int tile_size = static_cast((::cuda::std::min)(num_items - offset, Offset{tile_stride})); // move index and iterator domain to the block/thread index, to reduce arithmetic in the loops below { @@ -330,7 +330,7 @@ _CCCL_DEVICE void transform_kernel_ublkcp( constexpr int block_dim = BulkCopyPolicy::block_threads; const int tile_stride = block_dim * num_elem_per_thread; const Offset offset = static_cast(blockIdx.x) * tile_stride; - const int tile_size = ::cuda::std::min(num_items - offset, Offset{tile_stride}); + const int tile_size = (::cuda::std::min)(num_items - offset, Offset{tile_stride}); const bool inner_blocks = 0 < blockIdx.x && blockIdx.x + 2 < gridDim.x; if (inner_blocks) @@ -813,7 +813,7 @@ struct dispatch_t( - ::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim * config->max_occupancy))); + (::cuda::std::min)(Offset{items_per_thread}, num_items / (config->sm_count * block_dim * config->max_occupancy))); const int items_per_thread_clamped = ::cuda::std::clamp( items_per_thread_evenly_spread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread); diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index 3932ac74c68..1a06c25cb92 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -133,7 +133,7 @@ struct policy_hub static constexpr int t_scale(int nominalItemsPerThread) { - return ::cuda::std::max(nominalItemsPerThread / NumActiveChannels / v_scale, 1); + return (::cuda::std::max)(nominalItemsPerThread / NumActiveChannels / v_scale, 1); } // SM35 diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 02bfb443fc1..41fbb2c49a4 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ 
b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -610,7 +610,7 @@ struct sm90_tuning struct policy_hub { - static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(KeyT), sizeof(AccumT))); + static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(KeyT), sizeof(AccumT))); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(AccumT); template diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index 33771f6882f..87631d1199e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -236,7 +236,7 @@ struct sm90_tuning struct policy_hub { - static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(KeyT), sizeof(LengthT))); + static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(KeyT), sizeof(LengthT))); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(LengthT); template diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index fc8add23a22..b3eaa4e513c 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -714,7 +714,7 @@ template ; - static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(key_t), sizeof(AccumT))); + static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(key_t), sizeof(AccumT))); static constexpr int combined_input_bytes = static_cast(sizeof(key_t) + sizeof(AccumT)); struct Policy350 : ChainedPolicy<350, Policy350, Policy350> diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index ae42e5fe875..50ae43f8ab7 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -52,6 +52,7 @@ CUB_NAMESPACE_BEGIN #ifndef _CCCL_DOXYGEN_INVOKED // Do not document # define CUB_PREVENT_MACRO_SUBSTITUTION template +CCCL_DEPRECATED_BECAUSE("Use cuda::std::min from instead") constexpr _CCCL_HOST_DEVICE auto min CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) -> decltype(t < u ? ::cuda::std::forward(t) : ::cuda::std::forward(u)) { @@ -59,6 +60,7 @@ constexpr _CCCL_HOST_DEVICE auto min CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u } template +CCCL_DEPRECATED_BECAUSE("Use cuda::std::max from instead") constexpr _CCCL_HOST_DEVICE auto max CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) -> decltype(t < u ? 
::cuda::std::forward(u) : ::cuda::std::forward(t)) { diff --git a/cub/cub/util_math.cuh b/cub/cub/util_math.cuh index 60c81364b03..b6d203c7f7a 100644 --- a/cub/cub/util_math.cuh +++ b/cub/cub/util_math.cuh @@ -43,6 +43,8 @@ #endif // no system header #include +#include +#include #include CUB_NAMESPACE_BEGIN @@ -66,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE OffsetT safe_add_bound_to_max(OffsetT lhs, O { static_assert(::cuda::std::is_integral::value, "OffsetT must be an integral type"); static_assert(sizeof(OffsetT) >= 4, "OffsetT must be at least 32 bits in size"); - auto const capped_operand_rhs = (cub::min)(rhs, ::cuda::std::numeric_limits::max() - lhs); + auto const capped_operand_rhs = (::cuda::std::min)(rhs, ::cuda::std::numeric_limits::max() - lhs); return lhs + capped_operand_rhs; } @@ -91,14 +93,15 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr NumeratorT DivideAndRoundUp(Numera constexpr _CCCL_HOST_DEVICE int Nominal4BItemsToItemsCombined(int nominal_4b_items_per_thread, int combined_bytes) { - return (cub::min)(nominal_4b_items_per_thread, (cub::max)(1, nominal_4b_items_per_thread * 8 / combined_bytes)); + return (::cuda::std::min)(nominal_4b_items_per_thread, + (::cuda::std::max)(1, nominal_4b_items_per_thread * 8 / combined_bytes)); } template constexpr _CCCL_HOST_DEVICE int Nominal4BItemsToItems(int nominal_4b_items_per_thread) { - return (cub::min)(nominal_4b_items_per_thread, - (cub::max)(1, nominal_4b_items_per_thread * 4 / static_cast(sizeof(T)))); + return (::cuda::std::min)(nominal_4b_items_per_thread, + (::cuda::std::max)(1, nominal_4b_items_per_thread * 4 / static_cast(sizeof(T)))); } template @@ -106,10 +109,11 @@ constexpr _CCCL_HOST_DEVICE int Nominal8BItemsToItems(int nominal_8b_items_per_t { return sizeof(ItemT) <= 8u ? nominal_8b_items_per_thread - : (cub::min)(nominal_8b_items_per_thread, - (cub::max)(1, - ((nominal_8b_items_per_thread * 8) + static_cast(sizeof(ItemT)) - 1) - / static_cast(sizeof(ItemT)))); + : (::cuda::std::min)( + nominal_8b_items_per_thread, + (::cuda::std::max)(1, + ((nominal_8b_items_per_thread * 8) + static_cast(sizeof(ItemT)) - 1) + / static_cast(sizeof(ItemT)))); } /**
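
For reference, the replacement pattern applied throughout this patch, shown as a minimal standalone sketch. The function and variable names below are illustrative only and are not part of the diff:

    // Sketch: migrating a caller from the now-deprecated cub::min/cub::max
    // to the libcu++ equivalents used by this patch.
    #include <cuda/std/algorithm> // cuda::std::min / cuda::std::max

    __host__ __device__ int clamp_tile_size(int items_remaining, int items_per_tile)
    {
      // Before (now emits a CCCL_DEPRECATED_BECAUSE warning):
      //   return (cub::min)(items_remaining, items_per_tile);
      //
      // After -- the parentheses around the function name prevent substitution
      // by any min/max macros (e.g. from <windows.h>), matching the
      // (::cuda::std::min)/(::cuda::std::max) spelling used across the diff:
      return (::cuda::std::min)(items_remaining, items_per_tile);
    }

Note that both arguments must have the same type for ::cuda::std::min/max, which is why several call sites in the patch gained an explicit static_cast alongside the rename.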