From 21ff0a07ab369dc1b7604a8ee3c0e0d74ee3d058 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 21 Jan 2025 11:13:14 +0100 Subject: [PATCH] Deprecate `cub::{min, max}` and replace internal uses with those from libcu++ (#3419) (#3447) Fixes #3404 --- cub/benchmarks/bench/radix_sort/keys.cu | 4 ++- cub/benchmarks/bench/radix_sort/pairs.cu | 4 ++- cub/cub/agent/agent_merge.cuh | 9 ++++--- cub/cub/agent/agent_merge_sort.cuh | 27 ++++++++++--------- cub/cub/agent/agent_spmv_orig.cuh | 10 ++++--- cub/cub/block/block_merge_sort.cuh | 12 +++++---- cub/cub/block/block_run_length_decode.cuh | 5 +++- cub/cub/block/radix_rank_sort_operations.cuh | 4 ++- cub/cub/detail/temporary_storage.cuh | 4 ++- .../device/dispatch/dispatch_batch_memcpy.cuh | 4 ++- cub/cub/device/dispatch/dispatch_merge.cuh | 5 +++- .../device/dispatch/dispatch_merge_sort.cuh | 9 ++++--- .../dispatch/dispatch_segmented_sort.cuh | 8 +++--- .../device/dispatch/dispatch_transform.cuh | 6 ++--- .../dispatch/tuning/tuning_histogram.cuh | 2 +- .../dispatch/tuning/tuning_reduce_by_key.cuh | 2 +- .../tuning/tuning_run_length_encode.cuh | 2 +- .../dispatch/tuning/tuning_scan_by_key.cuh | 2 +- cub/cub/util_macro.cuh | 2 ++ cub/cub/util_math.cuh | 20 ++++++++------ 20 files changed, 87 insertions(+), 54 deletions(-) diff --git a/cub/benchmarks/bench/radix_sort/keys.cu b/cub/benchmarks/bench/radix_sort/keys.cu index b6b9e4fd537..bd04bcf3d43 100644 --- a/cub/benchmarks/bench/radix_sort/keys.cu +++ b/cub/benchmarks/bench/radix_sort/keys.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -109,7 +110,8 @@ constexpr std::size_t max_onesweep_temp_storage_size() using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; using hist_agent = cub::AgentRadixSortHistogram; - return cub::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); + return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), + sizeof(typename hist_agent::TempStorage)); } template diff --git a/cub/benchmarks/bench/radix_sort/pairs.cu b/cub/benchmarks/bench/radix_sort/pairs.cu index 4a9f229bca4..35d589f453e 100644 --- a/cub/benchmarks/bench/radix_sort/pairs.cu +++ b/cub/benchmarks/bench/radix_sort/pairs.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -107,7 +108,8 @@ constexpr std::size_t max_onesweep_temp_storage_size() using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; using hist_agent = cub::AgentRadixSortHistogram; - return cub::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); + return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), + sizeof(typename hist_agent::TempStorage)); } template diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh index e1649b812ff..5e01932fbc8 100644 --- a/cub/cub/agent/agent_merge.cuh +++ b/cub/cub/agent/agent_merge.cuh @@ -22,7 +22,8 @@ #include -#include +#include +#include CUB_NAMESPACE_BEGIN namespace detail @@ -116,7 +117,7 @@ struct agent_t const Offset partition_end = merge_partitions[tile_idx + 1]; const Offset diag0 = items_per_tile * tile_idx; - const Offset diag1 = (cub::min)(keys1_count + keys2_count, diag0 + items_per_tile); + const Offset diag1 = (::cuda::std::min)(keys1_count + keys2_count, diag0 + items_per_tile); // compute bounding box for keys1 & keys2 const Offset keys1_beg = partition_beg; @@ -136,7 +137,7 @@ struct agent_t // use binary search in shared 
memory to find merge path for each of thread. // we can use int type here, because the number of items in shared memory is limited - const int diag0_loc = min(num_keys1 + num_keys2, items_per_thread * threadIdx.x); + const int diag0_loc = (::cuda::std::min)(num_keys1 + num_keys2, static_cast(items_per_thread * threadIdx.x)); const int keys1_beg_loc = MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op); @@ -215,7 +216,7 @@ struct agent_t const Offset tile_base = tile_idx * items_per_tile; // TODO(bgruber): random mixing of int and Offset const int items_in_tile = - static_cast(cub::min(static_cast(items_per_tile), keys1_count + keys2_count - tile_base)); + static_cast((::cuda::std::min)(static_cast(items_per_tile), keys1_count + keys2_count - tile_base)); if (items_in_tile == items_per_tile) { consume_tile(tile_idx, tile_base, items_per_tile); // full tile diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index 9c0c54fbeb2..8026c03d56d 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -45,6 +45,9 @@ #include +#include +#include + CUB_NAMESPACE_BEGIN template (blockIdx.x); auto num_tiles = static_cast(gridDim.x); auto tile_base = tile_idx * ITEMS_PER_TILE; - int items_in_tile = (cub::min)(keys_count - tile_base, int{ITEMS_PER_TILE}); + int items_in_tile = (::cuda::std::min)(static_cast(keys_count - tile_base), int{ITEMS_PER_TILE}); if (tile_idx < num_tiles - 1) { @@ -335,10 +338,10 @@ struct AgentPartition // partition_idx / target_merged_tiles_number const OffsetT local_tile_idx = mask & partition_idx; - const OffsetT keys1_beg = (cub::min)(keys_count, start); - const OffsetT keys1_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(start, size)); + const OffsetT keys1_beg = (::cuda::std::min)(keys_count, start); + const OffsetT keys1_end = (::cuda::std::min)(keys_count, detail::safe_add_bound_to_max(start, size)); const OffsetT keys2_beg = keys1_end; - const OffsetT keys2_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); + const OffsetT keys2_end = (::cuda::std::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); _CCCL_PDL_GRID_DEPENDENCY_SYNC(); @@ -349,7 +352,7 @@ struct AgentPartition } else { - const OffsetT partition_at = (cub::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); + const OffsetT partition_at = (::cuda::std::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); OffsetT partition_diag = ping @@ -526,15 +529,15 @@ struct AgentMerge // diag >= keys1_beg, because diag is the distance of the total merge path so far (keys1 + keys2) // diag+ITEMS_PER_TILE >= keys1_end, because diag+ITEMS_PER_TILE is the distance of the merge path for the next tile // and keys1_end is key1's component of that path - const OffsetT keys2_beg = (cub::min)(max_keys2, diag - keys1_beg); - OffsetT keys2_end = - (cub::min)(max_keys2, detail::safe_add_bound_to_max(diag, static_cast(ITEMS_PER_TILE)) - keys1_end); + const OffsetT keys2_beg = (::cuda::std::min)(max_keys2, diag - keys1_beg); + OffsetT keys2_end = (::cuda::std::min)( + max_keys2, detail::safe_add_bound_to_max(diag, static_cast(ITEMS_PER_TILE)) - keys1_end); // Check if it's the last tile in the tile group being merged if (mask == (mask & tile_idx)) { - keys1_end = (cub::min)(keys_count - start, size); - keys2_end = (cub::min)(max_keys2, size); + keys1_end = (::cuda::std::min)(keys_count - start, size); + keys2_end = 
(::cuda::std::min)(max_keys2, size); } // number of keys per tile @@ -591,7 +594,7 @@ struct AgentMerge // we can use int type here, because the number of // items in shared memory is limited // - const int diag0_local = (cub::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid); + const int diag0_local = (::cuda::std::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid); const int keys1_beg_local = MergePath( &storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_local, compare_op); @@ -731,7 +734,7 @@ struct AgentMerge const OffsetT tile_base = OffsetT(tile_idx) * ITEMS_PER_TILE; const int tid = static_cast(threadIdx.x); const int items_in_tile = - static_cast((cub::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); + static_cast((::cuda::std::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); if (tile_idx < num_tiles - 1) { diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh index 12ba663770f..2ad0bee84a6 100644 --- a/cub/cub/agent/agent_spmv_orig.cuh +++ b/cub/cub/agent/agent_spmv_orig.cuh @@ -52,6 +52,8 @@ #include #include +#include +#include #include #include @@ -367,8 +369,8 @@ struct AgentSpmv // Gather the row end-offsets for the merge tile into shared memory for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) { - const OffsetT offset = - (cub::min)(static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); + const OffsetT offset = (::cuda::std::min)( + static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } @@ -548,8 +550,8 @@ struct AgentSpmv #pragma unroll 1 for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) { - const OffsetT offset = - (cub::min)(static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); + const OffsetT offset = (::cuda::std::min)( + static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index 7f931b0d42d..3ade5eb1609 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -43,6 +43,8 @@ #include #include +#include +#include #include CUB_NAMESPACE_BEGIN @@ -58,7 +60,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT MergePath(KeyIt1 keys1, KeyIt2 keys2, OffsetT keys1_count, OffsetT keys2_count, OffsetT diag, BinaryPred binary_pred) { OffsetT keys1_begin = diag < keys2_count ? 
0 : diag - keys2_count; - OffsetT keys1_end = (cub::min)(diag, keys1_count); + OffsetT keys1_end = (::cuda::std::min)(diag, keys1_count); while (keys1_begin < keys1_end) { @@ -425,12 +427,12 @@ public: int thread_idx_in_thread_group_being_merged = mask & linear_tid; - int diag = (cub::min)(valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged); + int diag = (::cuda::std::min)(valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged); - int keys1_beg = (cub::min)(valid_items, start); - int keys1_end = (cub::min)(valid_items, keys1_beg + size); + int keys1_beg = (::cuda::std::min)(valid_items, start); + int keys1_end = (::cuda::std::min)(valid_items, keys1_beg + size); int keys2_beg = keys1_end; - int keys2_end = (cub::min)(valid_items, keys2_beg + size); + int keys2_end = (::cuda::std::min)(valid_items, keys2_beg + size); int keys1_count = keys1_end - keys1_beg; int keys2_count = keys2_end - keys2_beg; diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index 2138ed31d7e..467d9141dc3 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -44,6 +44,9 @@ #include #include +#include +#include + #include #include @@ -284,7 +287,7 @@ private: for (int i = 0; i <= Log2::VALUE; i++) { OffsetT mid = cub::MidPoint(lower_bound, upper_bound); - mid = (cub::min)(mid, num_items - 1); + mid = (::cuda::std::min)(mid, num_items - 1); if (val < input[mid]) { diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh index d4fdd9c405f..35bdfe8ee02 100644 --- a/cub/cub/block/radix_rank_sort_operations.cuh +++ b/cub/cub/block/radix_rank_sort_operations.cuh @@ -49,6 +49,8 @@ #include +#include +#include #include #include #include @@ -437,7 +439,7 @@ struct digit_f using traits = traits_t::type>; using bit_ordered_type = typename traits::bit_ordered_type; - const ::cuda::std::uint32_t bits_to_copy = min(src_size - src_bit_start, num_bits); + const ::cuda::std::uint32_t bits_to_copy = (::cuda::std::min)(src_size - src_bit_start, num_bits); if (bits_to_copy) { diff --git a/cub/cub/detail/temporary_storage.cuh b/cub/cub/detail/temporary_storage.cuh index cf5f98e775a..f271ce804a9 100644 --- a/cub/cub/detail/temporary_storage.cuh +++ b/cub/cub/detail/temporary_storage.cuh @@ -29,6 +29,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -96,7 +98,7 @@ public: private: _CCCL_HOST_DEVICE void set_bytes_required(std::size_t new_size) { - m_size = (max) (m_size, new_size); + m_size = (::cuda::std::max)(m_size, new_size); } _CCCL_HOST_DEVICE std::size_t get_bytes_required() const diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 287f702b095..de9e35bd859 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -54,6 +54,8 @@ #include +#include +#include #include #include @@ -173,7 +175,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO copy_items( input_buffer_it[buffer_id], output_buffer_it[buffer_id], - (cub::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE), + (::cuda::std::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE), tile_offset_within_buffer); } diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index ff43656c5c5..fa467cdd5fe 100644 --- 
a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -21,6 +21,9 @@ #include +#include +#include + CUB_NAMESPACE_BEGIN namespace detail { @@ -80,7 +83,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel( const Offset partition_idx = blockDim.x * blockIdx.x + threadIdx.x; if (partition_idx < num_partitions) { - const Offset partition_at = (cub::min)(partition_idx * items_per_tile, keys1_count + keys2_count); + const Offset partition_at = (::cuda::std::min)(partition_idx * items_per_tile, keys1_count + keys2_count); merge_partitions[partition_idx] = cub::MergePath(keys1, keys2, keys1_count, keys2_count, partition_at, compare_op); } } diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 507b7776de6..4a83eeb19c4 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -47,6 +47,8 @@ #include #include +#include +#include #include CUB_NAMESPACE_BEGIN @@ -125,9 +127,10 @@ private: // Use fallback if either (a) the default block sort or (b) the block merge agent exceed the maximum shared memory // available per block and both (1) the fallback block sort and (2) the fallback merge agent would not exceed the // available shared memory - static constexpr auto max_default_size = (cub::max)(block_sort_helper_t::default_size, merge_helper_t::default_size); + static constexpr auto max_default_size = + (::cuda::std::max)(block_sort_helper_t::default_size, merge_helper_t::default_size); static constexpr auto max_fallback_size = - (cub::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size); + (::cuda::std::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size); static constexpr bool uses_fallback_policy = (max_default_size > max_smem_per_block) && (max_fallback_size <= max_smem_per_block); @@ -472,7 +475,7 @@ struct DispatchMergeSort */ const std::size_t block_sort_smem_size = num_tiles * BlockSortVSmemHelperT::vsmem_per_block; const std::size_t merge_smem_size = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block; - const std::size_t virtual_shared_memory_size = (cub::max)(block_sort_smem_size, merge_smem_size); + const std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size); void* allocations[4] = {nullptr, nullptr, nullptr, nullptr}; std::size_t allocation_sizes[4] = { diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index e96511ffa41..15ea587390b 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -187,7 +187,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { // Sort by a CTA with multiple reads from global memory int current_bit = begin_bit; - int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + int pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); d_keys_double_buffer = cub::detail::device_double_buffer( d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin); @@ -210,7 +210,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD #pragma unroll 1 while (current_bit < end_bit) { - pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + pass_bits = 
(::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); __syncthreads(); agent.ProcessIterative( @@ -461,7 +461,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD { // Sort reading global memory multiple times int current_bit = begin_bit; - int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + int pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); d_keys_double_buffer = cub::detail::device_double_buffer( d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin); @@ -484,7 +484,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD #pragma unroll 1 while (current_bit < end_bit) { - pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); + pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); __syncthreads(); agent.ProcessIterative( diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index 386a6276dfa..ed2e104746e 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -118,7 +118,7 @@ _CCCL_DEVICE void transform_kernel_impl( constexpr int block_dim = PrefetchPolicy::block_threads; const int tile_stride = block_dim * num_elem_per_thread; const Offset offset = static_cast(blockIdx.x) * tile_stride; - const int tile_size = static_cast(::cuda::std::min(num_items - offset, Offset{tile_stride})); + const int tile_size = static_cast((::cuda::std::min)(num_items - offset, Offset{tile_stride})); // move index and iterator domain to the block/thread index, to reduce arithmetic in the loops below { @@ -330,7 +330,7 @@ _CCCL_DEVICE void transform_kernel_ublkcp( constexpr int block_dim = BulkCopyPolicy::block_threads; const int tile_stride = block_dim * num_elem_per_thread; const Offset offset = static_cast(blockIdx.x) * tile_stride; - const int tile_size = ::cuda::std::min(num_items - offset, Offset{tile_stride}); + const int tile_size = (::cuda::std::min)(num_items - offset, Offset{tile_stride}); const bool inner_blocks = 0 < blockIdx.x && blockIdx.x + 2 < gridDim.x; if (inner_blocks) @@ -813,7 +813,7 @@ struct dispatch_t( - ::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim * config->max_occupancy))); + (::cuda::std::min)(Offset{items_per_thread}, num_items / (config->sm_count * block_dim * config->max_occupancy))); const int items_per_thread_clamped = ::cuda::std::clamp( items_per_thread_evenly_spread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread); diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index 3932ac74c68..1a06c25cb92 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -133,7 +133,7 @@ struct policy_hub static constexpr int t_scale(int nominalItemsPerThread) { - return ::cuda::std::max(nominalItemsPerThread / NumActiveChannels / v_scale, 1); + return (::cuda::std::max)(nominalItemsPerThread / NumActiveChannels / v_scale, 1); } // SM35 diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 02bfb443fc1..41fbb2c49a4 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ 
b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -610,7 +610,7 @@ struct sm90_tuning struct policy_hub { - static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(KeyT), sizeof(AccumT))); + static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(KeyT), sizeof(AccumT))); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(AccumT); template diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index 33771f6882f..87631d1199e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -236,7 +236,7 @@ struct sm90_tuning struct policy_hub { - static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(KeyT), sizeof(LengthT))); + static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(KeyT), sizeof(LengthT))); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(LengthT); template diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index fc8add23a22..b3eaa4e513c 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -714,7 +714,7 @@ template ; - static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(key_t), sizeof(AccumT))); + static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(key_t), sizeof(AccumT))); static constexpr int combined_input_bytes = static_cast(sizeof(key_t) + sizeof(AccumT)); struct Policy350 : ChainedPolicy<350, Policy350, Policy350> diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index ae42e5fe875..50ae43f8ab7 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -52,6 +52,7 @@ CUB_NAMESPACE_BEGIN #ifndef _CCCL_DOXYGEN_INVOKED // Do not document # define CUB_PREVENT_MACRO_SUBSTITUTION template +CCCL_DEPRECATED_BECAUSE("Use cuda::std::min from instead") constexpr _CCCL_HOST_DEVICE auto min CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) -> decltype(t < u ? ::cuda::std::forward(t) : ::cuda::std::forward(u)) { @@ -59,6 +60,7 @@ constexpr _CCCL_HOST_DEVICE auto min CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u } template +CCCL_DEPRECATED_BECAUSE("Use cuda::std::max from instead") constexpr _CCCL_HOST_DEVICE auto max CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) -> decltype(t < u ? 
::cuda::std::forward(u) : ::cuda::std::forward(t)) { diff --git a/cub/cub/util_math.cuh b/cub/cub/util_math.cuh index 60c81364b03..b6d203c7f7a 100644 --- a/cub/cub/util_math.cuh +++ b/cub/cub/util_math.cuh @@ -43,6 +43,8 @@ #endif // no system header #include +#include +#include #include CUB_NAMESPACE_BEGIN @@ -66,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE OffsetT safe_add_bound_to_max(OffsetT lhs, O { static_assert(::cuda::std::is_integral::value, "OffsetT must be an integral type"); static_assert(sizeof(OffsetT) >= 4, "OffsetT must be at least 32 bits in size"); - auto const capped_operand_rhs = (cub::min)(rhs, ::cuda::std::numeric_limits::max() - lhs); + auto const capped_operand_rhs = (::cuda::std::min)(rhs, ::cuda::std::numeric_limits::max() - lhs); return lhs + capped_operand_rhs; } @@ -91,14 +93,15 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr NumeratorT DivideAndRoundUp(Numera constexpr _CCCL_HOST_DEVICE int Nominal4BItemsToItemsCombined(int nominal_4b_items_per_thread, int combined_bytes) { - return (cub::min)(nominal_4b_items_per_thread, (cub::max)(1, nominal_4b_items_per_thread * 8 / combined_bytes)); + return (::cuda::std::min)(nominal_4b_items_per_thread, + (::cuda::std::max)(1, nominal_4b_items_per_thread * 8 / combined_bytes)); } template constexpr _CCCL_HOST_DEVICE int Nominal4BItemsToItems(int nominal_4b_items_per_thread) { - return (cub::min)(nominal_4b_items_per_thread, - (cub::max)(1, nominal_4b_items_per_thread * 4 / static_cast(sizeof(T)))); + return (::cuda::std::min)(nominal_4b_items_per_thread, + (::cuda::std::max)(1, nominal_4b_items_per_thread * 4 / static_cast(sizeof(T)))); } template @@ -106,10 +109,11 @@ constexpr _CCCL_HOST_DEVICE int Nominal8BItemsToItems(int nominal_8b_items_per_t { return sizeof(ItemT) <= 8u ? nominal_8b_items_per_thread - : (cub::min)(nominal_8b_items_per_thread, - (cub::max)(1, - ((nominal_8b_items_per_thread * 8) + static_cast(sizeof(ItemT)) - 1) - / static_cast(sizeof(ItemT)))); + : (::cuda::std::min)( + nominal_8b_items_per_thread, + (::cuda::std::max)(1, + ((nominal_8b_items_per_thread * 8) + static_cast(sizeof(ItemT)) - 1) + / static_cast(sizeof(ItemT)))); } /**
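
For reference, the replacement pattern applied throughout this patch, shown as a minimal standalone sketch. The function and variable names below are illustrative only and are not part of the diff:

    // Sketch: migrating a caller from the now-deprecated cub::min/cub::max
    // to the libcu++ equivalents used by this patch.
    #include <cuda/std/algorithm> // cuda::std::min / cuda::std::max

    __host__ __device__ int clamp_tile_size(int items_remaining, int items_per_tile)
    {
      // Before (now emits a CCCL_DEPRECATED_BECAUSE warning):
      //   return (cub::min)(items_remaining, items_per_tile);
      //
      // After -- the parentheses around the function name prevent substitution
      // by any min/max macros (e.g. from <windows.h>), matching the
      // (::cuda::std::min)/(::cuda::std::max) spelling used across the diff:
      return (::cuda::std::min)(items_remaining, items_per_tile);
    }

Note that both arguments must have the same type for ::cuda::std::min/max, which is why several call sites in the patch gained an explicit static_cast alongside the rename.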