diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index 383cfc0a895..4b458022c33 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -14,6 +14,7 @@ #include #include +#include #include #include // ::cuda::std::identity #include @@ -97,8 +98,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type) auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain); // Implement part of MemBoundScaling - items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2)); - block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32); + items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2); + block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32); return {block_size, items_per_thread, vector_load_length}; } diff --git a/c2h/include/c2h/bfloat16.cuh b/c2h/include/c2h/bfloat16.cuh index 6767850b373..de7f57158f6 100644 --- a/c2h/include/c2h/bfloat16.cuh +++ b/c2h/include/c2h/bfloat16.cuh @@ -266,12 +266,10 @@ public: }; _LIBCUDACXX_END_NAMESPACE_STD -_CCCL_SUPPRESS_DEPRECATED_PUSH template <> struct CUB_NS_QUALIFIER::NumericTraits - : CUB_NS_QUALIFIER::BaseTraits + : CUB_NS_QUALIFIER::BaseTraits {}; -_CCCL_SUPPRESS_DEPRECATED_POP #ifdef __GNUC__ # pragma GCC diagnostic pop diff --git a/c2h/include/c2h/fill_striped.h b/c2h/include/c2h/fill_striped.h index eab05478c88..a05b19f4bdf 100644 --- a/c2h/include/c2h/fill_striped.h +++ b/c2h/include/c2h/fill_striped.h @@ -139,7 +139,7 @@ struct scalar_to_vec_t template void fill_striped(IteratorT it) { - using T = cub::detail::value_t; + using T = cub::detail::it_value_t; constexpr int warps_in_block = BlockThreads / LogicalWarpThreads; constexpr int items_per_warp = LogicalWarpThreads * ItemsPerThread; diff --git a/c2h/include/c2h/half.cuh 
b/c2h/include/c2h/half.cuh index 4a30202fe3a..b29f3104a84 100644 --- a/c2h/include/c2h/half.cuh +++ b/c2h/include/c2h/half.cuh @@ -361,11 +361,10 @@ public: }; _LIBCUDACXX_END_NAMESPACE_STD -_CCCL_SUPPRESS_DEPRECATED_PUSH template <> -struct CUB_NS_QUALIFIER::NumericTraits : CUB_NS_QUALIFIER::BaseTraits +struct CUB_NS_QUALIFIER::NumericTraits + : CUB_NS_QUALIFIER::BaseTraits {}; -_CCCL_SUPPRESS_DEPRECATED_POP #ifdef __GNUC__ # pragma GCC diagnostic pop diff --git a/c2h/include/c2h/test_util_vec.h b/c2h/include/c2h/test_util_vec.h index 42e5a33ef7e..01022be9777 100644 --- a/c2h/include/c2h/test_util_vec.h +++ b/c2h/include/c2h/test_util_vec.h @@ -289,7 +289,7 @@ C2H_VEC_OVERLOAD(ulonglong, unsigned long long) C2H_VEC_OVERLOAD(float, float) C2H_VEC_OVERLOAD(double, double) -// Specialize cub::NumericTraits and cuda::std::numeric_limits for vector types. +// Specialize cuda::std::numeric_limits for vector types. # define REPEAT_TO_LIST_1(a) a # define REPEAT_TO_LIST_2(a) a, a @@ -298,23 +298,6 @@ C2H_VEC_OVERLOAD(double, double) # define REPEAT_TO_LIST(N, a) _CCCL_PP_CAT(REPEAT_TO_LIST_, N)(a) # define C2H_VEC_TRAITS_OVERLOAD_IMPL(T, BaseT, N) \ - CUB_NAMESPACE_BEGIN \ - template <> \ - struct NumericTraits \ - { \ - static __host__ __device__ T Max() \ - { \ - T retval = {REPEAT_TO_LIST(N, NumericTraits::Max())}; \ - return retval; \ - } \ - static __host__ __device__ T Lowest() \ - { \ - T retval = {REPEAT_TO_LIST(N, NumericTraits::Lowest())}; \ - return retval; \ - } \ - }; \ - CUB_NAMESPACE_END \ - \ _LIBCUDACXX_BEGIN_NAMESPACE_STD \ template <> \ class numeric_limits \ diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index 7217ee32e6e..0a41f88f1a4 100644 --- a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -29,6 +29,7 @@ #include +#include #include #include @@ -63,7 +64,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; 
static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy +#include #include #include @@ -63,7 +64,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy #include +#include #include #include @@ -57,7 +58,9 @@ struct policy_hub_t // Type used for the final result using output_tuple_t = cub::KeyValuePair; - auto const init = ::cuda::std::is_same_v ? cub::Traits::Max() : cub::Traits::Lowest(); + auto const init = ::cuda::std::is_same_v + ? 
::cuda::std::numeric_limits::max() + : ::cuda::std::numeric_limits::lowest(); #if !TUNE_BASE using policy_t = policy_hub_t; diff --git a/cub/benchmarks/bench/select/flagged.cu b/cub/benchmarks/bench/select/flagged.cu index 2562cec6c76..168e2bc0cc5 100644 --- a/cub/benchmarks/bench/select/flagged.cu +++ b/cub/benchmarks/bench/select/flagged.cu @@ -29,6 +29,8 @@ #include +#include + #include #include @@ -61,7 +63,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy +#include + #include #include @@ -63,7 +65,7 @@ struct policy_hub_t static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + _CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD); using SelectIfPolicyT = cub::AgentSelectIfPolicy +#include + #include #include @@ -36,8 +38,8 @@ struct policy_hub_t { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; - static constexpr int ITEMS_PER_THREAD = - CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + static constexpr int ITEMS_PER_THREAD = _CUDA_VSTD::min( + NOMINAL_4B_ITEMS_PER_THREAD, _CUDA_VSTD::max(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); using SelectIfPolicyT = cub::AgentSelectIfPolicy, - std::iterator_traits>>::type::value_type; + using AliasT = typename ::cuda::std:: + conditional_t, lazy_trait>>::type; /// Types of the input and output buffers - using InputBufferT = cub::detail::value_t; - using OutputBufferT = 
cub::detail::value_t; + using InputBufferT = it_value_t; + using OutputBufferT = it_value_t; /// Type that has to be sufficiently large to hold any of the buffers' sizes. /// The BufferSizeIteratorT's value type must be convertible to this type. - using BufferSizeT = cub::detail::value_t; + using BufferSizeT = it_value_t; /// Type used to index into the tile of buffers that this thread block is assigned to. using BlockBufferOffsetT = uint16_t; diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index e116b332772..2b5463432da 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -189,7 +189,7 @@ struct AgentHistogram //--------------------------------------------------------------------- /// The sample type of the input iterator - using SampleT = cub::detail::value_t; + using SampleT = cub::detail::it_value_t; /// The pixel type of SampleT using PixelT = typename CubVector::Type; diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh index b875fe8cdab..297e800c8b0 100644 --- a/cub/cub/agent/agent_merge.cuh +++ b/cub/cub/agent/agent_merge.cuh @@ -61,8 +61,8 @@ struct agent_t using policy = Policy; // key and value type are taken from the first input sequence (consistent with old Thrust behavior) - using key_type = typename ::cuda::std::iterator_traits::value_type; - using item_type = typename ::cuda::std::iterator_traits::value_type; + using key_type = it_value_t; + using item_type = it_value_t; using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; diff --git a/cub/cub/agent/agent_radix_sort_histogram.cuh b/cub/cub/agent/agent_radix_sort_histogram.cuh index e2cc5ecac61..96825d581c0 100644 --- a/cub/cub/agent/agent_radix_sort_histogram.cuh +++ b/cub/cub/agent/agent_radix_sort_histogram.cuh @@ -51,6 +51,7 @@ #include #include +#include 
CUB_NAMESPACE_BEGIN @@ -66,7 +67,7 @@ struct AgentRadixSortHistogramPolicy * ID. However, lanes with the same ID in different warp use the same private * histogram. This arrangement helps reduce the degree of conflicts in atomic * operations. */ - NUM_PARTS = CUB_MAX(1, NOMINAL_4B_NUM_PARTS * 4 / CUB_MAX(sizeof(ComputeT), 4)), + NUM_PARTS = _CUDA_VSTD::max(1, NOMINAL_4B_NUM_PARTS * 4 / _CUDA_VSTD::max(int{sizeof(ComputeT)}, 4)), RADIX_BITS = _RADIX_BITS, }; }; @@ -94,16 +95,13 @@ template ; using bit_ordered_type = typename traits::bit_ordered_type; @@ -210,7 +208,9 @@ struct AgentRadixSortHistogram #pragma unroll for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass) { - int num_bits = CUB_MIN(RADIX_BITS, end_bit - current_bit); + // FIXME(bgruber): the following replacement changes SASS for cub.test.device_radix_sort_pairs.lid_0 + // const int num_bits = _CUDA_VSTD::min(+RADIX_BITS, end_bit - current_bit); + const int num_bits = CUB_MIN(+RADIX_BITS, end_bit - current_bit); #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { @@ -258,7 +258,7 @@ struct AgentRadixSortHistogram // Process the tiles. 
OffsetT portion_offset = portion * MAX_PORTION_SIZE; - OffsetT portion_size = CUB_MIN(MAX_PORTION_SIZE, num_items - portion_offset); + OffsetT portion_size = _CUDA_VSTD::min(MAX_PORTION_SIZE, num_items - portion_offset); for (OffsetT offset = blockIdx.x * TILE_ITEMS; offset < portion_size; offset += TILE_ITEMS * gridDim.x) { OffsetT tile_offset = portion_offset + offset; diff --git a/cub/cub/agent/agent_radix_sort_upsweep.cuh b/cub/cub/agent/agent_radix_sort_upsweep.cuh index c8fa2ed841e..da34a0b4e75 100644 --- a/cub/cub/agent/agent_radix_sort_upsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_upsweep.cuh @@ -53,6 +53,7 @@ #include #include +#include CUB_NAMESPACE_BEGIN @@ -160,17 +161,17 @@ struct AgentRadixSortUpsweep PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), LOG_PACKING_RATIO = Log2::VALUE, - LOG_COUNTER_LANES = CUB_MAX(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)), + LOG_COUNTER_LANES = _CUDA_VSTD::max(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)), COUNTER_LANES = 1 << LOG_COUNTER_LANES, // To prevent counter overflow, we must periodically unpack and aggregate the // digit counters back into registers. Each counter lane is assigned to a // warp for aggregation. 
- LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), + LANES_PER_WARP = _CUDA_VSTD::max(1, (COUNTER_LANES + WARPS - 1) / WARPS), // Unroll tiles in batches without risk of counter overflow - UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), + UNROLL_COUNT = _CUDA_VSTD::min(64, 255 / KEYS_PER_THREAD), UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, }; diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh index e6610e52c14..bd8c8bbf483 100644 --- a/cub/cub/agent/agent_reduce.cuh +++ b/cub/cub/agent/agent_reduce.cuh @@ -142,7 +142,7 @@ struct AgentReduce //--------------------------------------------------------------------- /// The input value type - using InputT = value_t; + using InputT = it_value_t; /// Vector type of InputT for data movement using VectorT = typename CubVector::Type; @@ -159,7 +159,7 @@ struct AgentReduce static constexpr int BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD; static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; - static constexpr int VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH); + static constexpr int VECTOR_LOAD_LENGTH = _CUDA_VSTD::min(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH); // Can vectorize according to the policy if the input iterator is a native // pointer to a primitive type diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh index 381fa12577f..6852877ec17 100644 --- a/cub/cub/agent/agent_reduce_by_key.cuh +++ b/cub/cub/agent/agent_reduce_by_key.cuh @@ -171,13 +171,13 @@ struct AgentReduceByKey //--------------------------------------------------------------------- // The input keys type - using KeyInputT = value_t; + using KeyInputT = it_value_t; // The output keys type using KeyOutputT = non_void_value_t; // The input values type - using ValueInputT = value_t; + using ValueInputT = it_value_t; // 
Tuple type for scanning (pairs accumulated segment-value with // segment-index) diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index 76fd9f24ed3..fefda393aff 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -173,7 +173,7 @@ struct AgentRle //--------------------------------------------------------------------- /// The input value type - using T = cub::detail::value_t; + using T = cub::detail::it_value_t; /// The lengths output value type using LengthT = cub::detail::non_void_value_t; diff --git a/cub/cub/agent/agent_scan.cuh b/cub/cub/agent/agent_scan.cuh index e59fea2ec2e..995ef66811e 100644 --- a/cub/cub/agent/agent_scan.cuh +++ b/cub/cub/agent/agent_scan.cuh @@ -157,7 +157,7 @@ struct AgentScan //--------------------------------------------------------------------- // The input value type - using InputT = cub::detail::value_t; + using InputT = cub::detail::it_value_t; // Tile status descriptor interface type using ScanTileStateT = ScanTileState; diff --git a/cub/cub/agent/agent_scan_by_key.cuh b/cub/cub/agent/agent_scan_by_key.cuh index a3fcff75464..1ac367742b2 100644 --- a/cub/cub/agent/agent_scan_by_key.cuh +++ b/cub/cub/agent/agent_scan_by_key.cuh @@ -145,8 +145,8 @@ struct AgentScanByKey // Types and constants //--------------------------------------------------------------------- - using KeyT = value_t; - using InputT = value_t; + using KeyT = it_value_t; + using InputT = it_value_t; using FlagValuePairT = KeyValuePair; using ReduceBySegmentOpT = ScanBySegmentOp; diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 048737382e8..a142e274d2c 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -227,10 +227,10 @@ struct AgentSelectIf using MemoryOrderedTileStateT = tile_state_with_memory_order; // The input value type - using InputT = value_t; + using InputT = it_value_t; // The flag value type - using FlagT = value_t; + using FlagT = 
it_value_t; // Constants enum diff --git a/cub/cub/agent/agent_three_way_partition.cuh b/cub/cub/agent/agent_three_way_partition.cuh index 9f2a3afeb62..843167526d4 100644 --- a/cub/cub/agent/agent_three_way_partition.cuh +++ b/cub/cub/agent/agent_three_way_partition.cuh @@ -180,7 +180,7 @@ struct AgentThreeWayPartition //--------------------------------------------------------------------- // The input value type - using InputT = value_t; + using InputT = it_value_t; using AccumPackHelperT = accumulator_pack_t; using AccumPackT = typename AccumPackHelperT::pack_t; diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index dfeaaec50a7..1c8a75face9 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -133,8 +133,8 @@ struct AgentUniqueByKey //--------------------------------------------------------------------- // The input key and value type - using KeyT = cub::detail::value_t; - using ValueT = cub::detail::value_t; + using KeyT = cub::detail::it_value_t; + using ValueT = cub::detail::it_value_t; // Tile status descriptor interface type using ScanTileStateT = ScanTileState; diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index 017f0b5ef3f..3bea48e02e0 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -1250,7 +1250,7 @@ public: //! 
@} end member group }; -template > +template > struct BlockLoadType { using type = cub::BlockLoad; diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index 33eb7de6e16..d93b6033084 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -66,8 +66,8 @@ MergePath(KeyIt1 keys1, KeyIt2 keys2, OffsetT keys1_count, OffsetT keys2_count, { const OffsetT mid = cub::MidPoint(keys1_begin, keys1_end); // pull copies of the keys before calling binary_pred so proxy references are unwrapped - const detail::value_t key1 = keys1[mid]; - const detail::value_t key2 = keys2[diag - 1 - mid]; + const detail::it_value_t key1 = keys1[mid]; + const detail::it_value_t key2 = keys2[diag - 1 - mid]; if (binary_pred(key2, key1)) { keys1_end = mid; diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 425ba1c576b..199c87a8380 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -49,6 +49,7 @@ #include #include +#include #include #include #include @@ -242,7 +243,7 @@ private: LOG_PACKING_RATIO = Log2::VALUE, // Always at least one lane - LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), + LOG_COUNTER_LANES = _CUDA_VSTD::max(RADIX_BITS - LOG_PACKING_RATIO, 0), COUNTER_LANES = 1 << LOG_COUNTER_LANES, // The number of packed counters per thread (plus one for padding) @@ -254,7 +255,7 @@ public: enum { /// Number of bin-starting offsets tracked per thread - BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), + BINS_TRACKED_PER_THREAD = _CUDA_VSTD::max(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), }; private: @@ -587,7 +588,7 @@ public: enum { /// Number of bin-starting offsets tracked per thread - BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), + BINS_TRACKED_PER_THREAD = _CUDA_VSTD::max(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), 
}; private: diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 397930de643..3e18fe3372a 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -50,6 +50,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -431,7 +432,7 @@ private: // Radix sorting passes while (true) { - int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + int pass_bits = _CUDA_VSTD::min(RADIX_BITS, end_bit - begin_bit); auto digit_extractor = traits::template digit_extractor(begin_bit, pass_bits, decomposer); @@ -510,7 +511,7 @@ public: // Radix sorting passes while (true) { - int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + int pass_bits = _CUDA_VSTD::min(RADIX_BITS, end_bit - begin_bit); auto digit_extractor = traits::template digit_extractor(begin_bit, pass_bits, decomposer); diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index a2cd74fcd90..ea179399b4e 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -1227,7 +1227,7 @@ public: }; #ifndef _CCCL_DOXYGEN_INVOKED // Do not document -template > +template > struct BlockStoreType { using type = cub::BlockStore; diff --git a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh index 28ff55b5fe0..6f962499984 100644 --- a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -96,7 +96,7 @@ struct BlockReduceRakingCommutativeOnly RAKING_THREADS = WARP_THREADS, /// Number of threads actually sharing items with the raking threads - SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), + SHARING_THREADS = _CUDA_VSTD::max(1, BLOCK_THREADS - RAKING_THREADS), /// Number of raking elements per warp synchronous raking thread SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, diff --git 
a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh index 76de43da1d7..07faa04dca0 100644 --- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -84,7 +84,7 @@ struct BlockReduceWarpReductions WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, /// The logical warp size for warp reductions - LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + LOGICAL_WARP_SIZE = _CUDA_VSTD::min(BLOCK_THREADS, WARP_THREADS), /// Whether or not the logical warp size evenly divides the thread block size EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh index 9359a176905..a0e15dbdd16 100644 --- a/cub/cub/device/device_for.cuh +++ b/cub/cub/device/device_for.cuh @@ -44,15 +44,16 @@ #include #include -#include #include #include #include +#include +#include + #if __cccl_lib_mdspan # include #endif // __cccl_lib_mdspan -#include CUB_NAMESPACE_BEGIN @@ -155,7 +156,8 @@ private: ContiguousIteratorT first, OffsetT num_items, OpT op, cudaStream_t stream, ::cuda::std::true_type /* vectorize */) { auto* unwrapped_first = THRUST_NS_QUALIFIER::unwrap_contiguous_iterator(first); - using wrapped_op_t = detail::for_each::op_wrapper_vectorized_t>; + using wrapped_op_t = + detail::for_each::op_wrapper_vectorized_t>; if (is_aligned(unwrapped_first)) { // Vectorize loads @@ -594,7 +596,7 @@ private: using offset_t = NumItemsT; // Disable auto-vectorization for now: // constexpr bool use_vectorization = - // detail::for_each::can_regain_copy_freedom, OpT>::value + // detail::for_each::can_regain_copy_freedom, OpT>::value // && THRUST_NS_QUALIFIER::is_contiguous_iterator::value; using use_vectorization_t = ::cuda::std::bool_constant; return for_each_n(first, num_items, op, stream, use_vectorization_t{}); @@ -706,10 +708,8 @@ public: { 
CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEach"); - using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits::difference_type; - + using offset_t = detail::it_difference_t; const auto num_items = static_cast(THRUST_NS_QUALIFIER::distance(first, last)); - return ForEachNNoNVTX(first, num_items, op, stream); } @@ -835,7 +835,7 @@ public: ForEachCopy(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) { CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachCopy"); - using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits::difference_type; + using offset_t = detail::it_difference_t; const auto num_items = static_cast(THRUST_NS_QUALIFIER::distance(first, last)); return ForEachCopyNNoNVTX(first, num_items, op, stream); } diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index 29a8721a69a..eacda52673d 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -190,7 +190,7 @@ struct DeviceHistogram cudaStream_t stream = 0) { /// The sample value type of the input iterator - using SampleT = cub::detail::value_t; + using SampleT = cub::detail::it_value_t; return MultiHistogramEven<1, 1>( d_temp_storage, temp_storage_bytes, @@ -509,7 +509,7 @@ struct DeviceHistogram cudaStream_t stream = 0) { /// The sample value type of the input iterator - using SampleT = cub::detail::value_t; + using SampleT = cub::detail::it_value_t; return MultiHistogramEven( d_temp_storage, @@ -700,7 +700,7 @@ struct DeviceHistogram CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramEven"); /// The sample value type of the input iterator - using SampleT = cub::detail::value_t; + using SampleT = cub::detail::it_value_t; ::cuda::std::bool_constant is_byte_sample; if constexpr (sizeof(OffsetT) > sizeof(int)) @@ -850,7 +850,7 @@ struct DeviceHistogram cudaStream_t stream = 0) { /// The sample value type of the input iterator - using SampleT = 
cub::detail::value_t; + using SampleT = cub::detail::it_value_t; return MultiHistogramRange<1, 1>( d_temp_storage, temp_storage_bytes, @@ -1145,7 +1145,7 @@ struct DeviceHistogram cudaStream_t stream = 0) { /// The sample value type of the input iterator - using SampleT = cub::detail::value_t; + using SampleT = cub::detail::it_value_t; return MultiHistogramRange( d_temp_storage, @@ -1326,7 +1326,7 @@ struct DeviceHistogram CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramRange"); /// The sample value type of the input iterator - using SampleT = cub::detail::value_t; + using SampleT = cub::detail::it_value_t; ::cuda::std::bool_constant is_byte_sample; if constexpr (sizeof(OffsetT) > sizeof(int)) diff --git a/cub/cub/device/device_memcpy.cuh b/cub/cub/device/device_memcpy.cuh index 997dfecd4c3..68dcc806bc5 100644 --- a/cub/cub/device/device_memcpy.cuh +++ b/cub/cub/device/device_memcpy.cuh @@ -175,10 +175,10 @@ struct DeviceMemcpy cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMemcpy::Batched"); - static_assert(::cuda::std::is_pointer_v>, + static_assert(::cuda::std::is_pointer_v>, "DeviceMemcpy::Batched only supports copying of memory buffers." "Please consider using DeviceCopy::Batched instead."); - static_assert(::cuda::std::is_pointer_v>, + static_assert(::cuda::std::is_pointer_v>, "DeviceMemcpy::Batched only supports copying of memory buffers." 
"Please consider using DeviceCopy::Batched instead."); diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 4a57ca3d7f2..b286afa31df 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -51,6 +51,8 @@ #include +#include + #include CUB_NAMESPACE_BEGIN @@ -316,7 +318,7 @@ struct DeviceReduce using OffsetT = detail::choose_offset_t; // The output value type - using OutputT = cub::detail::non_void_value_t>; + using OutputT = cub::detail::non_void_value_t>; using InitT = OutputT; @@ -334,7 +336,7 @@ struct DeviceReduce //! @rst //! Computes a device-wide minimum using the less-than (``<``) operator. //! - //! - Uses ``std::numeric_limits::max()`` as the initial value of the reduction. + //! - Uses ``::cuda::std::numeric_limits::max()`` as the initial value of the reduction. //! - Does not support ``<`` operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. @@ -422,7 +424,7 @@ struct DeviceReduce using OffsetT = detail::choose_offset_t; // The input value type - using InputT = cub::detail::value_t; + using InputT = cub::detail::it_value_t; using InitT = InputT; @@ -433,8 +435,7 @@ struct DeviceReduce d_out, static_cast(num_items), ::cuda::minimum<>{}, - // TODO(bgruber): replace with ::cuda::std::numeric_limits::max() (breaking change) - Traits::Max(), + ::cuda::std::numeric_limits::max(), stream); } @@ -538,7 +539,7 @@ struct DeviceReduce CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin"); // The input type - using InputValueT = cub::detail::value_t; + using InputValueT = cub::detail::it_value_t; // Offset type used within the kernel and to index within one partition using PerPartitionOffsetT = int; @@ -583,7 +584,7 @@ struct DeviceReduce //! (assuming the value type of ``d_in`` is ``T``) //! //! 
- The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``. - //! - The ``{1, std::numeric_limits::max()}`` tuple is produced for zero-length inputs + //! - The ``{1, ::cuda::std::numeric_limits::max()}`` tuple is produced for zero-length inputs //! //! - Does not support ``<`` operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction @@ -672,7 +673,7 @@ struct DeviceReduce using OffsetT = int; // The input type - using InputValueT = cub::detail::value_t; + using InputValueT = cub::detail::it_value_t; // The output tuple type using OutputTupleT = cub::detail::non_void_value_t>; @@ -690,8 +691,7 @@ struct DeviceReduce ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - // TODO Address https://github.com/NVIDIA/cub/issues/651 - InitT initial_value{AccumT(1, Traits::Max())}; + InitT initial_value{AccumT(1, ::cuda::std::numeric_limits::max())}; return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream); @@ -700,7 +700,7 @@ struct DeviceReduce //! @rst //! Computes a device-wide maximum using the greater-than (``>``) operator. //! - //! - Uses ``std::numeric_limits::lowest()`` as the initial value of the reduction. + //! - Uses ``::cuda::std::numeric_limits::lowest()`` as the initial value of the reduction. //! - Does not support ``>`` operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. 
@@ -785,7 +785,7 @@ struct DeviceReduce using OffsetT = detail::choose_offset_t; // The input value type - using InputT = cub::detail::value_t; + using InputT = cub::detail::it_value_t; using InitT = InputT; @@ -796,8 +796,7 @@ struct DeviceReduce d_out, static_cast(num_items), ::cuda::maximum<>{}, - // TODO(bgruber): replace with ::cuda::std::numeric_limits::lowest() (breaking change) - Traits::Lowest(), + ::cuda::std::numeric_limits::lowest(), stream); } @@ -901,7 +900,7 @@ struct DeviceReduce CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax"); // The input type - using InputValueT = cub::detail::value_t; + using InputValueT = cub::detail::it_value_t; // Offset type used within the kernel and to index within one partition using PerPartitionOffsetT = int; @@ -948,7 +947,7 @@ struct DeviceReduce //! //! - The maximum is written to ``d_out.value`` and its offset in the input //! array is written to ``d_out.key``. - //! - The ``{1, std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs + //! - The ``{1, ::cuda::std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs //! //! - Does not support ``>`` operators that are non-commutative. //! 
- Provides "run-to-run" determinism for pseudo-associative reduction @@ -1039,7 +1038,7 @@ struct DeviceReduce using OffsetT = int; // The input type - using InputValueT = cub::detail::value_t; + using InputValueT = cub::detail::it_value_t; // The output tuple type using OutputTupleT = cub::detail::non_void_value_t>; @@ -1057,9 +1056,7 @@ struct DeviceReduce ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - // TODO Address https://github.com/NVIDIA/cub/issues/651 - // TODO(bgruber): replace with ::cuda::std::numeric_limits::lowest() (breaking change) - InitT initial_value{AccumT(1, Traits::Lowest())}; + InitT initial_value{AccumT(1, ::cuda::std::numeric_limits::lowest())}; return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream); diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index 21ed7bd77ec..b308f4c2b65 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -205,7 +205,7 @@ struct DeviceRunLengthEncode using accum_t = ::cuda::std::__accumulator_t; - using key_t = cub::detail::non_void_value_t>; + using key_t = cub::detail::non_void_value_t>; using policy_t = detail::rle::encode::policy_hub; diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 74f351f1060..84514d3ff4d 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -191,7 +191,7 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - using InitT = cub::detail::value_t; + using InitT = cub::detail::it_value_t; // Initial value InitT init_value{}; @@ -1156,7 +1156,7 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - using AccumT = ::cuda::std::__accumulator_t, InitValueT>; + using AccumT = ::cuda::std::__accumulator_t, InitValueT>; return 
DispatchScan< InputIteratorT, @@ -1390,7 +1390,7 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - using InitT = cub::detail::value_t; + using InitT = cub::detail::it_value_t; // Initial value InitT init_value{}; diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index 642f2521d97..35cb0e81132 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -49,6 +49,7 @@ #include #include +#include #include #include @@ -365,7 +366,7 @@ public: using OffsetT = detail::common_iterator_value_t; // The output value type - using OutputT = cub::detail::non_void_value_t>; + using OutputT = cub::detail::non_void_value_t>; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); @@ -392,7 +393,7 @@ public: //! @rst //! Computes a device-wide segmented minimum using the less-than (``<``) operator. //! - //! - Uses ``std::numeric_limits::max()`` as the initial value of the reduction for each segment. + //! - Uses ``::cuda::std::numeric_limits::max()`` as the initial value of the reduction for each segment. //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both //! 
the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is @@ -488,7 +489,7 @@ public: using OffsetT = detail::common_iterator_value_t; // The input value type - using InputT = cub::detail::value_t; + using InputT = cub::detail::it_value_t; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); @@ -508,8 +509,7 @@ public: d_begin_offsets, d_end_offsets, ::cuda::minimum<>{}, - // TODO(bgruber): replace with ::cuda::std::numeric_limits::max() (breaking change) - Traits::Max(), + ::cuda::std::numeric_limits::max(), stream); } @@ -522,7 +522,7 @@ public: //! //! - The minimum of the *i*\ :sup:`th` segment is written to //! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``. - //! - The ``{1, std::numeric_limits::max()}`` tuple is produced for zero-length inputs + //! - The ``{1, ::cuda::std::numeric_limits::max()}`` tuple is produced for zero-length inputs //! //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both @@ -618,7 +618,7 @@ public: using OffsetT = int; // detail::common_iterator_value_t; // The input type - using InputValueT = cub::detail::value_t; + using InputValueT = cub::detail::it_value_t; // The output tuple type using OutputTupleT = cub::detail::non_void_value_t>; @@ -636,8 +636,7 @@ public: ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - // TODO Address https://github.com/NVIDIA/cub/issues/651 - InitT initial_value{AccumT(1, Traits::Max())}; + InitT initial_value{AccumT(1, ::cuda::std::numeric_limits::max())}; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); @@ -666,7 +665,7 @@ public: //! @rst //! Computes a device-wide segmented maximum using the greater-than (``>``) operator. //! - //! 
- Uses ``std::numeric_limits::lowest()`` as the initial value of the reduction. + //! - Uses ``::cuda::std::numeric_limits::lowest()`` as the initial value of the reduction. //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where @@ -756,7 +755,7 @@ public: using OffsetT = detail::common_iterator_value_t; // The input value type - using InputT = cub::detail::value_t; + using InputT = cub::detail::it_value_t; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); @@ -771,8 +770,7 @@ public: d_begin_offsets, d_end_offsets, ::cuda::maximum<>{}, - // TODO(bgruber): replace with ::cuda::std::numeric_limits::lowest() (breaking change) - Traits::Lowest(), + ::cuda::std::numeric_limits::lowest(), stream); } @@ -785,7 +783,7 @@ public: //! //! - The maximum of the *i*\ :sup:`th` segment is written to //! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``. - //! - The ``{1, std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs + //! - The ``{1, ::cuda::std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs //! //! - When input a contiguous sequence of segments, a single sequence //! 
``segment_offsets`` (of length ``num_segments + 1``) can be aliased @@ -884,7 +882,7 @@ public: using OffsetT = int; // detail::common_iterator_value_t; // The input type - using InputValueT = cub::detail::value_t; + using InputValueT = cub::detail::it_value_t; // The output tuple type using OutputTupleT = cub::detail::non_void_value_t>; @@ -902,8 +900,7 @@ public: ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - // TODO Address https://github.com/NVIDIA/cub/issues/651 - InitT initial_value{AccumT(1, Traits::Lowest())}; + InitT initial_value{AccumT(1, ::cuda::std::numeric_limits::lowest())}; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index 60f6585cf52..f74fc17ab94 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -122,7 +122,7 @@ template > struct DispatchAdjacentDifference { - using InputT = typename std::iterator_traits::value_type; + using InputT = detail::it_value_t; void* d_temp_storage; size_t& temp_storage_bytes; diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index f24c14df86b..c47fc8ae016 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -56,6 +56,7 @@ #include #include +#include #include #include @@ -110,14 +111,14 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO { using StatusWord = typename TileT::StatusWord; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT; - using BufferSizeT = value_t; + using BufferSizeT = it_value_t; /// Internal load/store type. 
For byte-wise memcpy, a single-byte type using AliasT = typename ::cuda::std::conditional_t, - std::iterator_traits>>::value_type; + ::cuda::std::type_identity, + lazy_trait>>::type; /// Types of the input and output buffers - using InputBufferT = value_t; - using OutputBufferT = value_t; + using InputBufferT = it_value_t; + using OutputBufferT = it_value_t; constexpr uint32_t BLOCK_THREADS = ActivePolicyT::BLOCK_THREADS; constexpr uint32_t ITEMS_PER_THREAD = ActivePolicyT::BYTES_PER_THREAD; @@ -240,7 +241,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLO BLevBlockOffsetTileState blev_block_scan_state) { // Internal type used for storing a buffer's size - using BufferSizeT = value_t; + using BufferSizeT = it_value_t; // Alias the correct tuning policy for the current compilation pass' architecture using AgentBatchMemcpyPolicyT = typename ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT; @@ -313,7 +314,7 @@ struct DispatchBatchMemcpy using BufferTileOffsetScanStateT = typename cub::ScanTileState; // Internal type used to keep track of a buffer's size - using BufferSizeT = cub::detail::value_t; + using BufferSizeT = cub::detail::it_value_t; //------------------------------------------------------------------------------ // Member Variables @@ -397,9 +398,9 @@ struct DispatchBatchMemcpy BlockOffsetT num_tiles = ::cuda::ceil_div(num_buffers, TILE_SIZE); using BlevBufferSrcsOutT = - ::cuda::std::_If>; + ::cuda::std::_If>; using BlevBufferDstOutT = - ::cuda::std::_If>; + ::cuda::std::_If>; using BlevBufferSrcsOutItT = BlevBufferSrcsOutT*; using BlevBufferDstsOutItT = BlevBufferDstOutT*; using BlevBufferSizesOutItT = BufferSizeT*; diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index f63af0fa2f4..d5a894c3310 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -569,7 +569,7 @@ public: 
//--------------------------------------------------------------------- /// The sample value type of the input iterator - using SampleT = cub::detail::value_t; + using SampleT = cub::detail::it_value_t; enum { @@ -924,7 +924,7 @@ public: // Should we call DispatchHistogram<....., PolicyHub=void> in DeviceHistogram? static constexpr bool isEven = 0; using fallback_policy_hub = detail::histogram:: - policy_hub, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, isEven>; + policy_hub, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, isEven>; using MaxPolicyT = typename ::cuda::std::_If<::cuda::std::is_void_v, fallback_policy_hub, PolicyHub>::MaxPolicy; @@ -1100,7 +1100,7 @@ public: { static constexpr bool isEven = 0; using fallback_policy_hub = detail::histogram:: - policy_hub, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, isEven>; + policy_hub, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, isEven>; using MaxPolicyT = typename ::cuda::std::_If<::cuda::std::is_void_v, fallback_policy_hub, PolicyHub>::MaxPolicy; @@ -1240,7 +1240,7 @@ public: { static constexpr bool isEven = 1; using fallback_policy_hub = detail::histogram:: - policy_hub, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, isEven>; + policy_hub, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, isEven>; using MaxPolicyT = typename ::cuda::std::_If<::cuda::std::is_void_v, fallback_policy_hub, PolicyHub>::MaxPolicy; @@ -1431,7 +1431,7 @@ public: { static constexpr bool isEven = 1; using fallback_policy_hub = detail::histogram:: - policy_hub, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, isEven>; + policy_hub, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, isEven>; using MaxPolicyT = typename ::cuda::std::_If<::cuda::std::is_void_v, fallback_policy_hub, PolicyHub>::MaxPolicy; diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index f9b40d053a5..af65a150d99 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -119,7 +119,7 @@ 
__launch_bounds__( vsmem_t global_temp_storage) { // the merge agent loads keys into a local array of KeyIt1::value_type, on which the comparisons are performed - using key_t = value_t; + using key_t = it_value_t; static_assert(::cuda::std::__invokable::value, "Comparison operator cannot compare two keys"); static_assert(::cuda::std::is_convertible_v::type, bool>, @@ -164,7 +164,7 @@ template , value_t>> + typename PolicyHub = detail::merge::policy_hub, it_value_t>> struct dispatch_t { void* d_temp_storage; diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 330a30dd940..6cd0c75a3fa 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -64,8 +64,8 @@ template struct DeviceMergeSortKernelSource { - using KeyT = cub::detail::value_t; - using ValueT = cub::detail::value_t; + using KeyT = cub::detail::it_value_t; + using ValueT = cub::detail::it_value_t; CUB_DEFINE_KERNEL_GETTER( MergeSortBlockSortKernel, @@ -127,10 +127,10 @@ template < ValueIteratorT, OffsetT, CompareOpT, - cub::detail::value_t, - cub::detail::value_t>, - typename KeyT = cub::detail::value_t, - typename ValueT = cub::detail::value_t> + cub::detail::it_value_t, + cub::detail::it_value_t>, + typename KeyT = cub::detail::it_value_t, + typename ValueT = cub::detail::it_value_t> struct DispatchMergeSort { /// Whether or not there are values to be trucked along with keys diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index 880a2cacce3..f967c21cf58 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -53,6 +53,7 @@ #include +#include #include #include @@ -275,7 +276,7 @@ struct DispatchRadixSort cudaError error = cudaSuccess; do { - int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); + int pass_bits = 
_CUDA_VSTD::min(pass_config.radix_bits, end_bit - current_bit); // Log upsweep_kernel configuration #ifdef CUB_DEBUG_LOG @@ -447,7 +448,7 @@ struct DispatchRadixSort max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(0); even_share.DispatchInit( - num_items, max_downsweep_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); + num_items, max_downsweep_grid_size, _CUDA_VSTD::max(downsweep_config.tile_size, upsweep_config.tile_size)); } while (0); return error; @@ -472,8 +473,8 @@ struct DispatchRadixSort constexpr PortionOffsetT PORTION_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS; int num_passes = ::cuda::ceil_div(end_bit - begin_bit, RADIX_BITS); OffsetT num_portions = static_cast(::cuda::ceil_div(num_items, PORTION_SIZE)); - PortionOffsetT max_num_blocks = - ::cuda::ceil_div(static_cast(CUB_MIN(num_items, static_cast(PORTION_SIZE))), ONESWEEP_TILE_ITEMS); + PortionOffsetT max_num_blocks = ::cuda::ceil_div( + static_cast(_CUDA_VSTD::min(num_items, static_cast(PORTION_SIZE))), ONESWEEP_TILE_ITEMS); size_t value_size = KEYS_ONLY ? 
0 : sizeof(ValueT); size_t allocation_sizes[] = { @@ -611,11 +612,11 @@ struct DispatchRadixSort for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass) { - int num_bits = CUB_MIN(end_bit - current_bit, RADIX_BITS); + int num_bits = _CUDA_VSTD::min(end_bit - current_bit, RADIX_BITS); for (OffsetT portion = 0; portion < num_portions; ++portion) { - PortionOffsetT portion_num_items = static_cast( - CUB_MIN(num_items - portion * PORTION_SIZE, static_cast(PORTION_SIZE))); + PortionOffsetT portion_num_items = + static_cast(_CUDA_VSTD::min(num_items - portion * PORTION_SIZE, OffsetT{PORTION_SIZE})); PortionOffsetT num_blocks = ::cuda::ceil_div(portion_num_items, ONESWEEP_TILE_ITEMS); @@ -777,7 +778,7 @@ struct DispatchRadixSort } // Get maximum spine length - int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); + int max_grid_size = _CUDA_VSTD::max(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size; // Temporary storage allocation requirements @@ -812,7 +813,7 @@ struct DispatchRadixSort int num_passes = ::cuda::ceil_div(num_bits, pass_config.radix_bits); bool is_num_passes_odd = num_passes & 1; int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits; - int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); + int alt_end_bit = _CUDA_VSTD::min(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); // Alias the temporary storage allocations OffsetT* d_spine = static_cast(allocations[0]); @@ -1241,7 +1242,7 @@ struct DispatchSegmentedRadixSort cudaError error = cudaSuccess; do { - int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); + int pass_bits = _CUDA_VSTD::min(pass_config.radix_bits, (end_bit - current_bit)); // Log kernel configuration #ifdef CUB_DEBUG_LOG 
@@ -1381,10 +1382,10 @@ struct DispatchSegmentedRadixSort int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS; int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS; int num_bits = end_bit - begin_bit; - int num_passes = CUB_MAX(::cuda::ceil_div(num_bits, radix_bits), 1); + int num_passes = _CUDA_VSTD::max(::cuda::ceil_div(num_bits, radix_bits), 1); // num_bits may be zero bool is_num_passes_odd = num_passes & 1; int max_alt_passes = (num_passes * radix_bits) - num_bits; - int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); + int alt_end_bit = _CUDA_VSTD::min(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); DoubleBuffer d_keys_remaining_passes( (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[0]), diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index bc9faead2e3..8ac66edb6b0 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -137,8 +137,8 @@ template >, - typename AccumT = ::cuda::std::__accumulator_t, InitT>, + typename InitT = cub::detail::non_void_value_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitT>, typename TransformOpT = ::cuda::std::__identity, typename PolicyHub = detail::reduce::policy_hub, typename KernelSource = detail::reduce::DeviceReduceKernelSource< @@ -573,8 +573,10 @@ template < typename ReductionOpT, typename TransformOpT, typename InitT, - typename AccumT = ::cuda::std:: - __accumulator_t>, InitT>, + typename AccumT = + ::cuda::std::__accumulator_t>, + InitT>, typename PolicyHub = detail::reduce::policy_hub, typename KernelSource = detail::reduce::DeviceReduceKernelSource< typename PolicyHub::MaxPolicy, @@ -671,9 +673,9 @@ template >, - typename AccumT = ::cuda::std::__accumulator_t, InitT>, - typename PolicyHub = detail::reduce::policy_hub, + typename InitT = cub::detail::non_void_value_t>, + typename AccumT = 
::cuda::std::__accumulator_t, InitT>, + typename PolicyHub = detail::reduce::policy_hub, typename KernelSource = detail::reduce::DeviceSegmentedReduceKernelSource< typename PolicyHub::MaxPolicy, InputIteratorT, diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index f3eb01d1e25..aa95fdd1342 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -226,12 +226,12 @@ template , - cub::detail::value_t>, + cub::detail::it_value_t, + cub::detail::it_value_t>, typename PolicyHub = detail::reduce_by_key::policy_hub< ReductionOpT, AccumT, - cub::detail::non_void_value_t>>> + cub::detail::non_void_value_t>>> struct DispatchReduceByKey { //------------------------------------------------------------------------- @@ -239,7 +239,7 @@ struct DispatchReduceByKey //------------------------------------------------------------------------- // The input values type - using ValueInputT = cub::detail::value_t; + using ValueInputT = cub::detail::it_value_t; static constexpr int INIT_KERNEL_THREADS = 128; @@ -344,7 +344,7 @@ struct DispatchReduceByKey } // Log init_kernel configuration - int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS)); + int init_grid_size = _CUDA_VSTD::max(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS)); #ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); @@ -392,7 +392,7 @@ struct DispatchReduceByKey } // Run grids in epochs (in case number of tiles exceeds max x-dimension - int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + int scan_grid_size = _CUDA_VSTD::min(num_tiles, max_dim_x); for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log reduce_by_key_kernel configuration diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 
75e15980427..3ba14bd124f 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -194,7 +194,7 @@ template , - cub::detail::value_t>> + cub::detail::it_value_t>> struct DeviceRleDispatch { /****************************************************************************** diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 85834b10a35..fe0bc307d4f 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -145,13 +145,13 @@ template < typename InitValueT, typename OffsetT, typename AccumT = ::cuda::std::__accumulator_t, + cub::detail::it_value_t, ::cuda::std::_If<::cuda::std::is_same_v, - cub::detail::value_t, + cub::detail::it_value_t, typename InitValueT::value_type>>, ForceInclusive EnforceInclusive = ForceInclusive::No, - typename PolicyHub = - detail::scan::policy_hub, detail::value_t, AccumT, OffsetT, ScanOpT>, + typename PolicyHub = detail::scan:: + policy_hub, detail::it_value_t, AccumT, OffsetT, ScanOpT>, typename KernelSource = detail::scan::DeviceScanKernelSource< typename PolicyHub::MaxPolicy, InputIteratorT, diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 3290744fab5..c8db2e84995 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -136,7 +136,7 @@ template > + typename KeyT = cub::detail::it_value_t> __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyKernel( KeysInputIteratorT d_keys_in, @@ -176,7 +176,7 @@ template * d_keys_prev_in, + cub::detail::it_value_t* d_keys_prev_in, OffsetT items_per_tile, int num_tiles) { @@ -233,10 +233,11 @@ template < typename OffsetT, typename AccumT = ::cuda::std::__accumulator_t< ScanOpT, - cub::detail::value_t, - ::cuda::std::_If<::cuda::std::is_same_v, 
cub::detail::value_t, InitValueT>>, + cub::detail::it_value_t, + ::cuda::std:: + _If<::cuda::std::is_same_v, cub::detail::it_value_t, InitValueT>>, typename PolicyHub = - detail::scan_by_key::policy_hub, ScanOpT>> + detail::scan_by_key::policy_hub, ScanOpT>> struct DispatchScanByKey { //--------------------------------------------------------------------- @@ -246,10 +247,10 @@ struct DispatchScanByKey static constexpr int INIT_KERNEL_THREADS = 128; // The input key type - using KeyT = cub::detail::value_t; + using KeyT = cub::detail::it_value_t; // The input value type - using InputT = cub::detail::value_t; + using InputT = cub::detail::it_value_t; // Tile state used for the decoupled look-back using ScanByKeyTileStateT = ReduceByKeyScanTileState; diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index eb226ae754e..561e34a1fb8 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -421,8 +421,8 @@ template < typename OffsetT, SelectImpl SelectionOpt, typename PolicyHub = detail::select::policy_hub< - detail::value_t, - detail::value_t, + detail::it_value_t, + detail::it_value_t, // if/flagged/unique only have a single code path for different offset types, partition has different code paths ::cuda::std::conditional_t, detail::select::is_partition_distinct_output_t::value, diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index 28b3a06cbd4..b560de56dc7 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -241,7 +241,7 @@ template , detail::three_way_partition::per_partition_offset_t>> + policy_hub, detail::three_way_partition::per_partition_offset_t>> struct DispatchThreeWayPartitionIf { /***************************************************************************** diff --git 
a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index e30c33a546d..9120319c49d 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -79,15 +79,16 @@ CUB_NAMESPACE_BEGIN * @tparam OffsetT * Signed integer type for global offsets */ -template , detail::value_t>> +template < + typename KeyInputIteratorT, + typename ValueInputIteratorT, + typename KeyOutputIteratorT, + typename ValueOutputIteratorT, + typename NumSelectedIteratorT, + typename EqualityOpT, + typename OffsetT, + typename PolicyHub = + detail::unique_by_key::policy_hub, detail::it_value_t>> struct DispatchUniqueByKey { /****************************************************************************** @@ -100,8 +101,8 @@ struct DispatchUniqueByKey }; // The input key and value type - using KeyT = typename std::iterator_traits::value_type; - using ValueT = typename std::iterator_traits::value_type; + using KeyT = detail::it_value_t; + using ValueT = detail::it_value_t; // Tile status descriptor interface type using ScanTileStateT = ScanTileState; diff --git a/cub/cub/device/dispatch/kernels/radix_sort.cuh b/cub/cub/device/dispatch/kernels/radix_sort.cuh index 65327df5ec4..d49cc11b2c8 100644 --- a/cub/cub/device/dispatch/kernels/radix_sort.cuh +++ b/cub/cub/device/dispatch/kernels/radix_sort.cuh @@ -22,6 +22,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN /****************************************************************************** @@ -98,8 +100,8 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? 
int(ChainedPolicyT::ActivePolicy::AltUp enum { - TILE_ITEMS = CUB_MAX(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD, - ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD) + TILE_ITEMS = _CUDA_VSTD::max(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD, + ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD) }; // Parameterize AgentRadixSortUpsweep type for the current configuration @@ -258,8 +260,8 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo enum { - TILE_ITEMS = CUB_MAX(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD, - ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD) + TILE_ITEMS = _CUDA_VSTD::max(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD, + ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD) }; // Parameterize AgentRadixSortDownsweep type for the current configuration diff --git a/cub/cub/device/dispatch/kernels/transform.cuh b/cub/cub/device/dispatch/kernels/transform.cuh index 51193346036..42ba275c07d 100644 --- a/cub/cub/device/dispatch/kernels/transform.cuh +++ b/cub/cub/device/dispatch/kernels/transform.cuh @@ -59,7 +59,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(It begin, int tile_size) { constexpr int prefetch_byte_stride = 128; // TODO(bgruber): should correspond to cache line size. Does this need to // be architecture dependent? 
- const int tile_size_bytes = tile_size * sizeof(value_t); + const int tile_size_bytes = tile_size * sizeof(it_value_t); // prefetch does not stall and unrolling just generates a lot of unnecessary computations and predicate handling #pragma unroll 1 for (int offset = threadIdx.x * prefetch_byte_stride; offset < tile_size_bytes; @@ -361,7 +361,7 @@ _CCCL_DEVICE void transform_kernel_impl( template union kernel_arg { - aligned_base_ptr> aligned_ptr; // first member is trivial + aligned_base_ptr> aligned_ptr; // first member is trivial It iterator; // may not be trivially [default|copy]-constructible static_assert(::cuda::std::is_trivial_v, ""); diff --git a/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh b/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh index b8d0a7557bd..b2fc266bfc9 100644 --- a/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh @@ -50,7 +50,7 @@ namespace adjacent_difference template struct policy_hub { - using ValueT = typename std::iterator_traits::value_type; + using ValueT = it_value_t; struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { diff --git a/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh index 2c93b1b1147..8064337ab28 100644 --- a/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh @@ -73,7 +73,7 @@ CUB_RUNTIME_FUNCTION MergeSortPolicyWrapper MakeMergeSortPolicyWrapper( template struct policy_hub { - using KeyT = value_t; + using KeyT = it_value_t; struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 7d7e4ecd738..321749532e2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -45,7 
+45,8 @@ #include #include -#include +#include +#include CUB_NAMESPACE_BEGIN @@ -871,9 +872,8 @@ struct policy_hub static constexpr int items_per_thread = (max_input_bytes <= 8) ? 6 - // TODO(bgruber): use ceil_div and clamp in C++14 - : CUB_MIN(nominal_4B_items_per_thread, - CUB_MAX(1, ((nominal_4B_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes)); + : ::cuda::std::clamp( + ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread); using ReduceByKeyPolicyT = AgentReduceByKeyPolicy<128, diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index 0e46a47d49d..9f272961ef2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -46,7 +46,8 @@ #include #include -#include +#include +#include CUB_NAMESPACE_BEGIN @@ -315,9 +316,8 @@ struct policy_hub static constexpr int items = (max_input_bytes <= 8) ? 
6 - // TODO(bgruber): use clamp() and ceil_div in C++14 - : CUB_MIN(nominal_4B_items_per_thread, - CUB_MAX(1, ((nominal_4B_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes)); + : ::cuda::std::clamp( + ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread); using ReduceByKeyPolicyT = AgentReduceByKeyPolicy<128, items, @@ -603,7 +603,7 @@ struct policy_hub static constexpr int nominal_4B_items_per_thread = 15; // TODO(bgruber): use clamp() in C++14 static constexpr int ITEMS_PER_THREAD = - CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(KeyT)))); + _CUDA_VSTD::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(KeyT)}, 1, nominal_4B_items_per_thread); using RleSweepPolicyT = AgentRlePolicy<96, ITEMS_PER_THREAD, diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index 3723f1d84f6..4695ce58d66 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -912,7 +912,7 @@ struct sm100_tuning struct policy_hub { - using key_t = value_t; + using key_t = it_value_t; static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(key_t), sizeof(AccumT))); static constexpr int combined_input_bytes = static_cast(sizeof(key_t) + sizeof(AccumT)); diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index d3f99ee8ee8..ea2971dd3d2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -45,6 +45,8 @@ #include #include +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -1498,9 +1500,8 @@ struct policy_hub struct DefaultPolicy { static constexpr int nominal_4B_items_per_thread = 10; - // TODO(bgruber): use cuda::std::clamp() in C++14 static constexpr int items_per_thread = - 
CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(InputT)))); + ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(InputT)}, 1, nominal_4B_items_per_thread); using SelectIfPolicyT = AgentSelectIfPolicy<128, items_per_thread, diff --git a/cub/cub/device/dispatch/tuning/tuning_transform.cuh b/cub/cub/device/dispatch/tuning/tuning_transform.cuh index 29c1a6d1a9b..ddd0482a69c 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform.cuh @@ -108,7 +108,7 @@ _CCCL_HOST_DEVICE constexpr int sum(int head, Ts... tail) template _CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int { - return (int{sizeof(value_t)} + ... + 0); + return (int{sizeof(it_value_t)} + ... + 0); } constexpr int bulk_copy_alignment = 128; @@ -144,7 +144,7 @@ struct policy_hub...>; static constexpr bool all_values_trivially_reloc = - ::cuda::std::conjunction_v>...>; + ::cuda::std::conjunction_v>...>; static constexpr bool can_memcpy = all_contiguous && all_values_trivially_reloc; @@ -169,7 +169,7 @@ struct policy_hub int{max_smem_per_block}; static constexpr bool any_type_is_overalinged = - ((alignof(value_t) > bulk_copy_alignment) || ...); + ((alignof(it_value_t) > bulk_copy_alignment) || ...); static constexpr bool use_fallback = RequiresStableAddress || !can_memcpy || no_input_streams || exhaust_smem || any_type_is_overalinged; diff --git a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh index 57af475f1ba..093a17207e2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh @@ -849,6 +849,6 @@ struct policy_hub template using DeviceUniqueByKeyPolicy CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and it will " "be removed.") = - detail::unique_by_key::policy_hub, detail::value_t>; + 
detail::unique_by_key::policy_hub, detail::it_value_t>; CUB_NAMESPACE_END diff --git a/cub/cub/iterator/arg_index_input_iterator.cuh b/cub/cub/iterator/arg_index_input_iterator.cuh index c3916fcb2bb..be5a0b14086 100644 --- a/cub/cub/iterator/arg_index_input_iterator.cuh +++ b/cub/cub/iterator/arg_index_input_iterator.cuh @@ -46,7 +46,6 @@ #include #include -#include #if !_CCCL_COMPILER(NVRTC) # include @@ -106,7 +105,7 @@ CUB_NAMESPACE_BEGIN */ template > + typename OutputValueT = detail::it_value_t> class ArgIndexInputIterator { public: diff --git a/cub/cub/iterator/cache_modified_input_iterator.cuh b/cub/cub/iterator/cache_modified_input_iterator.cuh index ab97fae3525..e5d456aa011 100644 --- a/cub/cub/iterator/cache_modified_input_iterator.cuh +++ b/cub/cub/iterator/cache_modified_input_iterator.cuh @@ -47,7 +47,6 @@ # include #else // ^^^ _CCCL_COMPILER(NVRTC) ^^^ // vvv !_CCCL_COMPILER(NVRTC) vvv # include -# include # include # include diff --git a/cub/cub/iterator/cache_modified_output_iterator.cuh b/cub/cub/iterator/cache_modified_output_iterator.cuh index fa4e501b80f..e447c27a9c2 100644 --- a/cub/cub/iterator/cache_modified_output_iterator.cuh +++ b/cub/cub/iterator/cache_modified_output_iterator.cuh @@ -47,7 +47,6 @@ #include #include -#include #include diff --git a/cub/cub/iterator/tex_obj_input_iterator.cuh b/cub/cub/iterator/tex_obj_input_iterator.cuh index b70819bde86..4be625c6fcc 100644 --- a/cub/cub/iterator/tex_obj_input_iterator.cuh +++ b/cub/cub/iterator/tex_obj_input_iterator.cuh @@ -48,7 +48,6 @@ #include #include -#include #include diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index 14b7269dfff..37c45d31536 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -106,7 +106,7 @@ enum CacheLoadModifier * [inferred] The input's iterator type \iterator */ template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(RandomAccessIterator itr); +_CCCL_DEVICE _CCCL_FORCEINLINE 
cub::detail::it_value_t ThreadLoad(RandomAccessIterator itr); //@} end member group @@ -306,7 +306,7 @@ _CUB_LOAD_ALL(LOAD_LDG, global.nc) * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types */ template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad( +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::it_value_t ThreadLoad( RandomAccessIterator itr, detail::constant_t /*modifier*/, ::cuda::std::false_type /*is_pointer*/) { return *itr; @@ -374,7 +374,7 @@ ThreadLoad(T const* ptr, detail::constant_t /*modifier*/, ::cuda::std: } template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(RandomAccessIterator itr) +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::it_value_t ThreadLoad(RandomAccessIterator itr) { return ThreadLoad( itr, detail::constant_v, detail::bool_constant_v<::cuda::std::is_pointer_v>); diff --git a/cub/cub/thread/thread_search.cuh b/cub/cub/thread/thread_search.cuh index 6471a55b351..dfd2e07f2c7 100644 --- a/cub/cub/thread/thread_search.cuh +++ b/cub/cub/thread/thread_search.cuh @@ -60,7 +60,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void MergePathSearch( OffsetT diagonal, AIteratorT a, BIteratorT b, OffsetT a_len, OffsetT b_len, CoordinateT& path_coordinate) { /// The value type of the input iterator - using T = cub::detail::value_t; + using T = cub::detail::it_value_t; OffsetT split_min = CUB_MAX(diagonal - b_len, 0); OffsetT split_max = CUB_MIN(diagonal, a_len); diff --git a/cub/cub/util_arch.cuh b/cub/cub/util_arch.cuh index 16552e12258..82228215b7d 100644 --- a/cub/cub/util_arch.cuh +++ b/cub/cub/util_arch.cuh @@ -137,15 +137,6 @@ struct MemBoundScaling }; } // namespace detail - -template -using RegBoundScaling CCCL_DEPRECATED_BECAUSE("Internal implementation detail") = - detail::RegBoundScaling; - -template -using MemBoundScaling CCCL_DEPRECATED_BECAUSE("Internal implementation detail") = - detail::RegBoundScaling; - #endif // Do not document CUB_NAMESPACE_END diff --git a/cub/cub/util_ptx.cuh 
b/cub/cub/util_ptx.cuh index 206e815a761..d53a6cdc70a 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -52,28 +52,6 @@ CUB_NAMESPACE_BEGIN * Inlined PTX intrinsics ******************************************************************************/ -/** - * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. - */ -CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") -_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend) -{ - unsigned int ret; - asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); - return ret; -} - -/** - * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. - */ -CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") -_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend) -{ - unsigned int ret; - asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); - return ret; -} - #ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** @@ -124,135 +102,8 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int BFE(UnsignedBits source, unsigned in return BFE(source, bit_start, num_bits, detail::constant_v); } -/** - * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. - */ -CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") -_CCCL_DEVICE _CCCL_FORCEINLINE void -BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits) -{ - asm("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); -} - -/** - * \brief Three-operand add. Returns \p x + \p y + \p z. 
- */ -CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") -_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) -{ - asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); - return x; -} - -/** - * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit - * destination register. For SM2.0 or later. - * - * \par - * The bytes in the two source registers \p a and \p b are numbered from 0 to 7: - * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes - * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within - * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0} - * - * \par Snippet - * The code snippet below illustrates byte-permute. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * int a = 0x03020100; - * int b = 0x07060504; - * int index = 0x00007531; - * - * int selected = PRMT(a, b, index); // 0x07050301 - * - * \endcode - * - */ -CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") -_CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned int index) -{ - int ret; - asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); - return ret; -} - #ifndef _CCCL_DOXYGEN_INVOKED // Do not document -/** - * Sync-threads barrier. 
- */ -CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") -_CCCL_DEVICE _CCCL_FORCEINLINE void BAR(int count) -{ - asm volatile("bar.sync 1, %0;" : : "r"(count)); -} - -/** - * CTA barrier - */ -CCCL_DEPRECATED_BECAUSE("use __syncthreads() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE void CTA_SYNC() -{ - __syncthreads(); -} - -/** - * CTA barrier with predicate - */ -CCCL_DEPRECATED_BECAUSE("use __syncthreads_and() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_AND(int p) -{ - return __syncthreads_and(p); -} - -/** - * CTA barrier with predicate - */ -CCCL_DEPRECATED_BECAUSE("use __syncthreads_or() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_OR(int p) -{ - return __syncthreads_or(p); -} - -/** - * Warp barrier - */ -CCCL_DEPRECATED_BECAUSE("use __syncwarp() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask) -{ - __syncwarp(member_mask); -} - -/** - * Warp any - */ -CCCL_DEPRECATED_BECAUSE("use __any_sync() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_mask) -{ - return __any_sync(member_mask, predicate); -} - -/** - * Warp any - */ -CCCL_DEPRECATED_BECAUSE("use __all_sync() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_mask) -{ - return __all_sync(member_mask, predicate); -} - -/** - * Warp ballot - */ -CCCL_DEPRECATED_BECAUSE("use __ballot_sync() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE int WARP_BALLOT(int predicate, unsigned int member_mask) -{ - return __ballot_sync(member_mask, predicate); -} - /** * Warp synchronous shfl_up */ @@ -277,50 +128,6 @@ SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member return word; } -/** - * Warp synchronous shfl_idx - */ -CCCL_DEPRECATED_BECAUSE("use __shfl_sync() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int -SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) -{ - asm volatile("shfl.sync.idx.b32 %0, %1, %2, 
%3, %4;" - : "=r"(word) - : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask)); - return word; -} - -/** - * Warp synchronous shfl_idx - */ -CCCL_DEPRECATED_BECAUSE("use __shfl_sync() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask) -{ - return __shfl_sync(member_mask, word, src_lane); -} - -/** - * Floating point multiply. (Mantissa LSB rounds towards zero.) - */ -CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") -_CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b) -{ - float d; - asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); - return d; -} - -/** - * Floating point multiply-add. (Mantissa LSB rounds towards zero.) - */ -CCCL_DEPRECATED_BECAUSE("will be removed in the next major release") -_CCCL_DEVICE _CCCL_FORCEINLINE float FFMA_RZ(float a, float b, float c) -{ - float d; - asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); - return d; -} - #endif // _CCCL_DOXYGEN_INVOKED /** @@ -331,15 +138,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadExit() asm volatile("exit;"); } -/** - * \brief Abort execution and generate an interrupt to the host CPU - */ -CCCL_DEPRECATED_BECAUSE("use cuda::std::terminate() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE void ThreadTrap() -{ - asm volatile("trap;"); -} - /** * \brief Returns the row-major linear thread identifier for a multidimensional thread block */ @@ -349,29 +147,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int RowMajorTid(int block_dim_x, int block_dim_y, + ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + threadIdx.x; } -/** - * \brief Returns the warp lane ID of the calling thread - */ -CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_laneid() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId() -{ - unsigned int ret; - asm("mov.u32 %0, %%laneid;" : "=r"(ret)); - return ret; -} - -/** - * \brief Returns the warp ID of the calling thread. 
Warp ID is guaranteed to be unique among warps, but may not - * correspond to a zero-based ranking within the thread block. - */ -CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_warpid() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int WarpId() -{ - unsigned int ret; - asm("mov.u32 %0, %%warpid;" : "=r"(ret)); - return ret; -} - /** * @brief Returns the warp mask for a warp of @p LOGICAL_WARP_THREADS threads * @@ -401,50 +176,6 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned int WarpMask(unsigned int warp_id) return member_mask; } -/** - * \brief Returns the warp lane mask of all lanes less than the calling thread - */ -CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_lt() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt() -{ - unsigned int ret; - asm("mov.u32 %0, %%lanemask_lt;" : "=r"(ret)); - return ret; -} - -/** - * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread - */ -CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_le() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe() -{ - unsigned int ret; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(ret)); - return ret; -} - -/** - * \brief Returns the warp lane mask of all lanes greater than the calling thread - */ -CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_gt() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt() -{ - unsigned int ret; - asm("mov.u32 %0, %%lanemask_gt;" : "=r"(ret)); - return ret; -} - -/** - * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread - */ -CCCL_DEPRECATED_BECAUSE("use cuda::ptx::get_sreg_lanemask_ge() instead") -_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGe() -{ - unsigned int ret; - asm("mov.u32 %0, %%lanemask_ge;" : "=r"(ret)); - return ret; -} - /** * @brief Shuffle-up for any data type. 
* Each warp-lanei obtains the value @p input contributed by diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index 916cf6571ba..073655e42dd 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -49,6 +49,7 @@ #include #include +#include #include #include @@ -70,12 +71,6 @@ _CCCL_DIAG_PUSH _CCCL_DIAG_POP #endif // _CCCL_HAS_NVFP8() -#if _CCCL_COMPILER(NVRTC) -# include -#else // ^^^ _CCCL_COMPILER(NVRTC) ^^^ // vvv !_CCCL_COMPILER(NVRTC) vvv -# include -#endif // _CCCL_COMPILER(NVRTC) - CUB_NAMESPACE_BEGIN /****************************************************************************** @@ -85,16 +80,29 @@ CUB_NAMESPACE_BEGIN #ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { -//! Alias to the given iterator's value_type. -// Aliases to std::iterator_traits, since users can specialize this template to provide traits for their iterators. We -// only defer to the libcu++ implementation for NVRTC. -template -using value_t = -# if _CCCL_COMPILER(NVRTC) - typename ::cuda::std::iterator_traits::value_type; -# else // ^^^ _CCCL_COMPILER(NVRTC) ^^^ // vvv !_CCCL_COMPILER(NVRTC) vvv - typename std::iterator_traits::value_type; -# endif // !_CCCL_COMPILER(NVRTC) +// the following iterator helpers are not named iter_value_t etc, like the C++20 facilities, because they are defined in +// terms of C++17 iterator_traits and not the new C++20 indirectly_readable trait etc. This allows them to detect nested +// value_type, difference_type and reference aliases, which the new C++20 traits do not consider (they only consider +// specializations of iterator_traits). Also, a value_type of void remains supported (needed by some output iterators).
+ +template +using it_value_t = typename ::cuda::std::iterator_traits::value_type; + +template +using it_reference_t = typename ::cuda::std::iterator_traits::reference; + +template +using it_difference_t = typename ::cuda::std::iterator_traits::difference_type; + +template +using it_pointer_t = typename ::cuda::std::iterator_traits::pointer; + +// use this whenever you need to lazily evaluate a trait. E.g., as an alternative in replace_if_use_default. +template