From eca53a22ed3e4adc8a433a113a302fa5d823dc32 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 24 Feb 2025 13:40:48 +0100 Subject: [PATCH 01/17] Minimize usage of cub::Traits (#3863) * Replace all uses of cub::Traits other than radix sort key twiddling by numeric_limits * Drop obsolete specializations of cub::NumericTraits * Fix radix sort custom type example mentioning non-existent cub::RadixTraits * Replace cub::BaseTraits and cub::Traits by aliases so uses can no longer specialize it * Deprecate cub::Traits::Max|Lowest * Extend documentation of trait classes * Fix: readd template parameter to cub::BaseTraits to specify whether a type is primitive * Make is_primitive depend on cub::Traits again Fixes: #920 --- c2h/include/c2h/bfloat16.cuh | 4 +- c2h/include/c2h/half.cuh | 5 +- c2h/include/c2h/test_util_vec.h | 19 +- cub/benchmarks/bench/reduce/arg_extrema.cu | 5 +- cub/cub/device/device_reduce.cuh | 23 +- cub/cub/device/device_segmented_reduce.cuh | 21 +- cub/cub/util_type.cuh | 225 +++++++++++------- .../example_device_radix_sort_custom.cu | 12 +- cub/test/catch2_test_device_reduce.cuh | 23 -- .../catch2_test_device_segmented_sort_keys.cu | 2 + cub/test/catch2_test_util_type.cu | 27 +++ cub/test/test_util.h | 42 ---- 12 files changed, 196 insertions(+), 212 deletions(-) diff --git a/c2h/include/c2h/bfloat16.cuh b/c2h/include/c2h/bfloat16.cuh index 6767850b373..de7f57158f6 100644 --- a/c2h/include/c2h/bfloat16.cuh +++ b/c2h/include/c2h/bfloat16.cuh @@ -266,12 +266,10 @@ public: }; _LIBCUDACXX_END_NAMESPACE_STD -_CCCL_SUPPRESS_DEPRECATED_PUSH template <> struct CUB_NS_QUALIFIER::NumericTraits - : CUB_NS_QUALIFIER::BaseTraits + : CUB_NS_QUALIFIER::BaseTraits {}; -_CCCL_SUPPRESS_DEPRECATED_POP #ifdef __GNUC__ # pragma GCC diagnostic pop diff --git a/c2h/include/c2h/half.cuh b/c2h/include/c2h/half.cuh index 4a30202fe3a..b29f3104a84 100644 --- a/c2h/include/c2h/half.cuh +++ b/c2h/include/c2h/half.cuh @@ -361,11 +361,10 @@ public: }; _LIBCUDACXX_END_NAMESPACE_STD -_CCCL_SUPPRESS_DEPRECATED_PUSH template <> -struct CUB_NS_QUALIFIER::NumericTraits : CUB_NS_QUALIFIER::BaseTraits +struct CUB_NS_QUALIFIER::NumericTraits + : CUB_NS_QUALIFIER::BaseTraits {}; -_CCCL_SUPPRESS_DEPRECATED_POP #ifdef __GNUC__ # pragma GCC diagnostic pop diff --git a/c2h/include/c2h/test_util_vec.h b/c2h/include/c2h/test_util_vec.h index 42e5a33ef7e..01022be9777 100644 --- a/c2h/include/c2h/test_util_vec.h +++ b/c2h/include/c2h/test_util_vec.h @@ -289,7 +289,7 @@ C2H_VEC_OVERLOAD(ulonglong, unsigned long long) C2H_VEC_OVERLOAD(float, float) C2H_VEC_OVERLOAD(double, double) -// Specialize cub::NumericTraits and cuda::std::numeric_limits for vector types. +// Specialize cuda::std::numeric_limits for vector types. # define REPEAT_TO_LIST_1(a) a # define REPEAT_TO_LIST_2(a) a, a @@ -298,23 +298,6 @@ C2H_VEC_OVERLOAD(double, double) # define REPEAT_TO_LIST(N, a) _CCCL_PP_CAT(REPEAT_TO_LIST_, N)(a) # define C2H_VEC_TRAITS_OVERLOAD_IMPL(T, BaseT, N) \ - CUB_NAMESPACE_BEGIN \ - template <> \ - struct NumericTraits \ - { \ - static __host__ __device__ T Max() \ - { \ - T retval = {REPEAT_TO_LIST(N, NumericTraits::Max())}; \ - return retval; \ - } \ - static __host__ __device__ T Lowest() \ - { \ - T retval = {REPEAT_TO_LIST(N, NumericTraits::Lowest())}; \ - return retval; \ - } \ - }; \ - CUB_NAMESPACE_END \ - \ _LIBCUDACXX_BEGIN_NAMESPACE_STD \ template <> \ class numeric_limits \ diff --git a/cub/benchmarks/bench/reduce/arg_extrema.cu b/cub/benchmarks/bench/reduce/arg_extrema.cu index 428dc8b2538..37adfe275ab 100644 --- a/cub/benchmarks/bench/reduce/arg_extrema.cu +++ b/cub/benchmarks/bench/reduce/arg_extrema.cu @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -57,7 +58,9 @@ struct policy_hub_t // Type used for the final result using output_tuple_t = cub::KeyValuePair; - auto const init = ::cuda::std::is_same_v ? cub::Traits::Max() : cub::Traits::Lowest(); + auto const init = ::cuda::std::is_same_v + ? ::cuda::std::numeric_limits::max() + : ::cuda::std::numeric_limits::lowest(); #if !TUNE_BASE using policy_t = policy_hub_t; diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 4a57ca3d7f2..93eb2024f67 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -51,6 +51,8 @@ #include +#include + #include CUB_NAMESPACE_BEGIN @@ -334,7 +336,7 @@ struct DeviceReduce //! @rst //! Computes a device-wide minimum using the less-than (``<``) operator. //! - //! - Uses ``std::numeric_limits::max()`` as the initial value of the reduction. + //! - Uses ``::cuda::std::numeric_limits::max()`` as the initial value of the reduction. //! - Does not support ``<`` operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. @@ -433,8 +435,7 @@ struct DeviceReduce d_out, static_cast(num_items), ::cuda::minimum<>{}, - // TODO(bgruber): replace with ::cuda::std::numeric_limits::max() (breaking change) - Traits::Max(), + ::cuda::std::numeric_limits::max(), stream); } @@ -583,7 +584,7 @@ struct DeviceReduce //! (assuming the value type of ``d_in`` is ``T``) //! //! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``. - //! - The ``{1, std::numeric_limits::max()}`` tuple is produced for zero-length inputs + //! - The ``{1, ::cuda::std::numeric_limits::max()}`` tuple is produced for zero-length inputs //! //! - Does not support ``<`` operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction @@ -690,8 +691,7 @@ struct DeviceReduce ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - // TODO Address https://github.com/NVIDIA/cub/issues/651 - InitT initial_value{AccumT(1, Traits::Max())}; + InitT initial_value{AccumT(1, ::cuda::std::numeric_limits::max())}; return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream); @@ -700,7 +700,7 @@ struct DeviceReduce //! @rst //! Computes a device-wide maximum using the greater-than (``>``) operator. //! - //! - Uses ``std::numeric_limits::lowest()`` as the initial value of the reduction. + //! - Uses ``::cuda::std::numeric_limits::lowest()`` as the initial value of the reduction. //! - Does not support ``>`` operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. @@ -796,8 +796,7 @@ struct DeviceReduce d_out, static_cast(num_items), ::cuda::maximum<>{}, - // TODO(bgruber): replace with ::cuda::std::numeric_limits::lowest() (breaking change) - Traits::Lowest(), + ::cuda::std::numeric_limits::lowest(), stream); } @@ -948,7 +947,7 @@ struct DeviceReduce //! //! - The maximum is written to ``d_out.value`` and its offset in the input //! array is written to ``d_out.key``. - //! - The ``{1, std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs + //! - The ``{1, ::cuda::std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs //! //! - Does not support ``>`` operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction @@ -1057,9 +1056,7 @@ struct DeviceReduce ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - // TODO Address https://github.com/NVIDIA/cub/issues/651 - // TODO(bgruber): replace with ::cuda::std::numeric_limits::lowest() (breaking change) - InitT initial_value{AccumT(1, Traits::Lowest())}; + InitT initial_value{AccumT(1, ::cuda::std::numeric_limits::lowest())}; return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream); diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index 642f2521d97..551c487a847 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -49,6 +49,7 @@ #include #include +#include #include #include @@ -392,7 +393,7 @@ public: //! @rst //! Computes a device-wide segmented minimum using the less-than (``<``) operator. //! - //! - Uses ``std::numeric_limits::max()`` as the initial value of the reduction for each segment. + //! - Uses ``::cuda::std::numeric_limits::max()`` as the initial value of the reduction for each segment. //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is @@ -508,8 +509,7 @@ public: d_begin_offsets, d_end_offsets, ::cuda::minimum<>{}, - // TODO(bgruber): replace with ::cuda::std::numeric_limits::max() (breaking change) - Traits::Max(), + ::cuda::std::numeric_limits::max(), stream); } @@ -522,7 +522,7 @@ public: //! //! - The minimum of the *i*\ :sup:`th` segment is written to //! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``. - //! - The ``{1, std::numeric_limits::max()}`` tuple is produced for zero-length inputs + //! - The ``{1, ::cuda::std::numeric_limits::max()}`` tuple is produced for zero-length inputs //! //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both @@ -636,8 +636,7 @@ public: ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - // TODO Address https://github.com/NVIDIA/cub/issues/651 - InitT initial_value{AccumT(1, Traits::Max())}; + InitT initial_value{AccumT(1, ::cuda::std::numeric_limits::max())}; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); @@ -666,7 +665,7 @@ public: //! @rst //! Computes a device-wide segmented maximum using the greater-than (``>``) operator. //! - //! - Uses ``std::numeric_limits::lowest()`` as the initial value of the reduction. + //! - Uses ``::cuda::std::numeric_limits::lowest()`` as the initial value of the reduction. //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where @@ -771,8 +770,7 @@ public: d_begin_offsets, d_end_offsets, ::cuda::maximum<>{}, - // TODO(bgruber): replace with ::cuda::std::numeric_limits::lowest() (breaking change) - Traits::Lowest(), + ::cuda::std::numeric_limits::lowest(), stream); } @@ -785,7 +783,7 @@ public: //! //! - The maximum of the *i*\ :sup:`th` segment is written to //! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``. - //! - The ``{1, std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs + //! - The ``{1, ::cuda::std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs //! //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased @@ -902,8 +900,7 @@ public: ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - // TODO Address https://github.com/NVIDIA/cub/issues/651 - InitT initial_value{AccumT(1, Traits::Lowest())}; + InitT initial_value{AccumT(1, ::cuda::std::numeric_limits::lowest())}; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index 916cf6571ba..04d0062f048 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -780,18 +780,21 @@ enum Category FLOATING_POINT }; -/** - * \brief Basic type traits - */ -template +namespace detail +{ +struct is_primitive_impl; + +template struct BaseTraits -{}; +{ +private: + friend struct is_primitive_impl; + + static constexpr bool is_primitive = _PRIMITIVE; +}; -/** - * Basic type traits (unsigned primitive specialization) - */ template -struct BaseTraits +struct BaseTraits { static_assert(::cuda::std::numeric_limits::is_specialized, "Please also specialize cuda::std::numeric_limits for T"); @@ -810,6 +813,8 @@ struct BaseTraits return key; } + //! deprecated [Since 3.0] + CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits::max()") static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Max() { UnsignedBits retval_bits = MAX_KEY; @@ -818,6 +823,8 @@ struct BaseTraits return retval; } + //! deprecated [Since 3.0] + CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits::lowest()") static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Lowest() { UnsignedBits retval_bits = LOWEST_KEY; @@ -825,13 +832,15 @@ struct BaseTraits memcpy(&retval, &retval_bits, sizeof(T)); return retval; } + +private: + friend struct is_primitive_impl; + + static constexpr bool is_primitive = true; }; -/** - * Basic type traits (signed primitive specialization) - */ template -struct BaseTraits +struct BaseTraits { static_assert(::cuda::std::numeric_limits::is_specialized, "Please also specialize cuda::std::numeric_limits for T"); @@ -852,27 +861,35 @@ struct BaseTraits return key ^ HIGH_BIT; }; + //! deprecated [Since 3.0] + CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits::max()") static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Max() { UnsignedBits retval = MAX_KEY; return reinterpret_cast(retval); } + //! deprecated [Since 3.0] + CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits::lowest()") static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Lowest() { UnsignedBits retval = LOWEST_KEY; return reinterpret_cast(retval); } + +private: + friend struct is_primitive_impl; + + static constexpr bool is_primitive = true; }; -/** - * Basic type traits (fp primitive specialization) - */ template -struct BaseTraits +struct BaseTraits { static_assert(::cuda::std::numeric_limits::is_specialized, "Please also specialize cuda::std::numeric_limits for T"); + static_assert(::cuda::is_floating_point::value, "Please also specialize cuda::is_floating_point for T"); + static_assert(::cuda::is_floating_point_v, "Please also specialize cuda::is_floating_point_v for T"); using UnsignedBits = _UnsignedBits; @@ -892,48 +909,66 @@ struct BaseTraits return key ^ mask; }; + //! deprecated [Since 3.0] + CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits::max()") static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Max() { return ::cuda::std::numeric_limits::max(); } + //! deprecated [Since 3.0] + CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits::lowest()") static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Lowest() { return ::cuda::std::numeric_limits::lowest(); } -}; -/** - * \brief Numeric type traits - */ -// clang-format off -template struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits<(::cuda::std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, unsigned char, char> {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; +private: + friend struct is_primitive_impl; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; + static constexpr bool is_primitive = true; +}; +} // namespace detail +//! Use this class as base when specializing \ref NumericTraits for primitive signed/unsigned integers or floating-point +//! types. +template +using BaseTraits = detail::BaseTraits<_CATEGORY, _PRIMITIVE, _UnsignedBits, T>; + +//! Numeric type traits for radix sort key operations, decoupled lookback and tuning. You can specialize this template +//! for your own types if: +//! * There is an unsigned integral type of equal size +//! * The size of the type is smaller than 64bits +//! * The arithmetic throughput of the type is similar to other built-in types of the same size +//! For other types, if you want to use them with radix sort, please use the decomposer interface of the radix sort. +// clang-format off +template struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits<(::cuda::std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, unsigned char, char> {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +// clang-format on -#if _CCCL_HAS_INT128() +# if _CCCL_HAS_INT128() template <> struct NumericTraits<__uint128_t> { - using T = __uint128_t; + using T = __uint128_t; using UnsignedBits = __uint128_t; - static constexpr UnsignedBits LOWEST_KEY = UnsignedBits(0); - static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1); + static constexpr UnsignedBits LOWEST_KEY = UnsignedBits(0); + static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1); static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleIn(UnsignedBits key) { @@ -945,26 +980,35 @@ struct NumericTraits<__uint128_t> return key; } + //! deprecated [Since 3.0] + CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits::max()") static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Max() { return MAX_KEY; } + //! deprecated [Since 3.0] + CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits::lowest()") static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Lowest() { return LOWEST_KEY; } + +private: + friend struct detail::is_primitive_impl; + + static constexpr bool is_primitive = false; }; template <> struct NumericTraits<__int128_t> { - using T = __int128_t; + using T = __int128_t; using UnsignedBits = __uint128_t; - static constexpr UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); - static constexpr UnsignedBits LOWEST_KEY = HIGH_BIT; - static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + static constexpr UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + static constexpr UnsignedBits LOWEST_KEY = HIGH_BIT; + static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleIn(UnsignedBits key) { @@ -976,83 +1020,82 @@ struct NumericTraits<__int128_t> return key ^ HIGH_BIT; }; + //! deprecated [Since 3.0] + CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits::max()") static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Max() { UnsignedBits retval = MAX_KEY; return reinterpret_cast(retval); } + //! deprecated [Since 3.0] + CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits::lowest()") static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Lowest() { UnsignedBits retval = LOWEST_KEY; return reinterpret_cast(retval); } + +private: + friend struct detail::is_primitive_impl; + + static constexpr bool is_primitive = false; }; -#endif // _CCCL_HAS_INT128() +# endif // _CCCL_HAS_INT128() -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; +// clang-format off +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; # if _CCCL_HAS_NVFP16() - template <> struct NumericTraits<__half> : BaseTraits {}; + template <> struct NumericTraits<__half> : BaseTraits {}; # endif // _CCCL_HAS_NVFP16() # if _CCCL_HAS_NVBF16() - template <> struct NumericTraits<__nv_bfloat16> : BaseTraits {}; + template <> struct NumericTraits<__nv_bfloat16> : BaseTraits {}; # endif // _CCCL_HAS_NVBF16() #if _CCCL_HAS_NVFP8() - template <> struct NumericTraits<__nv_fp8_e4m3> : BaseTraits {}; - template <> struct NumericTraits<__nv_fp8_e5m2> : BaseTraits {}; + template <> struct NumericTraits<__nv_fp8_e4m3> : BaseTraits {}; + template <> struct NumericTraits<__nv_fp8_e5m2> : BaseTraits {}; #endif // _CCCL_HAS_NVFP8() -template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; +template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; // clang-format on -/** - * \brief Type traits - */ +namespace detail +{ template struct Traits : NumericTraits<::cuda::std::remove_cv_t> {}; +} // namespace detail + +//! \brief Query type traits for radix sort key operations, decoupled lookback and tunings. To add support for your own +//! primitive types please specialize \ref NumericTraits. +template +using Traits = detail::Traits; namespace detail { -// __uint128_t and __int128_t are not primitive +// we cannot befriend is_primitive on GCC < 11, since it's a template (bug) +struct is_primitive_impl +{ + // must be a struct instead of an alias, so the access of Traits::is_primitive happens in the context of this class + template + struct is_primitive : ::cuda::std::bool_constant::is_primitive> + {}; +}; +// This trait serves two purposes: +// 1. It is used for tunings to detect whether we have a build-in arithmetic type for which we can expect certain +// arithmetic throughput. E.g.: we expect all primitive types of the same size to show roughly similar performance. +// 2. Decoupled lookback uses this trait to determine whether there is a machine word twice the size of T which can be +// loaded/stored with a single instruction. +// TODO(bgruber): for 2. we should probably just check whether sizeof(T) * 2 <= sizeof(int128) (or 256-bit on SM100) +// Users must be able to hook into both scenarios with their custom types, so this trait must depend on cub::Traits template -using is_primitive = ::cuda::std::bool_constant()>; - -# ifndef _CCCL_NO_VARIABLE_TEMPLATES +struct is_primitive : is_primitive_impl::is_primitive +{}; + template -inline constexpr bool is_primitive_v = is_primitive::value; -# endif // !_CCCL_NO_VARIABLE_TEMPLATES +_CCCL_INLINE_VAR constexpr bool is_primitive_v = is_primitive::value; } // namespace detail #endif // _CCCL_DOXYGEN_INVOKED diff --git a/cub/examples/device/example_device_radix_sort_custom.cu b/cub/examples/device/example_device_radix_sort_custom.cu index d0bcf920fbb..00a892f40f0 100644 --- a/cub/examples/device/example_device_radix_sort_custom.cu +++ b/cub/examples/device/example_device_radix_sort_custom.cu @@ -114,10 +114,10 @@ int main() std::cout << "l:\t" << to_binary_representation(l) << '\n'; std::cout << "g:\t" << to_binary_representation(g) << "\n\n"; - std::cout << "As you can see, `l` key happened to be larger in the bit-lexicographicl order.\n"; - std::cout << "Since there's no reflection in C++, we can't inspect the type and convert \n"; - std::cout << "each field into the bit-lexicographicl order. You can tell CUB how to do that\n"; - std::cout << "by specializing cub::RadixTraits for the `custom_t`:\n\n"; + std::cout << "As you can see, `l` key happened to be larger in the bit-lexicographical order.\n"; + std::cout << "Since there's no reflection in C++ (yet), we can't inspect the type and convert \n"; + std::cout << "each field into the bit-lexicographical order. You can tell CUB how to do that\n"; + std::cout << "by providing a decomposer for the `custom_t`:\n\n"; std::cout << "\tstruct decomposer_t \n"; std::cout << "\t{\n"; @@ -132,7 +132,7 @@ int main() std::cout << "Decomposer allows you to specify which fields are most significant and which\n"; std::cout << "are least significant. In our case, `f` is the most significant field and\n"; std::cout << "`i` is the least significant field. The decomposer is then used by CUB to convert\n"; - std::cout << "the `custom_t` into the bit-lexicographicl order:\n\n"; + std::cout << "the `custom_t` into the bit-lexicographical order:\n\n"; using conversion_policy = cub::detail::radix::traits_t::bit_ordered_conversion_policy; l = conversion_policy::to_bit_ordered(decomposer_t{}, l); @@ -148,7 +148,7 @@ int main() std::cout << "g:\t" << to_binary_representation(g) << "\n\n"; std::cout << '\n'; - std::cout << "As you can see, `g` is now actually larger than `l` in the bit-lexicographicl order.\n"; + std::cout << "As you can see, `g` is now actually larger than `l` in the bit-lexicographical order.\n"; std::cout << "After binning, CUB is able to restore the original key:\n\n"; l = conversion_policy::from_bit_ordered(decomposer_t{}, l); diff --git a/cub/test/catch2_test_device_reduce.cuh b/cub/test/catch2_test_device_reduce.cuh index 5760aa93536..a33a2a02439 100644 --- a/cub/test/catch2_test_device_reduce.cuh +++ b/cub/test/catch2_test_device_reduce.cuh @@ -114,29 +114,6 @@ CUB_NAMESPACE_END CUB_NAMESPACE_BEGIN -// TODO(bgruber): drop this when we drop cub::Traits -template