From a15fd10bb0702765f68f59f0a9ed7540f9b2d3e0 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 17 Feb 2025 17:38:50 +0100 Subject: [PATCH] Replace cub::Traits by numeric_limits and deprecate * Consistently use ::cuda::std::numeric_limits in CUB Fixes: #3381 --- c2h/include/c2h/bfloat16.cuh | 22 ++- c2h/include/c2h/half.cuh | 22 ++- cub/cub/agent/agent_sub_warp_merge_sort.cuh | 2 +- cub/cub/block/radix_rank_sort_operations.cuh | 36 ++-- cub/cub/util_type.cuh | 159 +++++++++++++++--- .../warp/specializations/warp_scan_shfl.cuh | 1 + cub/test/catch2_radix_sort_helper.cuh | 8 +- cub/test/catch2_test_block_radix_sort.cu | 3 +- cub/test/catch2_test_device_histogram.cu | 5 +- .../catch2_test_device_radix_sort_keys.cu | 2 +- cub/test/test_device_batch_copy.cu | 6 +- cub/test/test_util.h | 2 +- .../include/cuda/std/__utility/typeid.h | 1 - 13 files changed, 199 insertions(+), 70 deletions(-) diff --git a/c2h/include/c2h/bfloat16.cuh b/c2h/include/c2h/bfloat16.cuh index de7f57158f6..2754ed1ea63 100644 --- a/c2h/include/c2h/bfloat16.cuh +++ b/c2h/include/c2h/bfloat16.cuh @@ -212,6 +212,10 @@ struct bfloat16_t } }; +#ifdef __GNUC__ +# pragma GCC diagnostic pop +#endif + /****************************************************************************** * I/O stream overloads ******************************************************************************/ @@ -230,7 +234,7 @@ inline std::ostream& operator<<(std::ostream& out, const __nv_bfloat16& x) } /****************************************************************************** - * Traits overloads + * traits and limits ******************************************************************************/ _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -238,10 +242,8 @@ template <> struct __is_extended_floating_point : true_type {}; -#ifndef _CCCL_NO_VARIABLE_TEMPLATES template <> _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v = true; -#endif // _CCCL_NO_VARIABLE_TEMPLATES template <> class numeric_limits @@ -267,10 +269,12 @@ public: _LIBCUDACXX_END_NAMESPACE_STD template <> -struct CUB_NS_QUALIFIER::NumericTraits - : CUB_NS_QUALIFIER::BaseTraits -{}; +struct CUB_NS_QUALIFIER::detail::unsigned_bits +{ + using type = unsigned short; +}; -#ifdef __GNUC__ -# pragma GCC diagnostic pop -#endif +// template <> +// struct CUB_NS_QUALIFIER::detail::NumericTraits +// : CUB_NS_QUALIFIER::detail::BaseTraits +// {}; diff --git a/c2h/include/c2h/half.cuh b/c2h/include/c2h/half.cuh index b29f3104a84..e49fbb35900 100644 --- a/c2h/include/c2h/half.cuh +++ b/c2h/include/c2h/half.cuh @@ -307,6 +307,10 @@ struct half_t } }; +#ifdef __GNUC__ +# pragma GCC diagnostic pop +#endif + /****************************************************************************** * I/O stream overloads ******************************************************************************/ @@ -325,7 +329,7 @@ inline std::ostream& operator<<(std::ostream& out, const __half& x) } /****************************************************************************** - * Traits overloads + * traits and limits ******************************************************************************/ _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -333,10 +337,8 @@ template <> struct __is_extended_floating_point : true_type {}; -#ifndef _CCCL_NO_VARIABLE_TEMPLATES template <> _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v = true; -#endif // _CCCL_NO_VARIABLE_TEMPLATES template <> class numeric_limits @@ -362,10 +364,12 @@ public: _LIBCUDACXX_END_NAMESPACE_STD template <> -struct CUB_NS_QUALIFIER::NumericTraits - : CUB_NS_QUALIFIER::BaseTraits -{}; +struct CUB_NS_QUALIFIER::detail::unsigned_bits +{ + using type = unsigned short; +}; -#ifdef __GNUC__ -# pragma GCC diagnostic pop -#endif +// template <> +// struct CUB_NS_QUALIFIER::detail::NumericTraits +// : CUB_NS_QUALIFIER::detail::BaseTraits +// {}; diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh index 7b5687ea504..a385d8c22f3 100644 --- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh +++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh @@ -161,7 +161,7 @@ class AgentSubWarpSort _CCCL_DEVICE static bool get_oob_default(::cuda::std::true_type /* is bool */) { - // Traits::MAX_KEY for `bool` is 0xFF which is different from `true` and makes + // key_traits::max_key for `bool` is 0xFF which is different from `true` and makes // comparison with oob unreliable. return !IS_DESCENDING; } diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh index aea8134632d..e6accb5ed28 100644 --- a/cub/cub/block/radix_rank_sort_operations.cuh +++ b/cub/cub/block/radix_rank_sort_operations.cuh @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include CUB_NAMESPACE_BEGIN @@ -77,8 +77,7 @@ CUB_NAMESPACE_BEGIN template > struct BaseDigitExtractor { - using TraitsT = Traits; - using UnsignedBits = typename TraitsT::UnsignedBits; + using UnsignedBits = typename key_traits::unsigned_bits; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key) { @@ -89,14 +88,13 @@ struct BaseDigitExtractor template struct BaseDigitExtractor { - using TraitsT = Traits; - using UnsignedBits = typename TraitsT::UnsignedBits; + using UnsignedBits = typename key_traits::unsigned_bits; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key) { UnsignedBits TWIDDLED_MINUS_ZERO_BITS = - TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1)); - UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0); + key_traits::twiddle_in(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1)); + UnsignedBits TWIDDLED_ZERO_BITS = key_traits::twiddle_in(0); return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key; } }; @@ -209,7 +207,7 @@ struct is_fundamental_type }; template -struct is_fundamental_type::UnsignedBits>> +struct is_fundamental_type::unsigned_bits>> { static constexpr bool value = true; }; @@ -233,23 +231,23 @@ using decomposer_check_t = is_tuple_of_references_to_fundamental_types_t struct bit_ordered_conversion_policy_t { - using bit_ordered_type = typename Traits::UnsignedBits; + using bit_ordered_type = typename key_traits::unsigned_bits; static _CCCL_HOST_DEVICE bit_ordered_type to_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val) { - return Traits::TwiddleIn(val); + return key_traits::twiddle_in(val); } static _CCCL_HOST_DEVICE bit_ordered_type from_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val) { - return Traits::TwiddleOut(val); + return key_traits::twiddle_out(val); } }; template struct bit_ordered_inversion_policy_t { - using bit_ordered_type = typename Traits::UnsignedBits; + using bit_ordered_type = typename key_traits::unsigned_bits; static _CCCL_HOST_DEVICE bit_ordered_type inverse(detail::identity_decomposer_t, bit_ordered_type val) { @@ -260,7 +258,7 @@ struct bit_ordered_inversion_policy_t template ::value> struct traits_t { - using bit_ordered_type = typename Traits::UnsignedBits; + using bit_ordered_type = typename key_traits::unsigned_bits; using bit_ordered_conversion_policy = bit_ordered_conversion_policy_t; using bit_ordered_inversion_policy = bit_ordered_inversion_policy_t; @@ -269,12 +267,20 @@ struct traits_t static _CCCL_HOST_DEVICE bit_ordered_type min_raw_binary_key(detail::identity_decomposer_t) { - return Traits::LOWEST_KEY; + // TODO(bgruber): sanity check, remove eventually + _CCCL_SUPPRESS_DEPRECATED_PUSH + static_assert(key_traits::lowest_key == Traits::LOWEST_KEY, ""); + _CCCL_SUPPRESS_DEPRECATED_POP + return key_traits::lowest_key; } static _CCCL_HOST_DEVICE bit_ordered_type max_raw_binary_key(detail::identity_decomposer_t) { - return Traits::MAX_KEY; + // TODO(bgruber): sanity check, remove eventually + _CCCL_SUPPRESS_DEPRECATED_PUSH + static_assert(key_traits::max_key == Traits::MAX_KEY, ""); + _CCCL_SUPPRESS_DEPRECATED_POP + return key_traits::max_key; } static _CCCL_HOST_DEVICE int default_end_bit(detail::identity_decomposer_t) diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index 04d0062f048..1c71951c1d7 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -50,7 +50,7 @@ #include #include -#include +#include #if _CCCL_HAS_NVFP16() # include @@ -765,13 +765,8 @@ struct BinaryOpHasIdxParam(), ::cuda::std::declval(), int{}))>> : ::cuda::std::true_type {}; -/****************************************************************************** - * Simple type traits utilities. - ******************************************************************************/ - -/** - * \brief Basic type traits categories - */ +namespace detail +{ enum Category { NOT_A_NUMBER, @@ -780,8 +775,6 @@ enum Category FLOATING_POINT }; -namespace detail -{ struct is_primitive_impl; template @@ -930,11 +923,18 @@ private: }; } // namespace detail +using Category CCCL_DEPRECATED_BECAUSE("Use instead") = detail::Category; + +_CCCL_SUPPRESS_DEPRECATED_PUSH //! Use this class as base when specializing \ref NumericTraits for primitive signed/unsigned integers or floating-point //! types. template -using BaseTraits = detail::BaseTraits<_CATEGORY, _PRIMITIVE, _UnsignedBits, T>; +using BaseTraits CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits and cuda::is_floating_point etc. instead") = + detail::BaseTraits<_CATEGORY, _PRIMITIVE, _UnsignedBits, T>; +_CCCL_SUPPRESS_DEPRECATED_POP +namespace detail +{ //! Numeric type traits for radix sort key operations, decoupled lookback and tuning. You can specialize this template //! for your own types if: //! * There is an unsigned integral type of equal size @@ -1047,31 +1047,32 @@ private: template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; # if _CCCL_HAS_NVFP16() - template <> struct NumericTraits<__half> : BaseTraits {}; +template <> struct NumericTraits<__half> : BaseTraits {}; # endif // _CCCL_HAS_NVFP16() # if _CCCL_HAS_NVBF16() - template <> struct NumericTraits<__nv_bfloat16> : BaseTraits {}; +template <> struct NumericTraits<__nv_bfloat16> : BaseTraits {}; # endif // _CCCL_HAS_NVBF16() #if _CCCL_HAS_NVFP8() - template <> struct NumericTraits<__nv_fp8_e4m3> : BaseTraits {}; - template <> struct NumericTraits<__nv_fp8_e5m2> : BaseTraits {}; +template <> struct NumericTraits<__nv_fp8_e4m3> : BaseTraits {}; +template <> struct NumericTraits<__nv_fp8_e5m2> : BaseTraits {}; #endif // _CCCL_HAS_NVFP8() template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; // clang-format on -namespace detail -{ template struct Traits : NumericTraits<::cuda::std::remove_cv_t> {}; } // namespace detail -//! \brief Query type traits for radix sort key operations, decoupled lookback and tunings. To add support for your own -//! primitive types please specialize \ref NumericTraits. template -using Traits = detail::Traits; +using NumericTraits CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits and cuda::is_floating_point etc. instead") = + detail::NumericTraits; + +template +using Traits + CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits and cuda::is_floating_point etc. instead") = detail::Traits; namespace detail { @@ -1096,8 +1097,124 @@ struct is_primitive : is_primitive_impl::is_primitive template _CCCL_INLINE_VAR constexpr bool is_primitive_v = is_primitive::value; + +//! Trait to get an unsigned integral type with the same size as T, exposed as a nested alias ::type. +template +struct unsigned_bits +{ + template + static auto h() + { + if constexpr (sizeof(U) == 1) + { + return uint8_t{}; + } + else if constexpr (sizeof(U) == 2) + { + return uint16_t{}; + } + else if constexpr (sizeof(U) == 4) + { + return uint32_t{}; + } + else if constexpr (sizeof(U) == 8) + { + return uint64_t{}; + } +# if _CCCL_HAS_INT128() + else if constexpr (sizeof(U) == 16) + { + return __uint128_t{}; + } + else + { + static_assert(!sizeof(U), "No unsigned type for this T"); + } + } + + using type = decltype(h()); +}; + +//! Alias to an unsigned integral type with the same size as T. +template +using unsigned_bits_t = typename unsigned_bits::type; } // namespace detail -#endif // _CCCL_DOXYGEN_INVOKED +// TODO(bgruber): find a better name for key_traits +template +struct key_traits; + +template +struct key_traits::value && ::cuda::std::is_unsigned::value) +# if _CCCL_HAS_INT128() + || ::cuda::std::is_same::value +# endif // _CCCL_HAS_INT128() + >> +{ + using unsigned_bits = detail::unsigned_bits_t; + static constexpr unsigned_bits lowest_key = unsigned_bits(0); + static constexpr unsigned_bits max_key = unsigned_bits(-1); + + static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_bits twiddle_in(unsigned_bits key) + { + return key; + } + + static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_bits twiddle_out(unsigned_bits key) + { + return key; + } +}; + +// test whether key_traits is a complete type (so bool is handled as unsigned above) +static_assert(sizeof(key_traits), ""); + +template +struct key_traits::value && ::cuda::std::is_signed::value) +# if _CCCL_HAS_INT128() + || ::cuda::std::is_same::value +# endif // _CCCL_HAS_INT128() + >> +{ + using unsigned_bits = detail::unsigned_bits_t; + static constexpr unsigned_bits high_bit = unsigned_bits(1) << ((sizeof(unsigned_bits) * CHAR_BIT) - 1); + static constexpr unsigned_bits lowest_key = high_bit; + static constexpr unsigned_bits max_key = unsigned_bits(-1) ^ lowest_key; + + static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_bits twiddle_in(unsigned_bits key) + { + return key ^ high_bit; + } + + static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_bits twiddle_out(unsigned_bits key) + { + return key ^ high_bit; + } +}; + +template +struct key_traits::value>> +{ + using unsigned_bits = detail::unsigned_bits_t; + static constexpr unsigned_bits high_bit = unsigned_bits(1) << ((sizeof(unsigned_bits) * CHAR_BIT) - 1); + static constexpr unsigned_bits lowest_key = unsigned_bits(-1); + static constexpr unsigned_bits max_key = unsigned_bits(-1) ^ high_bit; + + static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_bits twiddle_in(unsigned_bits key) + { + const unsigned_bits mask = (key & high_bit) ? unsigned_bits(-1) : high_bit; + return key ^ mask; + }; + + static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_bits twiddle_out(unsigned_bits key) + { + const unsigned_bits mask = (key & high_bit) ? high_bit : unsigned_bits(-1); + return key ^ mask; + } +}; + +# endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh index 201360c4cd0..dedf8249a69 100644 --- a/cub/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh @@ -49,6 +49,7 @@ #include #include +#include CUB_NAMESPACE_BEGIN namespace detail diff --git a/cub/test/catch2_radix_sort_helper.cuh b/cub/test/catch2_radix_sort_helper.cuh index 821927a0c05..ac196617843 100644 --- a/cub/test/catch2_radix_sort_helper.cuh +++ b/cub/test/catch2_radix_sort_helper.cuh @@ -218,8 +218,7 @@ c2h::host_vector get_striped_keys(const c2h::host_vector& h_keys, in c2h::host_vector h_striped_keys(h_keys); KeyT* h_striped_keys_data = thrust::raw_pointer_cast(h_striped_keys.data()); - using traits_t = cub::Traits; - using bit_ordered_t = typename traits_t::UnsignedBits; + using bit_ordered_t = typename cub::key_traits::unsigned_bits; const int num_bits = end_bit - begin_bit; @@ -237,7 +236,7 @@ c2h::host_vector get_striped_keys(const c2h::host_vector& h_keys, in } } - key = traits_t::TwiddleIn(key); + key = cub::key_traits::twiddle_in(key); if ((begin_bit > 0) || (end_bit < static_cast(sizeof(KeyT) * 8))) { @@ -291,8 +290,7 @@ c2h::host_vector get_permutation( c2h::host_vector h_permutation(h_keys.size()); thrust::sequence(h_permutation.begin(), h_permutation.end()); - using traits_t = cub::Traits; - using bit_ordered_t = typename traits_t::UnsignedBits; + using bit_ordered_t = typename cub::key_traits::unsigned_bits; auto bit_ordered_striped_keys = reinterpret_cast(thrust::raw_pointer_cast(h_striped_keys.data())); diff --git a/cub/test/catch2_test_block_radix_sort.cu b/cub/test/catch2_test_block_radix_sort.cu index b952799af7d..3d859ab34e9 100644 --- a/cub/test/catch2_test_block_radix_sort.cu +++ b/cub/test/catch2_test_block_radix_sort.cu @@ -104,8 +104,7 @@ bool binary_equal( { d_tmp = h_reference; - using traits_t = cub::Traits; - using bit_ordered_t = typename traits_t::UnsignedBits; + using bit_ordered_t = typename cub::key_traits::unsigned_bits; auto d_output_ptr = reinterpret_cast(thrust::raw_pointer_cast(d_output.data())); auto d_reference_ptr = reinterpret_cast(thrust::raw_pointer_cast(d_tmp.data())); diff --git a/cub/test/catch2_test_device_histogram.cu b/cub/test/catch2_test_device_histogram.cu index 6f9ada49d37..dbb5efcc6ed 100644 --- a/cub/test/catch2_test_device_histogram.cu +++ b/cub/test/catch2_test_device_histogram.cu @@ -27,13 +27,14 @@ ******************************************************************************/ #include +#include #include #include #include #include -#include +#include #include #include @@ -213,7 +214,7 @@ struct bit_and_anything template _CCCL_HOST_DEVICE auto operator()(const T& a, const T& b) const -> T { - using U = typename cub::Traits::UnsignedBits; + using U = typename cub::detail::unsigned_bits_t; return ::cuda::std::bit_cast(static_cast(::cuda::std::bit_cast(a) & ::cuda::std::bit_cast(b))); } }; diff --git a/cub/test/catch2_test_device_radix_sort_keys.cu b/cub/test/catch2_test_device_radix_sort_keys.cu index 45105cf0957..213618213f0 100644 --- a/cub/test/catch2_test_device_radix_sort_keys.cu +++ b/cub/test/catch2_test_device_radix_sort_keys.cu @@ -190,7 +190,7 @@ C2H_TEST("DeviceRadixSort::SortKeys: bit windows", "[keys][radix][sort][device]" C2H_TEST("DeviceRadixSort::SortKeys: negative zero handling", "[keys][radix][sort][device]", fp_key_types) { using key_t = c2h::get<0, TestType>; - using bits_t = typename cub::Traits::UnsignedBits; + using bits_t = typename cub::key_traits::unsigned_bits; constexpr std::size_t num_bits = sizeof(key_t) * CHAR_BIT; const key_t positive_zero = ::cuda::std::bit_cast(bits_t(0)); diff --git a/cub/test/test_device_batch_copy.cu b/cub/test/test_device_batch_copy.cu index 70da5de9d08..9340e59e771 100644 --- a/cub/test/test_device_batch_copy.cu +++ b/cub/test/test_device_batch_copy.cu @@ -55,9 +55,9 @@ template void GenerateRandomData( T* rand_out, const std::size_t num_items, - const T min_rand_val = ::cuda::std::numeric_limits::min(), - const T max_rand_val = ::cuda::std::numeric_limits::max(), - const std::uint_fast32_t seed = 320981U, + const T min_rand_val = ::cuda::std::numeric_limits::min(), + const T max_rand_val = ::cuda::std::numeric_limits::max(), + const std::uint_fast32_t seed = 320981U, std::enable_if_t && (sizeof(T) >= 2)>* = nullptr) { // initialize random number generator diff --git a/cub/test/test_util.h b/cub/test/test_util.h index 88552be32ac..e153d3d620f 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -580,7 +580,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T& value, s case RANDOM_MINUS_PLUS_ZERO: { // Replace roughly 1/128 of values with -0.0 or +0.0, and // generate the rest randomly - using UnsignedBits = typename CUB_NS_QUALIFIER::Traits::UnsignedBits; + using UnsignedBits = CUB_NS_QUALIFIER::detail::unsigned_bits_t; char c; RandomBits(c); if (c == 0) diff --git a/libcudacxx/include/cuda/std/__utility/typeid.h b/libcudacxx/include/cuda/std/__utility/typeid.h index ff6b6e32e31..11103834373 100644 --- a/libcudacxx/include/cuda/std/__utility/typeid.h +++ b/libcudacxx/include/cuda/std/__utility/typeid.h @@ -40,7 +40,6 @@ #ifndef _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR # include #endif -#include #include #include #include