From a15fd10bb0702765f68f59f0a9ed7540f9b2d3e0 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Mon, 17 Feb 2025 17:38:50 +0100
Subject: [PATCH] Replace cub::Traits by numeric_limits and deprecate

* Consistently use ::cuda::std::numeric_limits in CUB

Fixes: #3381
---
 c2h/include/c2h/bfloat16.cuh                  |  22 ++-
 c2h/include/c2h/half.cuh                      |  22 ++-
 cub/cub/agent/agent_sub_warp_merge_sort.cuh   |   2 +-
 cub/cub/block/radix_rank_sort_operations.cuh  |  36 ++--
 cub/cub/util_type.cuh                         | 159 +++++++++++++++---
 .../warp/specializations/warp_scan_shfl.cuh   |   1 +
 cub/test/catch2_radix_sort_helper.cuh         |   8 +-
 cub/test/catch2_test_block_radix_sort.cu      |   3 +-
 cub/test/catch2_test_device_histogram.cu      |   5 +-
 .../catch2_test_device_radix_sort_keys.cu     |   2 +-
 cub/test/test_device_batch_copy.cu            |   6 +-
 cub/test/test_util.h                          |   2 +-
 .../include/cuda/std/__utility/typeid.h       |   1 -
 13 files changed, 199 insertions(+), 70 deletions(-)

diff --git a/c2h/include/c2h/bfloat16.cuh b/c2h/include/c2h/bfloat16.cuh
index de7f57158f6..2754ed1ea63 100644
--- a/c2h/include/c2h/bfloat16.cuh
+++ b/c2h/include/c2h/bfloat16.cuh
@@ -212,6 +212,10 @@ struct bfloat16_t
   }
 };
 
+#ifdef __GNUC__
+#  pragma GCC diagnostic pop
+#endif
+
 /******************************************************************************
  * I/O stream overloads
  ******************************************************************************/
@@ -230,7 +234,7 @@ inline std::ostream& operator<<(std::ostream& out, const __nv_bfloat16& x)
 }
 
 /******************************************************************************
- * Traits overloads
+ * traits and limits
  ******************************************************************************/
 
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
@@ -238,10 +242,8 @@ template <>
 struct __is_extended_floating_point<bfloat16_t> : true_type
 {};
 
-#ifndef _CCCL_NO_VARIABLE_TEMPLATES
 template <>
 _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<bfloat16_t> = true;
-#endif // _CCCL_NO_VARIABLE_TEMPLATES
 
 template <>
 class numeric_limits<bfloat16_t>
@@ -267,10 +269,12 @@ public:
 _LIBCUDACXX_END_NAMESPACE_STD
 
 template <>
-struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
-    : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, true, unsigned short, bfloat16_t>
-{};
+struct CUB_NS_QUALIFIER::detail::unsigned_bits<bfloat16_t, void>
+{
+  using type = unsigned short;
+};
 
-#ifdef __GNUC__
-#  pragma GCC diagnostic pop
-#endif
+// template <>
+// struct CUB_NS_QUALIFIER::detail::NumericTraits<bfloat16_t>
+//     : CUB_NS_QUALIFIER::detail::BaseTraits<FLOATING_POINT, true, unsigned short, bfloat16_t>
+// {};
diff --git a/c2h/include/c2h/half.cuh b/c2h/include/c2h/half.cuh
index b29f3104a84..e49fbb35900 100644
--- a/c2h/include/c2h/half.cuh
+++ b/c2h/include/c2h/half.cuh
@@ -307,6 +307,10 @@ struct half_t
   }
 };
 
+#ifdef __GNUC__
+#  pragma GCC diagnostic pop
+#endif
+
 /******************************************************************************
  * I/O stream overloads
  ******************************************************************************/
@@ -325,7 +329,7 @@ inline std::ostream& operator<<(std::ostream& out, const __half& x)
 }
 
 /******************************************************************************
- * Traits overloads
+ * traits and limits
  ******************************************************************************/
 
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
@@ -333,10 +337,8 @@ template <>
 struct __is_extended_floating_point<half_t> : true_type
 {};
 
-#ifndef _CCCL_NO_VARIABLE_TEMPLATES
 template <>
 _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<half_t> = true;
-#endif // _CCCL_NO_VARIABLE_TEMPLATES
 
 template <>
 class numeric_limits<half_t>
@@ -362,10 +364,12 @@ public:
 _LIBCUDACXX_END_NAMESPACE_STD
 
 template <>
-struct CUB_NS_QUALIFIER::NumericTraits<half_t>
-    : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, true, unsigned short, half_t>
-{};
+struct CUB_NS_QUALIFIER::detail::unsigned_bits<half_t, void>
+{
+  using type = unsigned short;
+};
 
-#ifdef __GNUC__
-#  pragma GCC diagnostic pop
-#endif
+// template <>
+// struct CUB_NS_QUALIFIER::detail::NumericTraits<half_t>
+//     : CUB_NS_QUALIFIER::detail::BaseTraits<FLOATING_POINT, true, unsigned short, half_t>
+// {};
diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh
index 7b5687ea504..a385d8c22f3 100644
--- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh
+++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh
@@ -161,7 +161,7 @@ class AgentSubWarpSort
 
   _CCCL_DEVICE static bool get_oob_default(::cuda::std::true_type /* is bool */)
   {
-    // Traits<KeyT>::MAX_KEY for `bool` is 0xFF which is different from `true` and makes
+    // key_traits<KeyT>::max_key for `bool` is 0xFF which is different from `true` and makes
     // comparison with oob unreliable.
     return !IS_DESCENDING;
   }
diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh
index aea8134632d..e6accb5ed28 100644
--- a/cub/cub/block/radix_rank_sort_operations.cuh
+++ b/cub/cub/block/radix_rank_sort_operations.cuh
@@ -53,7 +53,7 @@
 #include <cuda/std/__algorithm/min.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/tuple>
-#include <cuda/std/type_traits>
+#include <cuda/type_traits>
 #include <cuda/type_traits>
 
 CUB_NAMESPACE_BEGIN
@@ -77,8 +77,7 @@ CUB_NAMESPACE_BEGIN
 template <typename KeyT, bool IsFP = ::cuda::is_floating_point_v<KeyT>>
 struct BaseDigitExtractor
 {
-  using TraitsT      = Traits<KeyT>;
-  using UnsignedBits = typename TraitsT::UnsignedBits;
+  using UnsignedBits = typename key_traits<KeyT>::unsigned_bits;
 
   static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
   {
@@ -89,14 +88,13 @@ struct BaseDigitExtractor
 template <typename KeyT>
 struct BaseDigitExtractor<KeyT, true>
 {
-  using TraitsT      = Traits<KeyT>;
-  using UnsignedBits = typename TraitsT::UnsignedBits;
+  using UnsignedBits = typename key_traits<KeyT>::unsigned_bits;
 
   static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
   {
     UnsignedBits TWIDDLED_MINUS_ZERO_BITS =
-      TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
-    UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0);
+      key_traits<KeyT>::twiddle_in(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
+    UnsignedBits TWIDDLED_ZERO_BITS = key_traits<KeyT>::twiddle_in(0);
     return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key;
   }
 };
@@ -209,7 +207,7 @@ struct is_fundamental_type
 };
 
 template <class T>
-struct is_fundamental_type<T, ::cuda::std::void_t<typename Traits<T>::UnsignedBits>>
+struct is_fundamental_type<T, ::cuda::std::void_t<typename key_traits<T>::unsigned_bits>>
 {
   static constexpr bool value = true;
 };
@@ -233,23 +231,23 @@ using decomposer_check_t = is_tuple_of_references_to_fundamental_types_t<invoke_
 template <class T>
 struct bit_ordered_conversion_policy_t
 {
-  using bit_ordered_type = typename Traits<T>::UnsignedBits;
+  using bit_ordered_type = typename key_traits<T>::unsigned_bits;
 
   static _CCCL_HOST_DEVICE bit_ordered_type to_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val)
   {
-    return Traits<T>::TwiddleIn(val);
+    return key_traits<T>::twiddle_in(val);
   }
 
   static _CCCL_HOST_DEVICE bit_ordered_type from_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val)
   {
-    return Traits<T>::TwiddleOut(val);
+    return key_traits<T>::twiddle_out(val);
   }
 };
 
 template <class T>
 struct bit_ordered_inversion_policy_t
 {
-  using bit_ordered_type = typename Traits<T>::UnsignedBits;
+  using bit_ordered_type = typename key_traits<T>::unsigned_bits;
 
   static _CCCL_HOST_DEVICE bit_ordered_type inverse(detail::identity_decomposer_t, bit_ordered_type val)
   {
@@ -260,7 +258,7 @@ struct bit_ordered_inversion_policy_t
 template <class T, bool = is_fundamental_type<T>::value>
 struct traits_t
 {
-  using bit_ordered_type              = typename Traits<T>::UnsignedBits;
+  using bit_ordered_type              = typename key_traits<T>::unsigned_bits;
   using bit_ordered_conversion_policy = bit_ordered_conversion_policy_t<T>;
   using bit_ordered_inversion_policy  = bit_ordered_inversion_policy_t<T>;
 
@@ -269,12 +267,20 @@ struct traits_t
 
   static _CCCL_HOST_DEVICE bit_ordered_type min_raw_binary_key(detail::identity_decomposer_t)
   {
-    return Traits<T>::LOWEST_KEY;
+    // TODO(bgruber): sanity check, remove eventually
+    _CCCL_SUPPRESS_DEPRECATED_PUSH
+    static_assert(key_traits<T>::lowest_key == Traits<T>::LOWEST_KEY, "");
+    _CCCL_SUPPRESS_DEPRECATED_POP
+    return key_traits<T>::lowest_key;
   }
 
   static _CCCL_HOST_DEVICE bit_ordered_type max_raw_binary_key(detail::identity_decomposer_t)
   {
-    return Traits<T>::MAX_KEY;
+    // TODO(bgruber): sanity check, remove eventually
+    _CCCL_SUPPRESS_DEPRECATED_PUSH
+    static_assert(key_traits<T>::max_key == Traits<T>::MAX_KEY, "");
+    _CCCL_SUPPRESS_DEPRECATED_POP
+    return key_traits<T>::max_key;
   }
 
   static _CCCL_HOST_DEVICE int default_end_bit(detail::identity_decomposer_t)
diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh
index 04d0062f048..1c71951c1d7 100644
--- a/cub/cub/util_type.cuh
+++ b/cub/cub/util_type.cuh
@@ -50,7 +50,7 @@
 
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
-#include <cuda/std/type_traits>
+#include <cuda/type_traits>
 
 #if _CCCL_HAS_NVFP16()
 #  include <cuda_fp16.h>
@@ -765,13 +765,8 @@ struct BinaryOpHasIdxParam<T,
                              ::cuda::std::declval<T>(), ::cuda::std::declval<T>(), int{}))>> : ::cuda::std::true_type
 {};
 
-/******************************************************************************
- * Simple type traits utilities.
- ******************************************************************************/
-
-/**
- * \brief Basic type traits categories
- */
+namespace detail
+{
 enum Category
 {
   NOT_A_NUMBER,
@@ -780,8 +775,6 @@ enum Category
   FLOATING_POINT
 };
 
-namespace detail
-{
 struct is_primitive_impl;
 
 template <Category _CATEGORY, bool _PRIMITIVE, typename _UnsignedBits, typename T>
@@ -930,11 +923,18 @@ private:
 };
 } // namespace detail
 
+using Category CCCL_DEPRECATED_BECAUSE("Use <cuda/std/type_traits> instead") = detail::Category;
+
+_CCCL_SUPPRESS_DEPRECATED_PUSH
 //! Use this class as base when specializing \ref NumericTraits for primitive signed/unsigned integers or floating-point
 //! types.
 template <Category _CATEGORY, bool _PRIMITIVE, typename _UnsignedBits, typename T>
-using BaseTraits = detail::BaseTraits<_CATEGORY, _PRIMITIVE, _UnsignedBits, T>;
+using BaseTraits CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits and cuda::is_floating_point etc. instead") =
+  detail::BaseTraits<_CATEGORY, _PRIMITIVE, _UnsignedBits, T>;
+_CCCL_SUPPRESS_DEPRECATED_POP
 
+namespace detail
+{
 //! Numeric type traits for radix sort key operations, decoupled lookback and tuning. You can specialize this template
 //! for your own types if:
 //! * There is an unsigned integral type of equal size
@@ -1047,31 +1047,32 @@ private:
 template <> struct NumericTraits<float> :               BaseTraits<FLOATING_POINT, true, unsigned int, float> {};
 template <> struct NumericTraits<double> :              BaseTraits<FLOATING_POINT, true, unsigned long long, double> {};
 #  if _CCCL_HAS_NVFP16()
-    template <> struct NumericTraits<__half> :          BaseTraits<FLOATING_POINT, true, unsigned short, __half> {};
+template <> struct NumericTraits<__half> :          BaseTraits<FLOATING_POINT, true, unsigned short, __half> {};
 #  endif // _CCCL_HAS_NVFP16()
 #  if _CCCL_HAS_NVBF16()
-    template <> struct NumericTraits<__nv_bfloat16> :   BaseTraits<FLOATING_POINT, true, unsigned short, __nv_bfloat16> {};
+template <> struct NumericTraits<__nv_bfloat16> :   BaseTraits<FLOATING_POINT, true, unsigned short, __nv_bfloat16> {};
 #  endif // _CCCL_HAS_NVBF16()
 
 #if _CCCL_HAS_NVFP8()
-    template <> struct NumericTraits<__nv_fp8_e4m3> :   BaseTraits<FLOATING_POINT, true, __nv_fp8_storage_t, __nv_fp8_e4m3> {};
-    template <> struct NumericTraits<__nv_fp8_e5m2> :   BaseTraits<FLOATING_POINT, true, __nv_fp8_storage_t, __nv_fp8_e5m2> {};
+template <> struct NumericTraits<__nv_fp8_e4m3> :   BaseTraits<FLOATING_POINT, true, __nv_fp8_storage_t, __nv_fp8_e4m3> {};
+template <> struct NumericTraits<__nv_fp8_e5m2> :   BaseTraits<FLOATING_POINT, true, __nv_fp8_storage_t, __nv_fp8_e5m2> {};
 #endif // _CCCL_HAS_NVFP8()
 
 template <> struct NumericTraits<bool> :                BaseTraits<UNSIGNED_INTEGER, true, typename UnitWord<bool>::VolatileWord, bool> {};
 // clang-format on
 
-namespace detail
-{
 template <typename T>
 struct Traits : NumericTraits<::cuda::std::remove_cv_t<T>>
 {};
 } // namespace detail
 
-//! \brief Query type traits for radix sort key operations, decoupled lookback and tunings. To add support for your own
-//! primitive types please specialize \ref NumericTraits.
 template <typename T>
-using Traits = detail::Traits<T>;
+using NumericTraits CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits and cuda::is_floating_point etc. instead") =
+  detail::NumericTraits<T>;
+
+template <typename T>
+using Traits
+  CCCL_DEPRECATED_BECAUSE("Use cuda::std::numeric_limits and cuda::is_floating_point etc. instead") = detail::Traits<T>;
 
 namespace detail
 {
@@ -1096,8 +1097,124 @@ struct is_primitive : is_primitive_impl::is_primitive<T>
 
 template <typename T>
 _CCCL_INLINE_VAR constexpr bool is_primitive_v = is_primitive<T>::value;
+
+//! Trait to get an unsigned integral type with the same size as T, exposed as a nested alias ::type.
+template <typename T>
+struct unsigned_bits
+{
+  template <typename U>
+  static auto h()
+  {
+    if constexpr (sizeof(U) == 1)
+    {
+      return uint8_t{};
+    }
+    else if constexpr (sizeof(U) == 2)
+    {
+      return uint16_t{};
+    }
+    else if constexpr (sizeof(U) == 4)
+    {
+      return uint32_t{};
+    }
+    else if constexpr (sizeof(U) == 8)
+    {
+      return uint64_t{};
+    }
+#  if _CCCL_HAS_INT128()
+    else if constexpr (sizeof(U) == 16)
+    {
+      return __uint128_t{};
+    }
+    else
+    {
+      static_assert(!sizeof(U), "No unsigned type for this T");
+    }
+  }
+
+  using type = decltype(h<T>());
+};
+
+//! Alias to an unsigned integral type with the same size as T.
+template <typename T>
+using unsigned_bits_t = typename unsigned_bits<T>::type;
 } // namespace detail
 
-#endif // _CCCL_DOXYGEN_INVOKED
+// TODO(bgruber): find a better name for key_traits
+template <typename T, typename SFINAE = void>
+struct key_traits;
+
+template <typename T>
+struct key_traits<T,
+                  ::cuda::std::enable_if_t<(::cuda::std::is_integral<T>::value && ::cuda::std::is_unsigned<T>::value)
+#    if _CCCL_HAS_INT128()
+                                           || ::cuda::std::is_same<T, __uint128_t>::value
+#    endif // _CCCL_HAS_INT128()
+                                           >>
+{
+  using unsigned_bits                       = detail::unsigned_bits_t<T>;
+  static constexpr unsigned_bits lowest_key = unsigned_bits(0);
+  static constexpr unsigned_bits max_key    = unsigned_bits(-1);
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_bits twiddle_in(unsigned_bits key)
+  {
+    return key;
+  }
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_bits twiddle_out(unsigned_bits key)
+  {
+    return key;
+  }
+};
+
+// test whether key_traits<bool> is a complete type (so bool is handled as unsigned above)
+static_assert(sizeof(key_traits<bool>), "");
+
+template <typename T>
+struct key_traits<T,
+                  ::cuda::std::enable_if_t<(::cuda::std::is_integral<T>::value && ::cuda::std::is_signed<T>::value)
+#    if _CCCL_HAS_INT128()
+                                           || ::cuda::std::is_same<T, __int128_t>::value
+#    endif // _CCCL_HAS_INT128()
+                                           >>
+{
+  using unsigned_bits                       = detail::unsigned_bits_t<T>;
+  static constexpr unsigned_bits high_bit   = unsigned_bits(1) << ((sizeof(unsigned_bits) * CHAR_BIT) - 1);
+  static constexpr unsigned_bits lowest_key = high_bit;
+  static constexpr unsigned_bits max_key    = unsigned_bits(-1) ^ lowest_key;
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_bits twiddle_in(unsigned_bits key)
+  {
+    return key ^ high_bit;
+  }
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_bits twiddle_out(unsigned_bits key)
+  {
+    return key ^ high_bit;
+  }
+};
+
+template <typename T>
+struct key_traits<T, ::cuda::std::enable_if_t<::cuda::is_floating_point<T>::value>>
+{
+  using unsigned_bits                       = detail::unsigned_bits_t<T>;
+  static constexpr unsigned_bits high_bit   = unsigned_bits(1) << ((sizeof(unsigned_bits) * CHAR_BIT) - 1);
+  static constexpr unsigned_bits lowest_key = unsigned_bits(-1);
+  static constexpr unsigned_bits max_key    = unsigned_bits(-1) ^ high_bit;
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_bits twiddle_in(unsigned_bits key)
+  {
+    const unsigned_bits mask = (key & high_bit) ? unsigned_bits(-1) : high_bit;
+    return key ^ mask;
+  };
+
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_bits twiddle_out(unsigned_bits key)
+  {
+    const unsigned_bits mask = (key & high_bit) ? high_bit : unsigned_bits(-1);
+    return key ^ mask;
+  }
+};
+
+#  endif // _CCCL_DOXYGEN_INVOKED
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh
index 201360c4cd0..dedf8249a69 100644
--- a/cub/cub/warp/specializations/warp_scan_shfl.cuh
+++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh
@@ -49,6 +49,7 @@
 #include <cub/util_type.cuh>
 
 #include <cuda/ptx>
+#include <cuda/std/type_traits>
 
 CUB_NAMESPACE_BEGIN
 namespace detail
diff --git a/cub/test/catch2_radix_sort_helper.cuh b/cub/test/catch2_radix_sort_helper.cuh
index 821927a0c05..ac196617843 100644
--- a/cub/test/catch2_radix_sort_helper.cuh
+++ b/cub/test/catch2_radix_sort_helper.cuh
@@ -218,8 +218,7 @@ c2h::host_vector<KeyT> get_striped_keys(const c2h::host_vector<KeyT>& h_keys, in
   c2h::host_vector<KeyT> h_striped_keys(h_keys);
   KeyT* h_striped_keys_data = thrust::raw_pointer_cast(h_striped_keys.data());
 
-  using traits_t      = cub::Traits<KeyT>;
-  using bit_ordered_t = typename traits_t::UnsignedBits;
+  using bit_ordered_t = typename cub::key_traits<KeyT>::unsigned_bits;
 
   const int num_bits = end_bit - begin_bit;
 
@@ -237,7 +236,7 @@ c2h::host_vector<KeyT> get_striped_keys(const c2h::host_vector<KeyT>& h_keys, in
       }
     }
 
-    key = traits_t::TwiddleIn(key);
+    key = cub::key_traits<KeyT>::twiddle_in(key);
 
     if ((begin_bit > 0) || (end_bit < static_cast<int>(sizeof(KeyT) * 8)))
     {
@@ -291,8 +290,7 @@ c2h::host_vector<std::size_t> get_permutation(
   c2h::host_vector<std::size_t> h_permutation(h_keys.size());
   thrust::sequence(h_permutation.begin(), h_permutation.end());
 
-  using traits_t      = cub::Traits<KeyT>;
-  using bit_ordered_t = typename traits_t::UnsignedBits;
+  using bit_ordered_t = typename cub::key_traits<KeyT>::unsigned_bits;
 
   auto bit_ordered_striped_keys =
     reinterpret_cast<const bit_ordered_t*>(thrust::raw_pointer_cast(h_striped_keys.data()));
diff --git a/cub/test/catch2_test_block_radix_sort.cu b/cub/test/catch2_test_block_radix_sort.cu
index b952799af7d..3d859ab34e9 100644
--- a/cub/test/catch2_test_block_radix_sort.cu
+++ b/cub/test/catch2_test_block_radix_sort.cu
@@ -104,8 +104,7 @@ bool binary_equal(
 {
   d_tmp = h_reference;
 
-  using traits_t      = cub::Traits<T>;
-  using bit_ordered_t = typename traits_t::UnsignedBits;
+  using bit_ordered_t = typename cub::key_traits<T>::unsigned_bits;
 
   auto d_output_ptr    = reinterpret_cast<const bit_ordered_t*>(thrust::raw_pointer_cast(d_output.data()));
   auto d_reference_ptr = reinterpret_cast<const bit_ordered_t*>(thrust::raw_pointer_cast(d_tmp.data()));
diff --git a/cub/test/catch2_test_device_histogram.cu b/cub/test/catch2_test_device_histogram.cu
index 6f9ada49d37..dbb5efcc6ed 100644
--- a/cub/test/catch2_test_device_histogram.cu
+++ b/cub/test/catch2_test_device_histogram.cu
@@ -27,13 +27,14 @@
  ******************************************************************************/
 
 #include <cub/device/device_histogram.cuh>
+#include <cub/util_type.cuh>
 
 #include <thrust/iterator/counting_iterator.h>
 
 #include <cuda/std/__algorithm_>
 #include <cuda/std/array>
 #include <cuda/std/bit>
-#include <cuda/std/type_traits>
+#include <cuda/type_traits>
 #include <cuda/type_traits>
 
 #include <algorithm>
@@ -213,7 +214,7 @@ struct bit_and_anything
   template <typename T>
   _CCCL_HOST_DEVICE auto operator()(const T& a, const T& b) const -> T
   {
-    using U = typename cub::Traits<T>::UnsignedBits;
+    using U = typename cub::detail::unsigned_bits_t<T>;
     return ::cuda::std::bit_cast<T>(static_cast<U>(::cuda::std::bit_cast<U>(a) & ::cuda::std::bit_cast<U>(b)));
   }
 };
diff --git a/cub/test/catch2_test_device_radix_sort_keys.cu b/cub/test/catch2_test_device_radix_sort_keys.cu
index 45105cf0957..213618213f0 100644
--- a/cub/test/catch2_test_device_radix_sort_keys.cu
+++ b/cub/test/catch2_test_device_radix_sort_keys.cu
@@ -190,7 +190,7 @@ C2H_TEST("DeviceRadixSort::SortKeys: bit windows", "[keys][radix][sort][device]"
 C2H_TEST("DeviceRadixSort::SortKeys: negative zero handling", "[keys][radix][sort][device]", fp_key_types)
 {
   using key_t  = c2h::get<0, TestType>;
-  using bits_t = typename cub::Traits<key_t>::UnsignedBits;
+  using bits_t = typename cub::key_traits<key_t>::unsigned_bits;
 
   constexpr std::size_t num_bits = sizeof(key_t) * CHAR_BIT;
   const key_t positive_zero      = ::cuda::std::bit_cast<key_t>(bits_t(0));
diff --git a/cub/test/test_device_batch_copy.cu b/cub/test/test_device_batch_copy.cu
index 70da5de9d08..9340e59e771 100644
--- a/cub/test/test_device_batch_copy.cu
+++ b/cub/test/test_device_batch_copy.cu
@@ -55,9 +55,9 @@ template <typename T>
 void GenerateRandomData(
   T* rand_out,
   const std::size_t num_items,
-  const T min_rand_val                                         = ::cuda::std::numeric_limits<T>::min(),
-  const T max_rand_val                                         = ::cuda::std::numeric_limits<T>::max(),
-  const std::uint_fast32_t seed                                = 320981U,
+  const T min_rand_val          = ::cuda::std::numeric_limits<T>::min(),
+  const T max_rand_val          = ::cuda::std::numeric_limits<T>::max(),
+  const std::uint_fast32_t seed = 320981U,
   std::enable_if_t<std::is_integral_v<T> && (sizeof(T) >= 2)>* = nullptr)
 {
   // initialize random number generator
diff --git a/cub/test/test_util.h b/cub/test/test_util.h
index 88552be32ac..e153d3d620f 100644
--- a/cub/test/test_util.h
+++ b/cub/test/test_util.h
@@ -580,7 +580,7 @@ __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T& value, s
       case RANDOM_MINUS_PLUS_ZERO: {
         // Replace roughly 1/128 of values with -0.0 or +0.0, and
         // generate the rest randomly
-        using UnsignedBits = typename CUB_NS_QUALIFIER::Traits<T>::UnsignedBits;
+        using UnsignedBits = CUB_NS_QUALIFIER::detail::unsigned_bits_t<T>;
         char c;
         RandomBits(c);
         if (c == 0)
diff --git a/libcudacxx/include/cuda/std/__utility/typeid.h b/libcudacxx/include/cuda/std/__utility/typeid.h
index ff6b6e32e31..11103834373 100644
--- a/libcudacxx/include/cuda/std/__utility/typeid.h
+++ b/libcudacxx/include/cuda/std/__utility/typeid.h
@@ -40,7 +40,6 @@
 #ifndef _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR
 #  include <cuda/std/compare>
 #endif
-#include <cuda/std/__cccl/preprocessor.h>
 #include <cuda/std/__string/string_view.h>
 #include <cuda/std/__type_traits/enable_if.h>
 #include <cuda/std/__type_traits/integral_constant.h>