diff --git a/cub/benchmarks/bench/radix_sort/keys.cu b/cub/benchmarks/bench/radix_sort/keys.cu
index 20e8a3e2253..f3b7ba38675 100644
--- a/cub/benchmarks/bench/radix_sort/keys.cu
+++ b/cub/benchmarks/bench/radix_sort/keys.cu
@@ -27,6 +27,8 @@
 
 #include <cub/device/device_radix_sort.cuh>
 
+#include <cuda/std/type_traits>
+
 #include <nvbench_helper.cuh>
 
 // %//RANGE//% TUNE_RADIX_BITS bits 8:9:1
@@ -46,7 +48,7 @@ struct policy_hub_t
 {
   static constexpr bool KEYS_ONLY = std::is_same<ValueT, cub::NullType>::value;
 
-  using DominantT = cub::detail::conditional_t<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>;
+  using DominantT = ::cuda::std::_If<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>;
 
   struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
   {
diff --git a/cub/benchmarks/bench/radix_sort/pairs.cu b/cub/benchmarks/bench/radix_sort/pairs.cu
index 074a35b9a2a..2729ce1b623 100644
--- a/cub/benchmarks/bench/radix_sort/pairs.cu
+++ b/cub/benchmarks/bench/radix_sort/pairs.cu
@@ -27,6 +27,8 @@
 
 #include <cub/device/device_radix_sort.cuh>
 
+#include <cuda/std/type_traits>
+
 #include <nvbench_helper.cuh>
 
 // %//RANGE//% TUNE_RADIX_BITS bits 8:9:1
@@ -44,7 +46,7 @@ struct policy_hub_t
 {
   static constexpr bool KEYS_ONLY = std::is_same<ValueT, cub::NullType>::value;
 
-  using DominantT = cub::detail::conditional_t<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>;
+  using DominantT = ::cuda::std::_If<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>;
 
   struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
   {
diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh
index 84385d6376d..ce204273da8 100644
--- a/cub/cub/agent/agent_histogram.cuh
+++ b/cub/cub/agent/agent_histogram.cuh
@@ -49,6 +49,8 @@
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/type_traits>
+
 #include <iterator>
 
 CUB_NAMESPACE_BEGIN
@@ -225,9 +227,9 @@ struct AgentHistogram
   // Wrap the native input pointer with CacheModifiedInputIterator
   // or directly use the supplied input iterator type
   using WrappedSampleIteratorT =
-    cub::detail::conditional_t<std::is_pointer<SampleIteratorT>::value,
-                               CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,
-                               SampleIteratorT>;
+    ::cuda::std::_If<std::is_pointer<SampleIteratorT>::value,
+                     CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,
+                     SampleIteratorT>;
 
   /// Pixel input iterator type (for applying cache modifier)
   using WrappedPixelIteratorT = CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>;
diff --git a/cub/cub/agent/agent_radix_sort_onesweep.cuh b/cub/cub/agent/agent_radix_sort_onesweep.cuh
index ff74b6be251..a78ee66c7b2 100644
--- a/cub/cub/agent/agent_radix_sort_onesweep.cuh
+++ b/cub/cub/agent/agent_radix_sort_onesweep.cuh
@@ -49,6 +49,8 @@
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/type_traits>
+
 CUB_NAMESPACE_BEGIN
 
 /** \brief cub::RadixSortStoreAlgorithm enumerates different algorithms to write
@@ -146,10 +148,10 @@ struct AgentRadixSortOnesweep
                   || RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR,
                 "for onesweep agent, the ranking algorithm must warp-strided key arrangement");
 
-  using BlockRadixRankT = cub::detail::conditional_t<
+  using BlockRadixRankT = ::cuda::std::_If<
     RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR,
     BlockRadixRankMatchEarlyCounts<BLOCK_THREADS, RADIX_BITS, false, SCAN_ALGORITHM, WARP_MATCH_ATOMIC_OR, RANK_NUM_PARTS>,
-    cub::detail::conditional_t<
+    ::cuda::std::_If<
       RANK_ALGORITHM == RADIX_RANK_MATCH,
       BlockRadixRankMatch<BLOCK_THREADS, RADIX_BITS, false, SCAN_ALGORITHM>,
       BlockRadixRankMatchEarlyCounts<BLOCK_THREADS, RADIX_BITS, false, SCAN_ALGORITHM, WARP_MATCH_ANY, RANK_NUM_PARTS>>>;
diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh
index a796c7dd153..3492bd5f41d 100644
--- a/cub/cub/agent/agent_reduce.cuh
+++ b/cub/cub/agent/agent_reduce.cuh
@@ -51,6 +51,8 @@
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/type_traits>
+
 #include <iterator>
 
 _CCCL_SUPPRESS_DEPRECATED_PUSH
@@ -145,9 +147,9 @@ struct AgentReduce
   // Wrap the native input pointer with CacheModifiedInputIterator
   // or directly use the supplied input iterator type
   using WrappedInputIteratorT =
-    cub::detail::conditional_t<std::is_pointer<InputIteratorT>::value,
-                               CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,
-                               InputIteratorT>;
+    ::cuda::std::_If<std::is_pointer<InputIteratorT>::value,
+                     CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,
+                     InputIteratorT>;
 
   /// Constants
   static constexpr int BLOCK_THREADS      = AgentReducePolicy::BLOCK_THREADS;
diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh
index a1ff251e621..7e14b793db9 100644
--- a/cub/cub/agent/agent_reduce_by_key.cuh
+++ b/cub/cub/agent/agent_reduce_by_key.cuh
@@ -51,6 +51,8 @@
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/iterator/constant_input_iterator.cuh>
 
+#include <cuda/std/type_traits>
+
 #include <iterator>
 
 CUB_NAMESPACE_BEGIN
@@ -225,27 +227,27 @@ struct AgentReduceByKey
   // CacheModifiedValuesInputIterator or directly use the supplied input
   // iterator type
   using WrappedKeysInputIteratorT =
-    cub::detail::conditional_t<std::is_pointer<KeysInputIteratorT>::value,
-                               CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, KeyInputT, OffsetT>,
-                               KeysInputIteratorT>;
+    ::cuda::std::_If<std::is_pointer<KeysInputIteratorT>::value,
+                     CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, KeyInputT, OffsetT>,
+                     KeysInputIteratorT>;
 
   // Cache-modified Input iterator wrapper type (for applying cache modifier)
   // for values Wrap the native input pointer with
   // CacheModifiedValuesInputIterator or directly use the supplied input
   // iterator type
   using WrappedValuesInputIteratorT =
-    cub::detail::conditional_t<std::is_pointer<ValuesInputIteratorT>::value,
-                               CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,
-                               ValuesInputIteratorT>;
+    ::cuda::std::_If<std::is_pointer<ValuesInputIteratorT>::value,
+                     CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,
+                     ValuesInputIteratorT>;
 
   // Cache-modified Input iterator wrapper type (for applying cache modifier)
   // for fixup values Wrap the native input pointer with
   // CacheModifiedValuesInputIterator or directly use the supplied input
   // iterator type
   using WrappedFixupInputIteratorT =
-    cub::detail::conditional_t<std::is_pointer<AggregatesOutputIteratorT>::value,
-                               CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,
-                               AggregatesOutputIteratorT>;
+    ::cuda::std::_If<std::is_pointer<AggregatesOutputIteratorT>::value,
+                     CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,
+                     AggregatesOutputIteratorT>;
 
   // Reduce-value-by-segment scan operator
   using ReduceBySegmentOpT = ReduceBySegmentOp<ReductionOpT>;
diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh
index 08723868088..c498f1737d4 100644
--- a/cub/cub/agent/agent_rle.cuh
+++ b/cub/cub/agent/agent_rle.cuh
@@ -54,6 +54,8 @@
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/iterator/constant_input_iterator.cuh>
 
+#include <cuda/std/type_traits>
+
 #include <iterator>
 
 CUB_NAMESPACE_BEGIN
@@ -231,9 +233,9 @@ struct AgentRle
   // Wrap the native input pointer with CacheModifiedVLengthnputIterator
   // Directly use the supplied input iterator type
   using WrappedInputIteratorT =
-    cub::detail::conditional_t<std::is_pointer<InputIteratorT>::value,
-                               CacheModifiedInputIterator<AgentRlePolicyT::LOAD_MODIFIER, T, OffsetT>,
-                               InputIteratorT>;
+    ::cuda::std::_If<std::is_pointer<InputIteratorT>::value,
+                     CacheModifiedInputIterator<AgentRlePolicyT::LOAD_MODIFIER, T, OffsetT>,
+                     InputIteratorT>;
 
   // Parameterized BlockLoad type for data
   using BlockLoadT =
@@ -257,7 +259,7 @@ struct AgentRle
   using WarpExchangePairs = WarpExchange<LengthOffsetPair, ITEMS_PER_THREAD>;
 
   using WarpExchangePairsStorage =
-    cub::detail::conditional_t<STORE_WARP_TIME_SLICING, typename WarpExchangePairs::TempStorage, NullType>;
+    ::cuda::std::_If<STORE_WARP_TIME_SLICING, typename WarpExchangePairs::TempStorage, NullType>;
 
   using WarpExchangeOffsets = WarpExchange<OffsetT, ITEMS_PER_THREAD>;
   using WarpExchangeLengths = WarpExchange<LengthT, ITEMS_PER_THREAD>;
diff --git a/cub/cub/agent/agent_scan.cuh b/cub/cub/agent/agent_scan.cuh
index d7bbab4dbad..7da0fec7cdd 100644
--- a/cub/cub/agent/agent_scan.cuh
+++ b/cub/cub/agent/agent_scan.cuh
@@ -50,6 +50,8 @@
 #include <cub/grid/grid_queue.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 
+#include <cuda/std/type_traits>
+
 #include <iterator>
 
 CUB_NAMESPACE_BEGIN
@@ -157,9 +159,9 @@ struct AgentScan
   // Wrap the native input pointer with CacheModifiedInputIterator
   // or directly use the supplied input iterator type
   using WrappedInputIteratorT =
-    cub::detail::conditional_t<std::is_pointer<InputIteratorT>::value,
-                               CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>,
-                               InputIteratorT>;
+    ::cuda::std::_If<std::is_pointer<InputIteratorT>::value,
+                     CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>,
+                     InputIteratorT>;
 
   // Constants
   enum
diff --git a/cub/cub/agent/agent_scan_by_key.cuh b/cub/cub/agent/agent_scan_by_key.cuh
index 42114bf5dd5..306ef9006c4 100644
--- a/cub/cub/agent/agent_scan_by_key.cuh
+++ b/cub/cub/agent/agent_scan_by_key.cuh
@@ -50,6 +50,8 @@
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/type_traits>
+
 #include <iterator>
 
 CUB_NAMESPACE_BEGIN
@@ -152,14 +154,14 @@ struct AgentScanByKey
   static constexpr int ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD;
 
   using WrappedKeysInputIteratorT =
-    cub::detail::conditional_t<std::is_pointer<KeysInputIteratorT>::value,
-                               CacheModifiedInputIterator<AgentScanByKeyPolicyT::LOAD_MODIFIER, KeyT, OffsetT>,
-                               KeysInputIteratorT>;
+    ::cuda::std::_If<std::is_pointer<KeysInputIteratorT>::value,
+                     CacheModifiedInputIterator<AgentScanByKeyPolicyT::LOAD_MODIFIER, KeyT, OffsetT>,
+                     KeysInputIteratorT>;
 
   using WrappedValuesInputIteratorT =
-    cub::detail::conditional_t<std::is_pointer<ValuesInputIteratorT>::value,
-                               CacheModifiedInputIterator<AgentScanByKeyPolicyT::LOAD_MODIFIER, InputT, OffsetT>,
-                               ValuesInputIteratorT>;
+    ::cuda::std::_If<std::is_pointer<ValuesInputIteratorT>::value,
+                     CacheModifiedInputIterator<AgentScanByKeyPolicyT::LOAD_MODIFIER, InputT, OffsetT>,
+                     ValuesInputIteratorT>;
 
   using BlockLoadKeysT = BlockLoad<KeyT, BLOCK_THREADS, ITEMS_PER_THREAD, AgentScanByKeyPolicyT::LOAD_ALGORITHM>;
 
diff --git a/cub/cub/agent/agent_segment_fixup.cuh b/cub/cub/agent/agent_segment_fixup.cuh
index 49924ee2ce7..4f01df6a09f 100644
--- a/cub/cub/agent/agent_segment_fixup.cuh
+++ b/cub/cub/agent/agent_segment_fixup.cuh
@@ -52,6 +52,8 @@
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/iterator/constant_input_iterator.cuh>
 
+#include <cuda/std/type_traits>
+
 #include <iterator>
 
 CUB_NAMESPACE_BEGIN
@@ -171,18 +173,18 @@ struct AgentSegmentFixup
   // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
   // Wrap the native input pointer with CacheModifiedValuesInputIterator
   // or directly use the supplied input iterator type
-  using WrappedPairsInputIteratorT = cub::detail::conditional_t<
-    std::is_pointer<PairsInputIteratorT>::value,
-    CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, KeyValuePairT, OffsetT>,
-    PairsInputIteratorT>;
+  using WrappedPairsInputIteratorT =
+    ::cuda::std::_If<std::is_pointer<PairsInputIteratorT>::value,
+                     CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, KeyValuePairT, OffsetT>,
+                     PairsInputIteratorT>;
 
   // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
   // Wrap the native input pointer with CacheModifiedValuesInputIterator
   // or directly use the supplied input iterator type
   using WrappedFixupInputIteratorT =
-    cub::detail::conditional_t<std::is_pointer<AggregatesOutputIteratorT>::value,
-                               CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, ValueT, OffsetT>,
-                               AggregatesOutputIteratorT>;
+    ::cuda::std::_If<std::is_pointer<AggregatesOutputIteratorT>::value,
+                     CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, ValueT, OffsetT>,
+                     AggregatesOutputIteratorT>;
 
   // Reduce-value-by-segment scan operator
   using ReduceBySegmentOpT = ReduceByKeyOp<cub::Sum>;
diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index a48fa175807..ab9c982dca7 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -219,17 +219,17 @@ struct AgentSelectIf
   // Wrap the native input pointer with CacheModifiedValuesInputIterator
   // or directly use the supplied input iterator type
   using WrappedInputIteratorT =
-    cub::detail::conditional_t<::cuda::std::is_pointer<InputIteratorT>::value,
-                               CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, InputT, OffsetT>,
-                               InputIteratorT>;
+    ::cuda::std::_If<::cuda::std::is_pointer<InputIteratorT>::value,
+                     CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, InputT, OffsetT>,
+                     InputIteratorT>;
 
   // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
   // Wrap the native input pointer with CacheModifiedValuesInputIterator
   // or directly use the supplied input iterator type
   using WrappedFlagsInputIteratorT =
-    cub::detail::conditional_t<::cuda::std::is_pointer<FlagsInputIteratorT>::value,
-                               CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, FlagT, OffsetT>,
-                               FlagsInputIteratorT>;
+    ::cuda::std::_If<::cuda::std::is_pointer<FlagsInputIteratorT>::value,
+                     CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, FlagT, OffsetT>,
+                     FlagsInputIteratorT>;
 
   // Parameterized BlockLoad type for input data
   using BlockLoadT = BlockLoad<InputT, BLOCK_THREADS, ITEMS_PER_THREAD, AgentSelectIfPolicyT::LOAD_ALGORITHM>;
diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh
index 3853a059272..a392359537f 100644
--- a/cub/cub/agent/agent_spmv_orig.cuh
+++ b/cub/cub/agent/agent_spmv_orig.cuh
@@ -52,6 +52,8 @@
 #include <cub/thread/thread_search.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/type_traits>
+
 #include <iterator>
 
 CUB_NAMESPACE_BEGIN
@@ -264,7 +266,7 @@ struct AgentSpmv
   {
     // Value type to pair with index type OffsetT
     // (NullType if loading values directly during merge)
-    using MergeValueT = cub::detail::conditional_t<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS, NullType, ValueT>;
+    using MergeValueT = ::cuda::std::_If<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS, NullType, ValueT>;
 
     OffsetT row_end_offset;
     MergeValueT nonzero;
diff --git a/cub/cub/agent/agent_three_way_partition.cuh b/cub/cub/agent/agent_three_way_partition.cuh
index 85933fa5aa4..0c0556ffe79 100644
--- a/cub/cub/agent/agent_three_way_partition.cuh
+++ b/cub/cub/agent/agent_three_way_partition.cuh
@@ -197,9 +197,9 @@ struct AgentThreeWayPartition
   static constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;
 
   using WrappedInputIteratorT =
-    cub::detail::conditional_t<std::is_pointer<InputIteratorT>::value,
-                               cub::CacheModifiedInputIterator<PolicyT::LOAD_MODIFIER, InputT, OffsetT>,
-                               InputIteratorT>;
+    ::cuda::std::_If<std::is_pointer<InputIteratorT>::value,
+                     cub::CacheModifiedInputIterator<PolicyT::LOAD_MODIFIER, InputT, OffsetT>,
+                     InputIteratorT>;
 
   // Parameterized BlockLoad type for input data
   using BlockLoadT = cub::BlockLoad<InputT, BLOCK_THREADS, ITEMS_PER_THREAD, PolicyT::LOAD_ALGORITHM>;
diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh
index 67ffb965017..312b7ac98c5 100644
--- a/cub/cub/agent/single_pass_scan_operators.cuh
+++ b/cub/cub/agent/single_pass_scan_operators.cuh
@@ -51,6 +51,8 @@
 #include <cub/util_temporary_storage.cuh>
 #include <cub/warp/warp_reduce.cuh>
 
+#include <cuda/std/type_traits>
+
 #include <iterator>
 
 #include <nv/target>
@@ -476,16 +478,16 @@ using default_no_delay_t             = default_no_delay_constructor_t::delay_t;
 
 template <class T>
 using default_delay_constructor_t =
-  cub::detail::conditional_t<Traits<T>::PRIMITIVE, fixed_delay_constructor_t<350, 450>, default_no_delay_constructor_t>;
+  ::cuda::std::_If<Traits<T>::PRIMITIVE, fixed_delay_constructor_t<350, 450>, default_no_delay_constructor_t>;
 
 template <class T>
 using default_delay_t = typename default_delay_constructor_t<T>::delay_t;
 
 template <class KeyT, class ValueT>
 using default_reduce_by_key_delay_constructor_t =
-  detail::conditional_t<(Traits<ValueT>::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16),
-                        reduce_by_key_delay_constructor_t<350, 450>,
-                        default_delay_constructor_t<KeyValuePair<KeyT, ValueT>>>;
+  ::cuda::std::_If<(Traits<ValueT>::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16),
+                   reduce_by_key_delay_constructor_t<350, 450>,
+                   default_delay_constructor_t<KeyValuePair<KeyT, ValueT>>>;
 } // namespace detail
 
 /**
@@ -503,16 +505,13 @@ template <typename T>
 struct ScanTileState<T, true>
 {
   // Status word type
-  using StatusWord = cub::detail::conditional_t<
+  using StatusWord = ::cuda::std::_If<
     sizeof(T) == 8,
     unsigned long long,
-    cub::detail::conditional_t<sizeof(T) == 4,
-                               unsigned int,
-                               cub::detail::conditional_t<sizeof(T) == 2, unsigned short, unsigned char>>>;
+    ::cuda::std::_If<sizeof(T) == 4, unsigned int, ::cuda::std::_If<sizeof(T) == 2, unsigned short, unsigned char>>>;
 
   // Unit word type
-  using TxnWord = cub::detail::
-    conditional_t<sizeof(T) == 8, ulonglong2, cub::detail::conditional_t<sizeof(T) == 4, uint2, unsigned int>>;
+  using TxnWord = ::cuda::std::_If<sizeof(T) == 8, ulonglong2, ::cuda::std::_If<sizeof(T) == 4, uint2, unsigned int>>;
 
   // Device word type
   struct TileDescriptor
@@ -889,18 +888,15 @@ struct ReduceByKeyScanTileState<ValueT, KeyT, true>
   };
 
   // Status word type
-  using StatusWord = cub::detail::conditional_t<
+  using StatusWord = ::cuda::std::_If<
     STATUS_WORD_SIZE == 8,
     unsigned long long,
-    cub::detail::conditional_t<STATUS_WORD_SIZE == 4,
-                               unsigned int,
-                               cub::detail::conditional_t<STATUS_WORD_SIZE == 2, unsigned short, unsigned char>>>;
+    ::cuda::std::
+      _If<STATUS_WORD_SIZE == 4, unsigned int, ::cuda::std::_If<STATUS_WORD_SIZE == 2, unsigned short, unsigned char>>>;
 
   // Status word type
-  using TxnWord =
-    cub::detail::conditional_t<TXN_WORD_SIZE == 16,
-                               ulonglong2,
-                               cub::detail::conditional_t<TXN_WORD_SIZE == 8, unsigned long long, unsigned int>>;
+  using TxnWord = ::cuda::std::
+    _If<TXN_WORD_SIZE == 16, ulonglong2, ::cuda::std::_If<TXN_WORD_SIZE == 8, unsigned long long, unsigned int>>;
 
   // Device word type (for when sizeof(ValueT) == sizeof(KeyT))
   struct TileDescriptorBigStatus
@@ -920,7 +916,7 @@ struct ReduceByKeyScanTileState<ValueT, KeyT, true>
 
   // Device word type
   using TileDescriptor =
-    cub::detail::conditional_t<sizeof(ValueT) == sizeof(KeyT), TileDescriptorBigStatus, TileDescriptorLittleStatus>;
+    ::cuda::std::_If<sizeof(ValueT) == sizeof(KeyT), TileDescriptorBigStatus, TileDescriptorLittleStatus>;
 
   // Device storage
   TxnWord* d_tile_descriptors;
diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh
index 649f2a563f1..709e9c1bd07 100644
--- a/cub/cub/block/block_adjacent_difference.cuh
+++ b/cub/cub/block/block_adjacent_difference.cuh
@@ -143,7 +143,7 @@ private:
   }
 
   /// Specialization for when FlagOp has third index param
-  template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
+  template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::value>
   struct ApplyOp
   {
     // Apply flag operator
diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh
index 6ec1e05616b..95fe29df5d1 100644
--- a/cub/cub/block/block_discontinuity.cuh
+++ b/cub/cub/block/block_discontinuity.cuh
@@ -149,7 +149,7 @@ private:
   }
 
   /// Specialization for when FlagOp has third index param
-  template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
+  template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::value>
   struct ApplyOp
   {
     // Apply flag operator
diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh
index 864b4df9cb1..3553ec79da6 100644
--- a/cub/cub/block/block_histogram.cuh
+++ b/cub/cub/block/block_histogram.cuh
@@ -48,6 +48,8 @@
 #include <cub/block/specializations/block_histogram_sort.cuh>
 #include <cub/util_ptx.cuh>
 
+#include <cuda/std/type_traits>
+
 CUB_NAMESPACE_BEGIN
 
 //! @brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of
@@ -199,9 +201,9 @@ private:
 
   /// Internal specialization.
   using InternalBlockHistogram =
-    cub::detail::conditional_t<ALGORITHM == BLOCK_HISTO_SORT,
-                               BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z>,
-                               BlockHistogramAtomic<BINS>>;
+    ::cuda::std::_If<ALGORITHM == BLOCK_HISTO_SORT,
+                     BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z>,
+                     BlockHistogramAtomic<BINS>>;
 
   /// Shared memory storage layout type for BlockHistogram
   using _TempStorage = typename InternalBlockHistogram::TempStorage;
diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh
index 23ad226cef0..c91731ae033 100644
--- a/cub/cub/block/block_radix_rank.cuh
+++ b/cub/cub/block/block_radix_rank.cuh
@@ -221,7 +221,7 @@ private:
 
   // Integer type for packing DigitCounters into columns of shared memory banks
   using PackedCounter =
-    cub::detail::conditional_t<SMEM_CONFIG == cudaSharedMemBankSizeEightByte, unsigned long long, unsigned int>;
+    ::cuda::std::_If<SMEM_CONFIG == cudaSharedMemBankSizeEightByte, unsigned long long, unsigned int>;
 
   static constexpr DigitCounter max_tile_size = ::cuda::std::numeric_limits<DigitCounter>::max();
 
@@ -1195,16 +1195,16 @@ namespace detail
 // - Support multi-dimensional thread blocks in the rest of implementations
 // - Repurpose BlockRadixRank as an entry name with the algorithm template parameter
 template <RadixRankAlgorithm RankAlgorithm, int BlockDimX, int RadixBits, bool IsDescending, BlockScanAlgorithm ScanAlgorithm>
-using block_radix_rank_t = cub::detail::conditional_t<
+using block_radix_rank_t = ::cuda::std::_If<
   RankAlgorithm == RADIX_RANK_BASIC,
   BlockRadixRank<BlockDimX, RadixBits, IsDescending, false, ScanAlgorithm>,
-  cub::detail::conditional_t<
+  ::cuda::std::_If<
     RankAlgorithm == RADIX_RANK_MEMOIZE,
     BlockRadixRank<BlockDimX, RadixBits, IsDescending, true, ScanAlgorithm>,
-    cub::detail::conditional_t<
+    ::cuda::std::_If<
       RankAlgorithm == RADIX_RANK_MATCH,
       BlockRadixRankMatch<BlockDimX, RadixBits, IsDescending, ScanAlgorithm>,
-      cub::detail::conditional_t<
+      ::cuda::std::_If<
         RankAlgorithm == RADIX_RANK_MATCH_EARLY_COUNTS_ANY,
         BlockRadixRankMatchEarlyCounts<BlockDimX, RadixBits, IsDescending, ScanAlgorithm, WARP_MATCH_ANY>,
         BlockRadixRankMatchEarlyCounts<BlockDimX, RadixBits, IsDescending, ScanAlgorithm, WARP_MATCH_ATOMIC_OR>>>>>;
diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh
index 0b2c1c53e0b..d35c90c06d4 100644
--- a/cub/cub/block/block_reduce.cuh
+++ b/cub/cub/block/block_reduce.cuh
@@ -48,6 +48,8 @@
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/type_traits>
+
 CUB_NAMESPACE_BEGIN
 
 /******************************************************************************
@@ -253,11 +255,11 @@ private:
 
   /// Internal specialization type
   using InternalBlockReduce =
-    cub::detail::conditional_t<ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS,
-                               WarpReductions,
-                               cub::detail::conditional_t<ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
-                                                          RakingCommutativeOnly,
-                                                          Raking>>; // BlockReduceRaking
+    ::cuda::std::_If<ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS,
+                     WarpReductions,
+                     ::cuda::std::_If<ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
+                                      RakingCommutativeOnly,
+                                      Raking>>; // BlockReduceRaking
 
   /// Shared memory storage layout type for BlockReduce
   using _TempStorage = typename InternalBlockReduce::TempStorage;
diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh
index adfba7dada6..a06b7c185fb 100644
--- a/cub/cub/block/block_scan.cuh
+++ b/cub/cub/block/block_scan.cuh
@@ -46,6 +46,8 @@
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/type_traits>
+
 CUB_NAMESPACE_BEGIN
 
 /******************************************************************************
@@ -252,7 +254,7 @@ private:
     BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;
 
   /// Define the delegate type for the desired algorithm
-  using InternalBlockScan = cub::detail::conditional_t<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;
+  using InternalBlockScan = ::cuda::std::_If<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;
 
   /// Shared memory storage layout type for BlockScan
   using _TempStorage = typename InternalBlockScan::TempStorage;
diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
index 57969548a2f..93c93e4c489 100644
--- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
+++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -454,11 +454,11 @@ struct DispatchBatchMemcpy : SelectedPolicy
     // The number of thread blocks (or tiles) required to process all of the given buffers
     BlockOffsetT num_tiles = DivideAndRoundUp(num_buffers, TILE_SIZE);
 
-    using BlevBufferSrcsOutT    = cub::detail::conditional_t<IsMemcpy, void*, cub::detail::value_t<InputBufferIt>>;
-    using BlevBufferDstOutT     = cub::detail::conditional_t<IsMemcpy, void*, cub::detail::value_t<OutputBufferIt>>;
-    using BlevBufferSrcsOutItT  = BlevBufferSrcsOutT*;
-    using BlevBufferDstsOutItT  = BlevBufferDstOutT*;
-    using BlevBufferSizesOutItT = BufferSizeT*;
+    using BlevBufferSrcsOutT          = ::cuda::std::_If<IsMemcpy, void*, cub::detail::value_t<InputBufferIt>>;
+    using BlevBufferDstOutT           = ::cuda::std::_If<IsMemcpy, void*, cub::detail::value_t<OutputBufferIt>>;
+    using BlevBufferSrcsOutItT        = BlevBufferSrcsOutT*;
+    using BlevBufferDstsOutItT        = BlevBufferDstOutT*;
+    using BlevBufferSizesOutItT       = BufferSizeT*;
     using BlevBufferTileOffsetsOutItT = BlockOffsetT*;
 
     temporary_storage::layout<MEM_NUM_ALLOCATIONS> temporary_storage_layout;
diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh
index 0574305cc10..ad16b68e57a 100644
--- a/cub/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -611,9 +611,9 @@ public:
       // Wrap the native input pointer with CacheModifiedInputIterator
       // or Directly use the supplied input iterator type
       using WrappedLevelIteratorT =
-        cub::detail::conditional_t<std::is_pointer<LevelIteratorT>::value,
-                                   CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>,
-                                   LevelIteratorT>;
+        ::cuda::std::_If<std::is_pointer<LevelIteratorT>::value,
+                         CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>,
+                         LevelIteratorT>;
 
       WrappedLevelIteratorT wrapped_levels(d_levels);
 
@@ -647,11 +647,11 @@ public:
     // rule: 2^l * 2^r = 2^(l + r) to determine a sufficiently large type to hold the
     // multiplication result.
     // If CommonT used to be a 128-bit wide integral type already, we use CommonT's arithmetic
-    using IntArithmeticT = cub::detail::conditional_t< //
+    using IntArithmeticT = ::cuda::std::_If< //
       sizeof(SampleT) + sizeof(CommonT) <= sizeof(uint32_t), //
       uint32_t, //
 #if CUB_IS_INT128_ENABLED
-      cub::detail::conditional_t< //
+      ::cuda::std::_If< //
         (::cuda::std::is_same<CommonT, __int128_t>::value || //
          ::cuda::std::is_same<CommonT, __uint128_t>::value), //
         CommonT, //
@@ -665,10 +665,11 @@ public:
     template <typename T>
     using is_integral_excl_int128 =
 #if CUB_IS_INT128_ENABLED
-      cub::detail::conditional_t<
-        ::cuda::std::is_same<T, __int128_t>::value&& ::cuda::std::is_same<T, __uint128_t>::value,
-        ::cuda::std::false_type,
-        ::cuda::std::is_integral<T>>;
+      // A type equals at most one of __int128_t / __uint128_t, so the two checks
+      // must be OR-ed for the 128-bit integer types to actually be excluded.
+      ::cuda::std::_If<::cuda::std::is_same<T, __int128_t>::value || ::cuda::std::is_same<T, __uint128_t>::value,
+                       ::cuda::std::false_type,
+                       ::cuda::std::is_integral<T>>;
 #else
       ::cuda::std::is_integral<T>;
 #endif
diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
index 8bb025fb687..11939b632c7 100644
--- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
@@ -47,6 +47,8 @@
 #include <thrust/detail/integer_math.h>
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
+#include <cuda/std/type_traits>
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail
@@ -130,10 +132,10 @@ private:
     (max_default_size > max_smem_per_block) && (max_fallback_size <= max_smem_per_block);
 
 public:
-  using policy_t = cub::detail::conditional_t<uses_fallback_policy, fallback_policy_t, DefaultPolicyT>;
+  using policy_t = ::cuda::std::_If<uses_fallback_policy, fallback_policy_t, DefaultPolicyT>;
   using block_sort_agent_t =
-    cub::detail::conditional_t<uses_fallback_policy, fallback_block_sort_agent_t, default_block_sort_agent_t>;
-  using merge_agent_t = cub::detail::conditional_t<uses_fallback_policy, fallback_merge_agent_t, default_merge_agent_t>;
+    ::cuda::std::_If<uses_fallback_policy, fallback_block_sort_agent_t, default_block_sort_agent_t>;
+  using merge_agent_t = ::cuda::std::_If<uses_fallback_policy, fallback_merge_agent_t, default_merge_agent_t>;
 };
 } // namespace detail
 
diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
index 1ecc1240a0c..fc0d8b8c225 100644
--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -59,6 +59,8 @@
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
+#include <cuda/std/type_traits>
+
 #include <iterator>
 
 #include <stdio.h>
@@ -131,14 +133,14 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp
     DecomposerT decomposer = {})
 {
   using ActiveUpsweepPolicyT =
-    cub::detail::conditional_t<ALT_DIGIT_BITS,
-                               typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
-                               typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>;
+    ::cuda::std::_If<ALT_DIGIT_BITS,
+                     typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
+                     typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>;
 
   using ActiveDownsweepPolicyT =
-    cub::detail::conditional_t<ALT_DIGIT_BITS,
-                               typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
-                               typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>;
+    ::cuda::std::_If<ALT_DIGIT_BITS,
+                     typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
+                     typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>;
 
   enum
   {
@@ -284,14 +286,14 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo
     DecomposerT decomposer = {})
 {
   using ActiveUpsweepPolicyT =
-    cub::detail::conditional_t<ALT_DIGIT_BITS,
-                               typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
-                               typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>;
+    ::cuda::std::_If<ALT_DIGIT_BITS,
+                     typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
+                     typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>;
 
   using ActiveDownsweepPolicyT =
-    cub::detail::conditional_t<ALT_DIGIT_BITS,
-                               typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
-                               typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>;
+    ::cuda::std::_If<ALT_DIGIT_BITS,
+                     typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
+                     typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>;
 
   enum
   {
@@ -547,9 +549,9 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen
   //
 
   using SegmentedPolicyT =
-    cub::detail::conditional_t<ALT_DIGIT_BITS,
-                               typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy,
-                               typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>;
+    ::cuda::std::_If<ALT_DIGIT_BITS,
+                     typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy,
+                     typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>;
 
   enum
   {
@@ -892,7 +894,7 @@ struct DeviceRadixSortPolicy
   static constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
 
   // Dominant-sized key/value type
-  using DominantT = cub::detail::conditional_t<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>;
+  using DominantT = ::cuda::std::_If<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>;
 
   //------------------------------------------------------------------------------
   // Architecture-specific tuning policies
@@ -963,9 +965,9 @@ struct DeviceRadixSortPolicy
       PRIMARY_RADIX_BITS - 1>;
 
     // Downsweep policies
-    using DownsweepPolicy = cub::detail::conditional_t<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>;
+    using DownsweepPolicy = ::cuda::std::_If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>;
 
-    using AltDownsweepPolicy = cub::detail::conditional_t<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>;
+    using AltDownsweepPolicy = ::cuda::std::_If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>;
 
     // Upsweep policies
     using UpsweepPolicy    = DownsweepPolicy;
@@ -1575,7 +1577,7 @@ struct DeviceRadixSortPolicy
       ONESWEEP_RADIX_BITS>;
 
     using OnesweepLargeKeyPolicy = //
-      cub::detail::conditional_t<sizeof(KeyT) == 4, OnesweepPolicyKey32, OnesweepPolicyKey64>;
+      ::cuda::std::_If<sizeof(KeyT) == 4, OnesweepPolicyKey32, OnesweepPolicyKey64>;
 
     using OnesweepSmallKeyPolicySizes = //
       detail::radix::sm90_small_key_tuning<sizeof(KeyT), KEYS_ONLY ? 0 : sizeof(ValueT), sizeof(OffsetT)>;
@@ -1589,9 +1591,9 @@ struct DeviceRadixSortPolicy
       RADIX_SORT_STORE_DIRECT,
       8>;
     using OnesweepPolicy = //
-      cub::detail::conditional_t<sizeof(KeyT) < 4, //
-                                 OnesweepSmallKeyPolicy, //
-                                 OnesweepLargeKeyPolicy>;
+      ::cuda::std::_If<sizeof(KeyT) < 4, //
+                       OnesweepSmallKeyPolicy, //
+                       OnesweepLargeKeyPolicy>;
 
     using ScanPolicy =
       AgentScanPolicy<512,
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index b1e31c53a12..346bda4c286 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -56,6 +56,8 @@
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
+#include <cuda/std/type_traits>
+
 #include <iterator>
 
 CUB_NAMESPACE_BEGIN
@@ -228,9 +230,9 @@ template <typename InputIteratorT,
           typename InitValueT,
           typename OffsetT,
           typename AccumT         = detail::accumulator_t<ScanOpT,
-                                                  cub::detail::conditional_t<std::is_same<InitValueT, NullType>::value,
-                                                                             cub::detail::value_t<InputIteratorT>,
-                                                                             typename InitValueT::value_type>,
+                                                  ::cuda::std::_If<std::is_same<InitValueT, NullType>::value,
+                                                                   cub::detail::value_t<InputIteratorT>,
+                                                                   typename InitValueT::value_type>,
                                                   cub::detail::value_t<InputIteratorT>>,
           typename SelectedPolicy = DeviceScanPolicy<AccumT, ScanOpT>>
 struct DispatchScan : SelectedPolicy
diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
index 9a1cdad9704..0d0ce192415 100644
--- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -54,6 +54,8 @@
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
+#include <cuda/std/type_traits>
+
 #include <iterator>
 
 CUB_NAMESPACE_BEGIN
@@ -228,8 +230,7 @@ template <
   typename OffsetT,
   typename AccumT = detail::accumulator_t<
     ScanOpT,
-    cub::detail::
-      conditional_t<std::is_same<InitValueT, NullType>::value, cub::detail::value_t<ValuesInputIteratorT>, InitValueT>,
+    ::cuda::std::_If<std::is_same<InitValueT, NullType>::value, cub::detail::value_t<ValuesInputIteratorT>, InitValueT>,
     cub::detail::value_t<ValuesInputIteratorT>>,
   typename SelectedPolicy =
     DeviceScanByKeyPolicy<KeysInputIteratorT, AccumT, cub::detail::value_t<ValuesInputIteratorT>, ScanOpT>>
diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
index 656fc2574d9..84c81f34a98 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
@@ -58,6 +58,8 @@
 #include <thrust/iterator/reverse_iterator.h>
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
+#include <cuda/std/type_traits>
+
 #include <type_traits>
 
 #include <nv/target>
@@ -694,7 +696,7 @@ __launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContin
 template <typename KeyT, typename ValueT>
 struct DeviceSegmentedSortPolicy
 {
-  using DominantT = cub::detail::conditional_t<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>;
+  using DominantT = ::cuda::std::_If<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>;
 
   static constexpr int KEYS_ONLY = std::is_same<ValueT, cub::NullType>::value;
 
diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh
index d4b61814d0d..2998608d567 100644
--- a/cub/cub/util_device.cuh
+++ b/cub/cub/util_device.cuh
@@ -673,8 +673,7 @@ template <int PolicyPtxVersion, typename PolicyT, typename PrevPolicyT>
 struct ChainedPolicy
 {
   /// The policy for the active compiler pass
-  using ActivePolicy =
-    cub::detail::conditional_t<(CUB_PTX_ARCH < PolicyPtxVersion), typename PrevPolicyT::ActivePolicy, PolicyT>;
+  using ActivePolicy = ::cuda::std::_If<(CUB_PTX_ARCH < PolicyPtxVersion), typename PrevPolicyT::ActivePolicy, PolicyT>;
 
   /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
   template <typename FunctorT>
diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh
index d8c03500081..5afd2dd23aa 100644
--- a/cub/cub/util_type.cuh
+++ b/cub/cub/util_type.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -46,6 +46,7 @@
 #include <cub/detail/uninitialized_copy.cuh>
 
 #include <cuda/std/cstdint>
+#include <cuda/std/iterator>
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
@@ -58,12 +59,6 @@ _CCCL_DIAG_POP
 #  endif // !_CCCL_CUDACC_BELOW_11_8
 #endif // _CCCL_HAS_NV_BF16
 
-#if !defined(_CCCL_COMPILER_NVRTC)
-#  include <iterator>
-#else
-#  include <cuda/std/iterator>
-#endif
-
 CUB_NAMESPACE_BEGIN
 
 #ifndef CUB_IS_INT128_ENABLED
@@ -88,22 +83,18 @@ CUB_NAMESPACE_BEGIN
 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
 namespace detail
 {
-
-template <bool Test, class T1, class T2>
-using conditional_t = typename ::cuda::std::conditional<Test, T1, T2>::type;
-
+//! Alias to the given iterator's value_type.
+// Resolved through std::iterator_traits because users may specialize that template to describe their own iterators.
+// The libcu++ implementation (::cuda::std::iterator_traits) is used only when compiling with NVRTC.
 template <typename Iterator>
 using value_t =
-#  if !defined(_CCCL_COMPILER_NVRTC)
-  typename std::iterator_traits<Iterator>::value_type;
-#  else // defined(_CCCL_COMPILER_NVRTC)
+#  ifdef _CCCL_COMPILER_NVRTC
   typename ::cuda::std::iterator_traits<Iterator>::value_type;
+#  else // !defined(_CCCL_COMPILER_NVRTC)
+  typename std::iterator_traits<Iterator>::value_type;
 #  endif // defined(_CCCL_COMPILER_NVRTC)
 
-template <typename It,
-          typename FallbackT,
-          bool = ::cuda::std::
-            is_same<typename ::cuda::std::remove_cv<typename ::cuda::std::remove_pointer<It>::type>::type, void>::value>
+template <typename It, typename FallbackT, bool = ::cuda::std::is_void<::cuda::std::__remove_pointer_t<It>>::value>
 struct non_void_value_impl
 {
   using type = FallbackT;
@@ -112,8 +103,7 @@ struct non_void_value_impl
 template <typename It, typename FallbackT>
 struct non_void_value_impl<It, FallbackT, false>
 {
-  using type =
-    typename ::cuda::std::conditional<::cuda::std::is_same<value_t<It>, void>::value, FallbackT, value_t<It>>::type;
+  using type = ::cuda::std::_If<::cuda::std::is_void<value_t<It>>::value, FallbackT, value_t<It>>;
 };
 
 /**
@@ -326,17 +316,8 @@ private:
 template <typename T>
 struct AlignBytes
 {
-  struct Pad
-  {
-    T val;
-    char byte;
-  };
-
-  enum
-  {
-    /// The "true CUDA" alignment of T in bytes
-    ALIGN_BYTES = sizeof(Pad) - sizeof(T)
-  };
+  /// The "true CUDA" alignment of T in bytes
+  static constexpr unsigned ALIGN_BYTES = alignof(T);
 
   /// The "truly aligned" type
   using Type = T;
@@ -350,10 +331,8 @@ struct AlignBytes
     template <>                                                                                    \
     struct AlignBytes<t>                                                                           \
     {                                                                                              \
-      enum                                                                                         \
-      {                                                                                            \
-        ALIGN_BYTES = b                                                                            \
-      };                                                                                           \
+      static constexpr unsigned ALIGN_BYTES = b;                                                   \
+                                                                                                   \
       typedef __align__(b) t Type;                                                                 \
       /* TODO(bgruber): rewriting the above to using Type __align__(b) = t; does not compile :S */ \
     };
@@ -395,42 +374,31 @@ template <typename T> struct AlignBytes<const volatile T> : AlignBytes<T> {};
 template <typename T>
 struct UnitWord
 {
-  enum
-  {
-    ALIGN_BYTES = AlignBytes<T>::ALIGN_BYTES
-  };
+  static constexpr auto ALIGN_BYTES = AlignBytes<T>::ALIGN_BYTES;
 
   template <typename Unit>
   struct IsMultiple
   {
-    enum
-    {
-      UNIT_ALIGN_BYTES = AlignBytes<Unit>::ALIGN_BYTES,
-      IS_MULTIPLE      = (sizeof(T) % sizeof(Unit) == 0) && (int(ALIGN_BYTES) % int(UNIT_ALIGN_BYTES) == 0)
-    };
+    static constexpr auto UNIT_ALIGN_BYTES = AlignBytes<Unit>::ALIGN_BYTES;
+    static constexpr bool IS_MULTIPLE =
+      (sizeof(T) % sizeof(Unit) == 0) && (int(ALIGN_BYTES) % int(UNIT_ALIGN_BYTES) == 0);
   };
 
-  /// Biggest shuffle word that T is a whole multiple of and is not larger than
-  /// the alignment of T
-  using ShuffleWord = cub::detail::conditional_t<
-    IsMultiple<int>::IS_MULTIPLE,
-    unsigned int,
-    cub::detail::conditional_t<IsMultiple<short>::IS_MULTIPLE, unsigned short, unsigned char>>;
-
-  /// Biggest volatile word that T is a whole multiple of and is not larger than
-  /// the alignment of T
-  using VolatileWord = cub::detail::conditional_t<IsMultiple<long long>::IS_MULTIPLE, unsigned long long, ShuffleWord>;
-
-  /// Biggest memory-access word that T is a whole multiple of and is not larger
-  /// than the alignment of T
-  using DeviceWord = cub::detail::conditional_t<IsMultiple<longlong2>::IS_MULTIPLE, ulonglong2, VolatileWord>;
-
-  /// Biggest texture reference word that T is a whole multiple of and is not
-  /// larger than the alignment of T
-  using TextureWord =
-    cub::detail::conditional_t<IsMultiple<int4>::IS_MULTIPLE,
-                               uint4,
-                               cub::detail::conditional_t<IsMultiple<int2>::IS_MULTIPLE, uint2, ShuffleWord>>;
+  /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T
+  using ShuffleWord =
+    ::cuda::std::_If<IsMultiple<int>::IS_MULTIPLE,
+                     unsigned int,
+                     ::cuda::std::_If<IsMultiple<short>::IS_MULTIPLE, unsigned short, unsigned char>>;
+
+  /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T
+  using VolatileWord = ::cuda::std::_If<IsMultiple<long long>::IS_MULTIPLE, unsigned long long, ShuffleWord>;
+
+  /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
+  using DeviceWord = ::cuda::std::_If<IsMultiple<longlong2>::IS_MULTIPLE, ulonglong2, VolatileWord>;
+
+  /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T
+  using TextureWord = ::cuda::std::
+    _If<IsMultiple<int4>::IS_MULTIPLE, uint4, ::cuda::std::_If<IsMultiple<int2>::IS_MULTIPLE, uint2, ShuffleWord>>;
 };
 
 // float2 specialization workaround (for SM10-SM13)
@@ -483,11 +451,8 @@ struct CubVector
   static_assert(!sizeof(T), "CubVector can only have 1-4 elements");
 };
 
-enum
-{
-  /// The maximum number of elements in CUDA vector types
-  MAX_VEC_ELEMENTS = 4,
-};
+/// The maximum number of elements in CUDA vector types
+_LIBCUDACXX_INLINE_VAR constexpr int MAX_VEC_ELEMENTS = 4;
 
 /**
  * Generic vector-1 type
@@ -498,7 +463,7 @@ struct CubVector<T, 1>
   T x;
 
   using BaseType = T;
-  using Type     = CubVector<T, 1>;
+  using Type     = CubVector;
 };
 
 /**
@@ -511,7 +476,7 @@ struct CubVector<T, 2>
   T y;
 
   using BaseType = T;
-  using Type     = CubVector<T, 2>;
+  using Type     = CubVector;
 };
 
 /**
@@ -525,7 +490,7 @@ struct CubVector<T, 3>
   T z;
 
   using BaseType = T;
-  using Type     = CubVector<T, 3>;
+  using Type     = CubVector;
 };
 
 /**
@@ -540,7 +505,7 @@ struct CubVector<T, 4>
   T w;
 
   using BaseType = T;
-  using Type     = CubVector<T, 4>;
+  using Type     = CubVector;
 };
 
 /**
@@ -654,7 +619,6 @@ CUB_DEFINE_VECTOR_TYPE(double,             double)
 CUB_DEFINE_VECTOR_TYPE(bool,               uchar)
 // clang-format on
 
-// Undefine macros
 #  undef CUB_DEFINE_VECTOR_TYPE
 
 /******************************************************************************
@@ -820,28 +784,20 @@ template <typename T>
 struct DoubleBuffer
 {
   /// Pair of device buffer pointers
-  T* d_buffers[2];
+  T* d_buffers[2]{};
 
   ///  Selector into \p d_buffers (i.e., the active/valid buffer)
-  int selector;
+  int selector = 0;
 
   /// \brief Constructor
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE DoubleBuffer()
-  {
-    selector     = 0;
-    d_buffers[0] = nullptr;
-    d_buffers[1] = nullptr;
-  }
+  DoubleBuffer() = default;
 
   /// \brief Constructor
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE DoubleBuffer(T* d_current, ///< The currently valid buffer
                                                    T* d_alternate) ///< Alternate storage buffer of the same size as \p
                                                                    ///< d_current
-  {
-    selector     = 0;
-    d_buffers[0] = d_current;
-    d_buffers[1] = d_alternate;
-  }
+      : d_buffers{d_current, d_alternate}
+  {}
 
   /// \brief Return pointer to the currently valid buffer
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T* Current()
@@ -862,20 +818,18 @@ struct DoubleBuffer
 
 /**
  * \brief Defines a structure \p detector_name that is templated on type \p T.  The \p detector_name struct exposes a
- * constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name
+ * constant member \p value indicating whether or not parameter \p T exposes a nested type \p nested_type_name
  */
-#  define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \
-    template <typename T>                                                \
-    struct detector_name                                                 \
-    {                                                                    \
-      template <typename C>                                              \
-      static char& test(typename C::nested_type_name*);                  \
-      template <typename>                                                \
-      static int& test(...);                                             \
-      enum                                                               \
-      {                                                                  \
-        VALUE = sizeof(test<T>(0)) < sizeof(int)                         \
-      };                                                                 \
+#  define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name)                                  \
+    template <typename T, typename = void>                                                                \
+    struct detector_name : ::cuda::std::false_type                                                        \
+    {                                                                                                     \
+      CUB_DEPRECATED_BECAUSE("Use ::value instead") static constexpr bool VALUE = false;                  \
+    };                                                                                                    \
+    template <typename T>                                                                                 \
+    struct detector_name<T, ::cuda::std::__void_t<typename T::nested_type_name>> : ::cuda::std::true_type \
+    {                                                                                                     \
+      CUB_DEPRECATED_BECAUSE("Use ::value instead") static constexpr bool VALUE = true;                   \
     };
 
 /******************************************************************************
@@ -886,50 +840,19 @@ struct DoubleBuffer
  * \brief Determine whether or not BinaryOp's functor is of the form <tt>bool operator()(const T& a, const T&b)</tt> or
  * <tt>bool operator()(const T& a, const T&b, unsigned int idx)</tt>
  */
+template <typename T, typename BinaryOp, typename = void>
+struct BinaryOpHasIdxParam : ::cuda::std::false_type
+{
+  CUB_DEPRECATED_BECAUSE("Use ::value instead") static constexpr bool HAS_PARAM = false;
+};
+
 template <typename T, typename BinaryOp>
-struct BinaryOpHasIdxParam
+struct BinaryOpHasIdxParam<T,
+                           BinaryOp,
+                           ::cuda::std::__void_t<decltype(::cuda::std::declval<BinaryOp>()(
+                             ::cuda::std::declval<T>(), ::cuda::std::declval<T>(), int{}))>> : ::cuda::std::true_type
 {
-private:
-  /*
-      template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const>  struct SFINAE1
-     {}; template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)>        struct
-     SFINAE2 {}; template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const> struct SFINAE3 {};
-      template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)>                      struct SFINAE4
-     {};
-  */
-  template <typename BinaryOpT, bool (BinaryOpT::*)(const T& a, const T& b, int idx) const>
-  struct SFINAE5
-  {};
-  template <typename BinaryOpT, bool (BinaryOpT::*)(const T& a, const T& b, int idx)>
-  struct SFINAE6
-  {};
-  template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const>
-  struct SFINAE7
-  {};
-  template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)>
-  struct SFINAE8
-  {};
-  /*
-      template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *);
-      template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *);
-      template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
-      template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);
-  */
-  template <typename BinaryOpT>
-  _CCCL_HOST_DEVICE static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()>*);
-  template <typename BinaryOpT>
-  _CCCL_HOST_DEVICE static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()>*);
-  template <typename BinaryOpT>
-  _CCCL_HOST_DEVICE static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()>*);
-  template <typename BinaryOpT>
-  _CCCL_HOST_DEVICE static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()>*);
-
-  template <typename BinaryOpT>
-  _CCCL_HOST_DEVICE static int Test(...);
-
-public:
-  /// Whether the functor BinaryOp has a third <tt>unsigned int</tt> index param
-  static constexpr bool HAS_PARAM = sizeof(Test<BinaryOp>(nullptr)) == sizeof(char);
+  CUB_DEPRECATED_BECAUSE("Use ::value instead") static constexpr bool HAS_PARAM = true;
 };
 
 /******************************************************************************
@@ -960,13 +883,9 @@ enum Category
 template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits, typename T>
 struct BaseTraits
 {
-  /// Category
   static constexpr Category CATEGORY = _CATEGORY;
-  enum
-  {
-    PRIMITIVE = _PRIMITIVE,
-    NULL_TYPE = _NULL_TYPE,
-  };
+  static constexpr bool PRIMITIVE    = _PRIMITIVE;
+  static constexpr bool NULL_TYPE    = _NULL_TYPE;
 };
 
 /**
@@ -980,12 +899,8 @@ struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits, T>
   static constexpr Category CATEGORY       = UNSIGNED_INTEGER;
   static constexpr UnsignedBits LOWEST_KEY = UnsignedBits(0);
   static constexpr UnsignedBits MAX_KEY    = UnsignedBits(-1);
-
-  enum
-  {
-    PRIMITIVE = true,
-    NULL_TYPE = false,
-  };
+  static constexpr bool PRIMITIVE          = true;
+  static constexpr bool NULL_TYPE          = false;
 
   static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleIn(UnsignedBits key)
   {
@@ -1026,12 +941,8 @@ struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits, T>
   static constexpr UnsignedBits HIGH_BIT   = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
   static constexpr UnsignedBits LOWEST_KEY = HIGH_BIT;
   static constexpr UnsignedBits MAX_KEY    = UnsignedBits(-1) ^ HIGH_BIT;
-
-  enum
-  {
-    PRIMITIVE = true,
-    NULL_TYPE = false,
-  };
+  static constexpr bool PRIMITIVE          = true;
+  static constexpr bool NULL_TYPE          = false;
 
   static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleIn(UnsignedBits key)
   {
@@ -1178,12 +1089,8 @@ struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits, T>
   static constexpr UnsignedBits HIGH_BIT   = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
   static constexpr UnsignedBits LOWEST_KEY = UnsignedBits(-1);
   static constexpr UnsignedBits MAX_KEY    = UnsignedBits(-1) ^ HIGH_BIT;
-
-  enum
-  {
-    PRIMITIVE = true,
-    NULL_TYPE = false,
-  };
+  static constexpr bool PRIMITIVE          = true;
+  static constexpr bool NULL_TYPE          = false;
 
   static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleIn(UnsignedBits key)
   {
@@ -1240,7 +1147,6 @@ struct NumericTraits<__uint128_t>
   static constexpr Category       CATEGORY    = UNSIGNED_INTEGER;
   static constexpr UnsignedBits   LOWEST_KEY  = UnsignedBits(0);
   static constexpr UnsignedBits   MAX_KEY     = UnsignedBits(-1);
-
   static constexpr bool PRIMITIVE = false;
   static constexpr bool NULL_TYPE = false;
 
@@ -1275,7 +1181,6 @@ struct NumericTraits<__int128_t>
   static constexpr UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
   static constexpr UnsignedBits   LOWEST_KEY  = HIGH_BIT;
   static constexpr UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
-
   static constexpr bool PRIMITIVE = false;
   static constexpr bool NULL_TYPE = false;
 
diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh
index 5eebc8533db..6a0d6b9a94e 100644
--- a/cub/cub/util_vsmem.cuh
+++ b/cub/cub/util_vsmem.cuh
@@ -47,6 +47,7 @@
 #include <cub/util_type.cuh>
 
 #include <cuda/discard_memory>
+#include <cuda/std/type_traits>
 
 #include <cstdint>
 
@@ -95,7 +96,7 @@ private:
 
 public:
   // Type alias to be used for static temporary storage declaration within the algorithm's kernel
-  using static_temp_storage_t = cub::detail::conditional_t<needs_vsmem, cub::NullType, typename AgentT::TempStorage>;
+  using static_temp_storage_t = ::cuda::std::_If<needs_vsmem, cub::NullType, typename AgentT::TempStorage>;
 
   // The amount of global memory-backed virtual shared memory needed, padded to an integer multiple of 128 bytes
   static constexpr std::size_t vsmem_per_block = needs_vsmem ? (required_smem + padding_bytes) : 0;
diff --git a/cub/cub/warp/warp_exchange.cuh b/cub/cub/warp/warp_exchange.cuh
index 3f8ce8f22fb..712d0a6bcd3 100644
--- a/cub/cub/warp/warp_exchange.cuh
+++ b/cub/cub/warp/warp_exchange.cuh
@@ -48,6 +48,8 @@
 #include <cub/warp/specializations/warp_exchange_shfl.cuh>
 #include <cub/warp/specializations/warp_exchange_smem.cuh>
 
+#include <cuda/std/type_traits>
+
 CUB_NAMESPACE_BEGIN
 
 enum WarpExchangeAlgorithm
@@ -60,9 +62,9 @@ namespace detail
 {
 template <typename InputT, int ITEMS_PER_THREAD, int LOGICAL_WARP_THREADS, WarpExchangeAlgorithm WARP_EXCHANGE_ALGORITHM>
 using InternalWarpExchangeImpl =
-  cub::detail::conditional_t<WARP_EXCHANGE_ALGORITHM == WARP_EXCHANGE_SMEM,
-                             WarpExchangeSmem<InputT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS>,
-                             WarpExchangeShfl<InputT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS>>;
+  ::cuda::std::_If<WARP_EXCHANGE_ALGORITHM == WARP_EXCHANGE_SMEM,
+                   WarpExchangeSmem<InputT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS>,
+                   WarpExchangeShfl<InputT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS>>;
 } // namespace detail
 
 /**
diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh
index e9b6896ca33..7785b8992a8 100644
--- a/cub/cub/warp/warp_reduce.cuh
+++ b/cub/cub/warp/warp_reduce.cuh
@@ -49,6 +49,8 @@
 #include <cub/warp/specializations/warp_reduce_shfl.cuh>
 #include <cub/warp/specializations/warp_reduce_smem.cuh>
 
+#include <cuda/std/type_traits>
+
 CUB_NAMESPACE_BEGIN
 
 //! @rst
@@ -172,8 +174,8 @@ public:
 
   /// Internal specialization.
   /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two
-  using InternalWarpReduce = cub::detail::
-    conditional_t<IS_POW_OF_TWO, WarpReduceShfl<T, LOGICAL_WARP_THREADS>, WarpReduceSmem<T, LOGICAL_WARP_THREADS>>;
+  using InternalWarpReduce =
+    ::cuda::std::_If<IS_POW_OF_TWO, WarpReduceShfl<T, LOGICAL_WARP_THREADS>, WarpReduceSmem<T, LOGICAL_WARP_THREADS>>;
 
 #endif // DOXYGEN_SHOULD_SKIP_THIS
 
diff --git a/cub/cub/warp/warp_scan.cuh b/cub/cub/warp/warp_scan.cuh
index 71124764353..5daeec6e37d 100644
--- a/cub/cub/warp/warp_scan.cuh
+++ b/cub/cub/warp/warp_scan.cuh
@@ -49,6 +49,8 @@
 #include <cub/warp/specializations/warp_scan_shfl.cuh>
 #include <cub/warp/specializations/warp_scan_smem.cuh>
 
+#include <cuda/std/type_traits>
+
 CUB_NAMESPACE_BEGIN
 
 //! @rst
@@ -177,8 +179,8 @@ private:
 
   /// Internal specialization.
   /// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two
-  using InternalWarpScan = cub::detail::
-    conditional_t<IS_POW_OF_TWO, WarpScanShfl<T, LOGICAL_WARP_THREADS>, WarpScanSmem<T, LOGICAL_WARP_THREADS>>;
+  using InternalWarpScan =
+    ::cuda::std::_If<IS_POW_OF_TWO, WarpScanShfl<T, LOGICAL_WARP_THREADS>, WarpScanSmem<T, LOGICAL_WARP_THREADS>>;
 
   /// Shared memory storage layout type for WarpScan
   using _TempStorage = typename InternalWarpScan::TempStorage;
diff --git a/cub/test/c2h/generators.cu b/cub/test/c2h/generators.cu
index e404136932a..20952f6ff94 100644
--- a/cub/test/c2h/generators.cu
+++ b/cub/test/c2h/generators.cu
@@ -40,6 +40,8 @@
 #include <thrust/scan.h>
 #include <thrust/tabulate.h>
 
+#include <cuda/std/type_traits>
+
 #include <cstdint>
 
 #include <c2h/custom_type.cuh>
@@ -132,7 +134,7 @@ struct random_to_item_t
 template <typename T>
 struct random_to_item_t<T, cub::FLOATING_POINT>
 {
-  using storage_t = cub::detail::conditional_t<(sizeof(T) > 4), double, float>;
+  using storage_t = ::cuda::std::_If<(sizeof(T) > 4), double, float>;
   storage_t m_min;
   storage_t m_max;
 
diff --git a/cub/test/catch2_test_block_run_length_decode.cu b/cub/test/catch2_test_block_run_length_decode.cu
index 824cd473102..db2166659f9 100644
--- a/cub/test/catch2_test_block_run_length_decode.cu
+++ b/cub/test/catch2_test_block_run_length_decode.cu
@@ -33,6 +33,8 @@
 #include <cub/iterator/transform_input_iterator.cuh>
 #include <cub/util_allocator.cuh>
 
+#include <cuda/std/type_traits>
+
 #include "catch2_test_helper.h"
 
 /******************************************************************************
@@ -161,7 +163,7 @@ public:
   {
     typename BlockLoadRunItemT::TempStorage load_uniques_storage;
     typename BlockLoadRunLengthsT::TempStorage load_run_lengths_storage;
-    cub::detail::conditional_t<TEST_RUN_OFFSETS_, typename BlockRunOffsetScanT::TempStorage, cub::NullType>
+    ::cuda::std::_If<TEST_RUN_OFFSETS_, typename BlockRunOffsetScanT::TempStorage, cub::NullType>
       run_offsets_scan_storage;
     struct
     {
diff --git a/cub/test/catch2_test_util_type.cu b/cub/test/catch2_test_util_type.cu
index 9d46c3028ed..ed201a6ea48 100644
--- a/cub/test/catch2_test_util_type.cu
+++ b/cub/test/catch2_test_util_type.cu
@@ -61,3 +61,20 @@ CUB_TEST("Tests non_void_value_t", "[util][type]")
   STATIC_REQUIRE(::cuda::std::is_same<int, //
                                       cub::detail::non_void_value_t<non_void_fancy_it, fallback_t>>::value);
 }
+
+CUB_DEFINE_DETECT_NESTED_TYPE(cat_detect, cat); // introduces cat_detect<T>, queried below for a nested type named `cat`
+
+struct HasCat // fixture: declares a nested type alias `cat`
+{
+  using cat = int;
+};
+struct HasDog // fixture: declares only `dog`, i.e. no nested `cat`
+{
+  using dog = int;
+};
+
+CUB_TEST("Test CUB_DEFINE_DETECT_NESTED_TYPE", "[util][type]")
+{
+  STATIC_REQUIRE(cat_detect<HasCat>::value); // positive case: nested `cat` is present
+  STATIC_REQUIRE(!cat_detect<HasDog>::value); // negative case: nested `cat` is absent
+}
diff --git a/cub/test/catch2_test_warp_merge_sort.cu b/cub/test/catch2_test_warp_merge_sort.cu
index 6db81fb33c4..faab58054d1 100644
--- a/cub/test/catch2_test_warp_merge_sort.cu
+++ b/cub/test/catch2_test_warp_merge_sort.cu
@@ -31,6 +31,8 @@
 
 #include <thrust/iterator/constant_iterator.h>
 
+#include <cuda/std/type_traits>
+
 #include <algorithm>
 
 #include "c2h/custom_type.cuh"
@@ -403,7 +405,7 @@ CUB_TEST(
 {
   using params             = params_t<TestType>;
   using type               = typename params::type;
-  using warp_sort_delegate = cub::detail::conditional_t<params::is_stable, warp_stable_sort_keys_t, warp_sort_keys_t>;
+  using warp_sort_delegate = ::cuda::std::_If<params::is_stable, warp_stable_sort_keys_t, warp_sort_keys_t>;
 
   // Prepare test data
   c2h::device_vector<type> d_in(params::tile_size);
@@ -434,7 +436,7 @@ CUB_TEST("Warp sort keys-only on partial warp-tile works",
   using params = params_t<TestType>;
   using type   = typename params::type;
   using warp_sort_delegate =
-    cub::detail::conditional_t<params::is_stable, warp_partial_stable_sort_keys_t, warp_partial_sort_keys_t>;
+    ::cuda::std::_If<params::is_stable, warp_partial_stable_sort_keys_t, warp_partial_sort_keys_t>;
 
   // Prepare test data
   c2h::device_vector<type> d_in(params::tile_size);
@@ -468,7 +470,7 @@ CUB_TEST("Warp sort on keys-value pairs works",
   using params             = params_t<TestType>;
   using key_type           = typename params::type;
   using value_type         = typename c2h::get<4, TestType>;
-  using warp_sort_delegate = cub::detail::conditional_t<params::is_stable, warp_stable_sort_pairs_t, warp_sort_pairs_t>;
+  using warp_sort_delegate = ::cuda::std::_If<params::is_stable, warp_stable_sort_pairs_t, warp_sort_pairs_t>;
 
   // Prepare test data
   c2h::device_vector<key_type> d_keys_in(params::tile_size);
@@ -511,7 +513,7 @@ CUB_TEST("Warp sort on key-value pairs of a partial warp-tile works",
   using key_type   = typename params::type;
   using value_type = typename c2h::get<4, TestType>;
   using warp_sort_delegate =
-    cub::detail::conditional_t<params::is_stable, warp_partial_stable_sort_pairs_t, warp_partial_sort_pairs_t>;
+    ::cuda::std::_If<params::is_stable, warp_partial_stable_sort_pairs_t, warp_partial_sort_pairs_t>;
 
   // Prepare test data
   c2h::device_vector<key_type> d_keys_in(params::tile_size);
diff --git a/cub/test/catch2_test_warp_reduce.cu b/cub/test/catch2_test_warp_reduce.cu
index 9a075551d37..55c3ed3e532 100644
--- a/cub/test/catch2_test_warp_reduce.cu
+++ b/cub/test/catch2_test_warp_reduce.cu
@@ -476,8 +476,8 @@ CUB_TEST("Warp segmented sum works", "[reduce][warp]", full_type_list, logical_w
   constexpr auto segmented_mod = c2h::get<2, TestType>::value;
   static_assert(segmented_mod == reduce_mode::tail_flags || segmented_mod == reduce_mode::head_flags,
                 "Segmented tests must either be head or tail flags");
-  using warp_seg_sum_t = cub::detail::
-    conditional_t<(segmented_mod == reduce_mode::tail_flags), warp_seg_sum_tail_t<type>, warp_seg_sum_head_t<type>>;
+  using warp_seg_sum_t =
+    ::cuda::std::_If<(segmented_mod == reduce_mode::tail_flags), warp_seg_sum_tail_t<type>, warp_seg_sum_head_t<type>>;
 
   // Prepare test data
   c2h::device_vector<type> d_in(params::tile_size);
@@ -521,9 +521,9 @@ CUB_TEST("Warp segmented reduction works", "[reduce][warp]", builtin_type_list,
   static_assert(segmented_mod == reduce_mode::tail_flags || segmented_mod == reduce_mode::head_flags,
                 "Segmented tests must either be head or tail flags");
   using warp_seg_reduction_t =
-    cub::detail::conditional_t<(segmented_mod == reduce_mode::tail_flags),
-                               warp_seg_reduce_tail_t<type, red_op_t>,
-                               warp_seg_reduce_head_t<type, red_op_t>>;
+    ::cuda::std::_If<(segmented_mod == reduce_mode::tail_flags),
+                     warp_seg_reduce_tail_t<type, red_op_t>,
+                     warp_seg_reduce_head_t<type, red_op_t>>;
 
   // Prepare test data
   c2h::device_vector<type> d_in(params::tile_size);
diff --git a/cub/test/test_device_spmv.cu b/cub/test/test_device_spmv.cu
index 24a4befd7f2..2e2699dd17c 100644
--- a/cub/test/test_device_spmv.cu
+++ b/cub/test/test_device_spmv.cu
@@ -37,6 +37,8 @@
 #include <thrust/mismatch.h>
 #include <thrust/scan.h>
 
+#include <cuda/std/type_traits>
+
 #include <iostream>
 #include <type_traits>
 #include <typeinfo>
@@ -205,7 +207,7 @@ struct csr_matrix
 
 private:
   template <typename VecValueT>
-  using vector_t = cub::detail::conditional_t<HostStorage, c2h::host_vector<VecValueT>, c2h::device_vector<VecValueT>>;
+  using vector_t = ::cuda::std::_If<HostStorage, c2h::host_vector<VecValueT>, c2h::device_vector<VecValueT>>;
 
   vector_t<ValueT> m_values;
   vector_t<int> m_row_offsets;
diff --git a/docs/cub/developer_overview.rst b/docs/cub/developer_overview.rst
index 904ecb6cfe0..0a1163bbc1a 100644
--- a/docs/cub/developer_overview.rst
+++ b/docs/cub/developer_overview.rst
@@ -237,7 +237,7 @@ For example, :cpp:struct:`cub::WarpReduce` dispatches to two different implement
 
 .. code-block:: c++
 
-    using InternalWarpReduce = cub::detail::conditional_t<
+    using InternalWarpReduce = cuda::std::conditional_t<
       IS_POW_OF_TWO,
       WarpReduceShfl<T, LOGICAL_WARP_THREADS>,  // shuffle-based implementation
       WarpReduceSmem<T, LOGICAL_WARP_THREADS>>; // smem-based implementation