diff --git a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu index 89e4bc485e9..778e8b44bc0 100644 --- a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu +++ b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu @@ -52,20 +52,28 @@ struct policy_hub_t template void left(nvbench::state& state, nvbench::type_list) { - constexpr bool may_alias = false; - constexpr bool read_left = true; - using input_it_t = const T*; using output_it_t = T*; using difference_op_t = ::cuda::std::minus<>; using offset_t = cub::detail::choose_offset_t; #if !TUNE_BASE - using dispatch_t = cub:: - DispatchAdjacentDifference; + using dispatch_t = cub::DispatchAdjacentDifference< + input_it_t, + output_it_t, + difference_op_t, + offset_t, + cub::MayAlias::No, + cub::ReadOption::Left, + policy_hub_t>; #else - using dispatch_t = - cub::DispatchAdjacentDifference; + using dispatch_t = cub::DispatchAdjacentDifference< + input_it_t, + output_it_t, + difference_op_t, + offset_t, + cub::MayAlias::No, + cub::ReadOption::Left>; #endif // TUNE_BASE const auto elements = static_cast(state.get_int64("Elements{io}")); diff --git a/cub/benchmarks/bench/copy/memcpy.cu b/cub/benchmarks/bench/copy/memcpy.cu index 07162bf602a..9a2d6c09afb 100644 --- a/cub/benchmarks/bench/copy/memcpy.cu +++ b/cub/benchmarks/bench/copy/memcpy.cu @@ -25,7 +25,7 @@ * ******************************************************************************/ -#include +#include // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_BUFFERS_PER_THREAD bpt 1:18:1 @@ -184,8 +184,6 @@ void copy(nvbench::state& state, using buffer_offset_t = std::uint32_t; using block_offset_t = std::uint32_t; - constexpr bool is_memcpy = true; - #if !TUNE_BASE using policy_t = policy_hub_t; #else @@ -199,7 +197,7 @@ void copy(nvbench::state& state, buffer_offset_t, block_offset_t, policy_t, - is_memcpy>; + cub::CopyAlg::Memcpy>; thrust::device_vector input_buffer = generate(elements); thrust::device_vector output_buffer(elements); diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index b2ecd4f561c..7217ee32e6e 100644 --- a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -42,9 +42,6 @@ // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 -constexpr bool keep_rejects = true; -constexpr bool may_alias = false; - #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT @@ -119,8 +116,7 @@ void flagged(nvbench::state& state, nvbench::type_list diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu index 7d7d686b030..f4a15839668 100644 --- a/cub/benchmarks/bench/partition/if.cu +++ b/cub/benchmarks/bench/partition/if.cu @@ -42,9 +42,6 @@ // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 -constexpr bool keep_rejects = true; -constexpr bool may_alias = false; - #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT @@ -145,8 +142,7 @@ void partition(nvbench::state& state, nvbench::type_list diff --git a/cub/benchmarks/bench/radix_sort/keys.cu b/cub/benchmarks/bench/radix_sort/keys.cu index bd04bcf3d43..27ce9f3b834 100644 --- a/cub/benchmarks/bench/radix_sort/keys.cu +++ b/cub/benchmarks/bench/radix_sort/keys.cu @@ -41,8 +41,8 @@ using value_t = cub::NullType; -constexpr bool is_descending = false; -constexpr bool 
is_overwrite_ok = false; +constexpr cub::SortOrder sort_order = cub::SortOrder::Ascending; +constexpr bool is_overwrite_ok = false; #if !TUNE_BASE template @@ -105,10 +105,10 @@ constexpr std::size_t max_onesweep_temp_storage_size() using portion_offset = int; using onesweep_policy = typename policy_hub_t::policy_t::OnesweepPolicy; using agent_radix_sort_onesweep_t = - cub::AgentRadixSortOnesweep; + cub::AgentRadixSortOnesweep; using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; - using hist_agent = cub::AgentRadixSortHistogram; + using hist_agent = cub::AgentRadixSortHistogram; return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); @@ -144,9 +144,9 @@ void radix_sort_keys(std::integral_constant, nvbench::state& state, using key_t = T; #if !TUNE_BASE using policy_t = policy_hub_t; - using dispatch_t = cub::DispatchRadixSort; + using dispatch_t = cub::DispatchRadixSort; #else // TUNE_BASE - using dispatch_t = cub::DispatchRadixSort; + using dispatch_t = cub::DispatchRadixSort; #endif // TUNE_BASE constexpr int begin_bit = 0; @@ -207,7 +207,7 @@ void radix_sort_keys(std::integral_constant, nvbench::state& state, template void radix_sort_keys(std::integral_constant, nvbench::state&, nvbench::type_list) { - (void) is_descending; + (void) sort_order; (void) is_overwrite_ok; } diff --git a/cub/benchmarks/bench/radix_sort/pairs.cu b/cub/benchmarks/bench/radix_sort/pairs.cu index 35d589f453e..160a515466e 100644 --- a/cub/benchmarks/bench/radix_sort/pairs.cu +++ b/cub/benchmarks/bench/radix_sort/pairs.cu @@ -39,8 +39,8 @@ // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 -constexpr bool is_descending = false; -constexpr bool is_overwrite_ok = false; +constexpr cub::SortOrder sort_order = cub::SortOrder::Ascending; +constexpr bool is_overwrite_ok = false; #if !TUNE_BASE template @@ -103,10 +103,10 @@ constexpr std::size_t max_onesweep_temp_storage_size() using portion_offset = int; using onesweep_policy = typename policy_hub_t::policy_t::OnesweepPolicy; using agent_radix_sort_onesweep_t = - cub::AgentRadixSortOnesweep; + cub::AgentRadixSortOnesweep; using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; - using hist_agent = cub::AgentRadixSortHistogram; + using hist_agent = cub::AgentRadixSortHistogram; return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); @@ -144,9 +144,9 @@ void radix_sort_values( using value_t = ValueT; #if !TUNE_BASE using policy_t = policy_hub_t; - using dispatch_t = cub::DispatchRadixSort; + using dispatch_t = cub::DispatchRadixSort; #else // TUNE_BASE - using dispatch_t = cub::DispatchRadixSort; + using dispatch_t = cub::DispatchRadixSort; #endif // TUNE_BASE constexpr int begin_bit = 0; @@ -212,7 +212,7 @@ void radix_sort_values( template void radix_sort_values(std::integral_constant, nvbench::state&, nvbench::type_list) { - (void) is_descending; + (void) sort_order; (void) is_overwrite_ok; } diff --git a/cub/benchmarks/bench/segmented_sort/keys.cu b/cub/benchmarks/bench/segmented_sort/keys.cu index 8d793c67e44..7a5f05c00e8 100644 --- a/cub/benchmarks/bench/segmented_sort/keys.cu +++ b/cub/benchmarks/bench/segmented_sort/keys.cu @@ -153,8 +153,8 @@ void seg_sort(nvbench::state& state, const thrust::device_vector& offsets, bit_entropy entropy) { - constexpr bool is_descending = false; - constexpr bool is_overwrite_ok = false; + constexpr cub::SortOrder 
sort_order = cub::SortOrder::Ascending; + constexpr bool is_overwrite_ok = false; using offset_t = OffsetT; using begin_offset_it_t = const offset_t*; @@ -165,10 +165,10 @@ void seg_sort(nvbench::state& state, #if !TUNE_BASE using policy_t = device_seg_sort_policy_hub; using dispatch_t = // - cub::DispatchSegmentedSort; + cub::DispatchSegmentedSort; #else using dispatch_t = // - cub::DispatchSegmentedSort; + cub::DispatchSegmentedSort; #endif const auto elements = static_cast(state.get_int64("Elements{io}")); diff --git a/cub/benchmarks/bench/select/flagged.cu b/cub/benchmarks/bench/select/flagged.cu index 3a180a65adc..2562cec6c76 100644 --- a/cub/benchmarks/bench/select/flagged.cu +++ b/cub/benchmarks/bench/select/flagged.cu @@ -40,8 +40,6 @@ // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 -constexpr bool keep_rejects = false; - #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT @@ -78,17 +76,18 @@ struct policy_hub_t }; #endif // !TUNE_BASE -template -void select(nvbench::state& state, nvbench::type_list) +template +void select(nvbench::state& state, nvbench::type_list) { - using input_it_t = const T*; - using flag_it_t = const bool*; - using output_it_t = T*; - using num_selected_it_t = OffsetT*; - using select_op_t = cub::NullType; - using equality_op_t = cub::NullType; - using offset_t = OffsetT; - constexpr bool may_alias = MayAlias::value; + using input_it_t = const T*; + using flag_it_t = const bool*; + using output_it_t = T*; + using num_selected_it_t = OffsetT*; + using select_op_t = cub::NullType; + using equality_op_t = cub::NullType; + using offset_t = OffsetT; + constexpr cub::SelectImpl selection_option = + InPlace::value ? cub::SelectImpl::SelectPotentiallyInPlace : cub::SelectImpl::Select; #if !TUNE_BASE using policy_t = policy_hub_t; @@ -100,20 +99,18 @@ void select(nvbench::state& state, nvbench::type_list) select_op_t, equality_op_t, offset_t, - keep_rejects, - may_alias, + selection_option, policy_t>; #else // TUNE_BASE - using dispatch_t = cub::DispatchSelectIf< - input_it_t, - flag_it_t, - output_it_t, - num_selected_it_t, - select_op_t, - equality_op_t, - offset_t, - keep_rejects, - may_alias>; + using dispatch_t = + cub::DispatchSelectIf; #endif // !TUNE_BASE // Retrieve axis parameters @@ -165,18 +162,18 @@ void select(nvbench::state& state, nvbench::type_list) using ::cuda::std::false_type; using ::cuda::std::true_type; -#ifdef TUNE_MayAlias -using may_alias = nvbench::type_list; // expands to "false_type" or "true_type" -#else // !defined(TUNE_MayAlias) -using may_alias = nvbench::type_list; -#endif // TUNE_MayAlias +#ifdef TUNE_InPlace +using is_in_place = nvbench::type_list; // expands to "false_type" or "true_type" +#else // !defined(TUNE_InPlace) +using is_in_place = nvbench::type_list; +#endif // TUNE_InPlace // The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes // using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI). 
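For orientation, the mapping from the old (KeepRejects, MayAlias) boolean pair to the single cub::SelectImpl enumerator applied throughout this change can be sketched as below; the alias names reuse the ones defined in this benchmark, and the snippet is illustrative rather than part of the patch.

// (KeepRejects, MayAlias)                       cub::SelectImpl
// (false, false) -> selection, disjoint I/O     cub::SelectImpl::Select
// (false, true)  -> selection, in-place I/O     cub::SelectImpl::SelectPotentiallyInPlace
// (true,  false) -> partition (keep rejects)    cub::SelectImpl::Partition

// e.g. the in-place flavour of this benchmark instantiates:
using dispatch_in_place_t = cub::DispatchSelectIf<
  input_it_t, flag_it_t, output_it_t, num_selected_it_t,
  select_op_t, equality_op_t, offset_t,
  cub::SelectImpl::SelectPotentiallyInPlace>;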
using select_offset_types = nvbench::type_list; -NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, may_alias)) +NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, is_in_place)) .set_name("base") - .set_type_axes_names({"T{ct}", "OffsetT{ct}", "MayAlias{ct}"}) + .set_type_axes_names({"T{ct}", "OffsetT{ct}", "InPlace{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); diff --git a/cub/benchmarks/bench/select/if.cu b/cub/benchmarks/bench/select/if.cu index 03d6d57a1ad..c391c76eb09 100644 --- a/cub/benchmarks/bench/select/if.cu +++ b/cub/benchmarks/bench/select/if.cu @@ -42,8 +42,6 @@ // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 -constexpr bool keep_rejects = false; - #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT @@ -105,17 +103,18 @@ T value_from_entropy(double percentage) return static_cast(result); } -template -void select(nvbench::state& state, nvbench::type_list) +template +void select(nvbench::state& state, nvbench::type_list) { - using input_it_t = const T*; - using flag_it_t = cub::NullType*; - using output_it_t = T*; - using num_selected_it_t = OffsetT*; - using select_op_t = less_then_t; - using equality_op_t = cub::NullType; - using offset_t = OffsetT; - constexpr bool may_alias = MayAlias::value; + using input_it_t = const T*; + using flag_it_t = cub::NullType*; + using output_it_t = T*; + using num_selected_it_t = OffsetT*; + using select_op_t = less_then_t; + using equality_op_t = cub::NullType; + using offset_t = OffsetT; + constexpr cub::SelectImpl selection_option = + InPlace::value ? cub::SelectImpl::SelectPotentiallyInPlace : cub::SelectImpl::Select; #if !TUNE_BASE using policy_t = policy_hub_t; @@ -127,20 +126,18 @@ void select(nvbench::state& state, nvbench::type_list) select_op_t, equality_op_t, offset_t, - keep_rejects, - may_alias, + selection_option, policy_t>; #else // TUNE_BASE - using dispatch_t = cub::DispatchSelectIf< - input_it_t, - flag_it_t, - output_it_t, - num_selected_it_t, - select_op_t, - equality_op_t, - offset_t, - keep_rejects, - may_alias>; + using dispatch_t = + cub::DispatchSelectIf; #endif // TUNE_BASE // Retrieve axis parameters @@ -191,18 +188,18 @@ void select(nvbench::state& state, nvbench::type_list) using ::cuda::std::false_type; using ::cuda::std::true_type; -#ifdef TUNE_MayAlias -using may_alias = nvbench::type_list; // expands to "false_type" or "true_type" -#else // !defined(TUNE_MayAlias) -using may_alias = nvbench::type_list; -#endif // TUNE_MayAlias +#ifdef TUNE_InPlace +using is_in_place = nvbench::type_list; // expands to "false_type" or "true_type" +#else // !defined(TUNE_InPlace) +using is_in_place = nvbench::type_list; +#endif // TUNE_InPlace // The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes // using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI). 
using select_offset_types = nvbench::type_list; -NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, may_alias)) +NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, is_in_place)) .set_name("base") - .set_type_axes_names({"T{ct}", "OffsetT{ct}", "MayAlias{ct}"}) + .set_type_axes_names({"T{ct}", "OffsetT{ct}", "InPlace{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); diff --git a/cub/benchmarks/bench/select/unique.cu b/cub/benchmarks/bench/select/unique.cu index 110fe4d8fd2..88f469ae953 100644 --- a/cub/benchmarks/bench/select/unique.cu +++ b/cub/benchmarks/bench/select/unique.cu @@ -16,8 +16,6 @@ // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 -constexpr bool keep_rejects = false; - #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT @@ -54,17 +52,18 @@ struct policy_hub_t }; #endif // !TUNE_BASE -template -static void unique(nvbench::state& state, nvbench::type_list) +template +static void unique(nvbench::state& state, nvbench::type_list) { - using input_it_t = const T*; - using flag_it_t = cub::NullType*; - using output_it_t = T*; - using num_selected_it_t = OffsetT*; - using select_op_t = cub::NullType; - using equality_op_t = ::cuda::std::equal_to<>; - using offset_t = OffsetT; - constexpr bool may_alias = MayAlias::value; + using input_it_t = const T*; + using flag_it_t = cub::NullType*; + using output_it_t = T*; + using num_selected_it_t = OffsetT*; + using select_op_t = cub::NullType; + using equality_op_t = ::cuda::std::equal_to<>; + using offset_t = OffsetT; + constexpr cub::SelectImpl selection_option = + InPlace::value ? cub::SelectImpl::SelectPotentiallyInPlace : cub::SelectImpl::Select; #if !TUNE_BASE using policy_t = policy_hub_t; @@ -76,20 +75,18 @@ static void unique(nvbench::state& state, nvbench::type_list; #else // TUNE_BASE - using dispatch_t = cub::DispatchSelectIf< - input_it_t, - flag_it_t, - output_it_t, - num_selected_it_t, - select_op_t, - equality_op_t, - offset_t, - keep_rejects, - may_alias>; + using dispatch_t = + cub::DispatchSelectIf; #endif // TUNE_BASE // Retrieve axis parameters @@ -143,18 +140,18 @@ static void unique(nvbench::state& state, nvbench::type_list; // expands to "false_type" or "true_type" -#else // !defined(TUNE_MayAlias) -using may_alias = nvbench::type_list; -#endif // TUNE_MayAlias +#ifdef TUNE_InPlace +using is_in_place = nvbench::type_list; // expands to "false_type" or "true_type" +#else // !defined(TUNE_InPlace) +using is_in_place = nvbench::type_list; +#endif // TUNE_InPlace // The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes // using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI). 
using select_offset_types = nvbench::type_list; -NVBENCH_BENCH_TYPES(unique, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, may_alias)) +NVBENCH_BENCH_TYPES(unique, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, is_in_place)) .set_name("base") - .set_type_axes_names({"T{ct}", "OffsetT{ct}", "MayAlias{ct}"}) + .set_type_axes_names({"T{ct}", "OffsetT{ct}", "InPlace{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8}); diff --git a/cub/benchmarks/bench/transform/common.h b/cub/benchmarks/bench/transform/common.h index 3f8ad71f590..e0da426ccfd 100644 --- a/cub/benchmarks/bench/transform/common.h +++ b/cub/benchmarks/bench/transform/common.h @@ -80,7 +80,7 @@ void bench_transform( { state.exec(exec_tag, [&](const nvbench::launch& launch) { cub::detail::transform::dispatch_t< - false, + cub::detail::transform::requires_stable_address::no, OffsetT, ::cuda::std::tuple, RandomAccessIteratorOut, diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index b1785651f12..d7036313495 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -177,8 +178,9 @@ struct partition_distinct_output_t * num_total_items() -> total number of items across all partitions (partition only) * update_num_selected(d_num_sel_out, num_selected) -> invoked by last CTA with number of selected * - * @tparam KeepRejects - * Whether or not we push rejected items to the back of the output + * @tparam SelectImpl SelectionOpt + * SelectImpl indicating whether to partition, just selection or selection where the memory for the input and + * output may alias each other. */ template + SelectImpl SelectionOpt> struct AgentSelectIf { //--------------------------------------------------------------------- @@ -208,7 +209,9 @@ struct AgentSelectIf // updating a tile state. Similarly, we need to make sure that the load of previous tile states precede writing of // the stream-compacted items and, hence, we need a load acquire when reading those tile states. static constexpr MemoryOrder memory_order = - ((!KeepRejects) && MayAlias && (!loads_via_smem)) ? MemoryOrder::acquire_release : MemoryOrder::relaxed; + ((SelectionOpt == SelectImpl::SelectPotentiallyInPlace) && (!loads_via_smem)) + ? 
MemoryOrder::acquire_release + : MemoryOrder::relaxed; // If we need to enforce memory order for in-place stream compaction, wrap the default decoupled look-back tile // state in a helper class that enforces memory order on reads and writes @@ -847,7 +850,7 @@ struct AgentSelectIf 0, 0, num_tile_selections, - cub::Int2Type{}); + cub::Int2Type < SelectionOpt == SelectImpl::Partition > {}); return num_tile_selections; } @@ -930,7 +933,7 @@ struct AgentSelectIf num_selections_prefix, num_rejected_prefix, num_selections, - cub::Int2Type{}); + cub::Int2Type < SelectionOpt == SelectImpl::Partition > {}); return num_selections; } @@ -1019,33 +1022,4 @@ struct AgentSelectIf } // namespace select } // namespace detail -template -using partition_distinct_output_t CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the " - "public interface will be removed.") = - detail::select::partition_distinct_output_t; - -template -using AgentSelectIf CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " - "interface will be removed.") = - detail::select::AgentSelectIf< - AgentSelectIfPolicyT, - InputIteratorT, - FlagsInputIteratorT, - OutputIteratorWrapperT, - SelectOpT, - EqualityOpT, - OffsetT, - StreamingContextT, - KeepRejects, - MayAlias>; - CUB_NAMESPACE_END diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index b910bb91a2b..9548739885f 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -110,8 +110,8 @@ CUB_NAMESPACE_BEGIN struct DeviceAdjacentDifference { private: - template ; using DispatchT = - DispatchAdjacentDifference; + DispatchAdjacentDifference; return DispatchT::Dispatch( d_temp_storage, temp_storage_bytes, d_input, d_output, static_cast(num_items), difference_op, stream); @@ -257,10 +257,7 @@ public: { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeftCopy"); - constexpr bool may_alias = false; - constexpr bool read_left = true; - - return AdjacentDifference( + return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } @@ -370,10 +367,7 @@ public: { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeft"); - constexpr bool may_alias = true; - constexpr bool read_left = true; - - return AdjacentDifference( + return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } @@ -500,10 +494,7 @@ public: { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractRightCopy"); - constexpr bool may_alias = false; - constexpr bool read_left = false; - - return AdjacentDifference( + return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } @@ -602,10 +593,7 @@ public: { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractRight"); - constexpr bool may_alias = true; - constexpr bool read_left = false; - - return AdjacentDifference( + return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } }; diff --git a/cub/cub/device/device_copy.cuh b/cub/cub/device/device_copy.cuh index bc8cd343b18..a13f38591a6 100644 --- a/cub/cub/device/device_copy.cuh +++ b/cub/cub/device/device_copy.cuh @@ -190,7 +190,7 @@ struct DeviceCopy RangeOffsetT, BlockOffsetT, 
detail::batch_memcpy::policy_hub, - false>::Dispatch(d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream); + CopyAlg::Copy>::Dispatch(d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream); } }; diff --git a/cub/cub/device/device_memcpy.cuh b/cub/cub/device/device_memcpy.cuh index dae104ce51f..70f96d092dd 100644 --- a/cub/cub/device/device_memcpy.cuh +++ b/cub/cub/device/device_memcpy.cuh @@ -197,13 +197,13 @@ struct DeviceMemcpy BufferOffsetT, BlockOffsetT, detail::batch_memcpy::policy_hub, - true>::Dispatch(d_temp_storage, - temp_storage_bytes, - input_buffer_it, - output_buffer_it, - buffer_sizes, - num_buffers, - stream); + CopyAlg::Memcpy>::Dispatch(d_temp_storage, + temp_storage_bytes, + input_buffer_it, + output_buffer_it, + buffer_sizes, + num_buffers, + stream); } }; diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index 768d8413e6f..b381800ff65 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -200,7 +200,7 @@ struct DevicePartition SelectOp, EqualityOp, OffsetT, - true>; + SelectImpl::Partition>; // Check if the number of items exceeds the range covered by the selected signed offset type cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items); @@ -365,7 +365,7 @@ struct DevicePartition SelectOp, EqualityOp, OffsetT, - true>; + SelectImpl::Partition>; return DispatchSelectIfT::Dispatch( d_temp_storage, @@ -381,7 +381,7 @@ struct DevicePartition } private: - template + template CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( ::cuda::std::false_type, void* d_temp_storage, @@ -137,7 +137,7 @@ private: int end_bit, cudaStream_t stream); - template + template CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( ::cuda::std::true_type, void* d_temp_storage, @@ -151,25 +151,21 @@ private: int end_bit, cudaStream_t stream) { - return DispatchRadixSort< - IsDescending, - KeyT, - ValueT, - OffsetT, - detail::radix::policy_hub, - DecomposerT>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - static_cast(num_items), - begin_bit, - end_bit, - is_overwrite_okay, - stream, - decomposer); + return DispatchRadixSort, DecomposerT>:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + static_cast(num_items), + begin_bit, + end_bit, + is_overwrite_okay, + stream, + decomposer); } - template + template CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( ::cuda::std::false_type, void* d_temp_storage, @@ -181,7 +177,7 @@ private: DecomposerT decomposer, cudaStream_t stream); - template + template CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( ::cuda::std::true_type, void* d_temp_storage, @@ -196,7 +192,7 @@ private: constexpr int begin_bit = 0; const int end_bit = detail::radix::traits_t::default_end_bit(decomposer); - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( ::cuda::std::true_type{}, d_temp_storage, temp_storage_bytes, @@ -354,7 +350,7 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -500,11 +496,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. 
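For comparison, the call-site change applied to the radix-sort entry points in this file can be sketched as follows; the argument names are the ones used by the surrounding overloads, and the snippet is illustrative only.

// Before: the leading template argument was a bare bool (false meaning ascending).
// cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(...);

// After: the sort direction is named explicitly.
cub::DispatchRadixSort<cub::SortOrder::Ascending, KeyT, ValueT, OffsetT>::Dispatch(
  d_temp_storage, temp_storage_bytes, d_keys, d_values,
  num_items, begin_bit, end_bit, is_overwrite_okay, stream);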
constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -639,11 +634,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -786,7 +780,7 @@ public: constexpr bool is_overwrite_okay = true; - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } @@ -909,9 +903,8 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = false; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -1055,9 +1048,8 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = false; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -1200,7 +1192,7 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } @@ -1340,11 +1332,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = true; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -1481,11 +1472,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. 
constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = true; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -1623,7 +1613,7 @@ public: constexpr bool is_overwrite_okay = true; - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } @@ -1747,9 +1737,8 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = true; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -1894,9 +1883,8 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = true; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -2035,7 +2023,7 @@ public: // Null value type DoubleBuffer d_values; - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -2168,11 +2156,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -2297,11 +2284,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -2429,7 +2415,7 @@ public: // Null value type DoubleBuffer d_values; - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } @@ -2539,10 +2525,9 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = false; DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -2673,10 +2658,9 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = false; DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -2804,7 +2788,7 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } @@ -2930,11 +2914,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. 
constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = true; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -3057,11 +3040,10 @@ public: // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; - constexpr bool is_descending = true; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -3184,7 +3166,7 @@ public: // Null value type DoubleBuffer d_values; - return DispatchRadixSort::Dispatch( + return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } @@ -3295,10 +3277,9 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = true; DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, @@ -3430,10 +3411,9 @@ public: "arithmetic types"); constexpr bool is_overwrite_okay = true; - constexpr bool is_descending = true; DoubleBuffer d_values; - return DeviceRadixSort::custom_radix_sort( + return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 0c1638bd955..63129dec5a6 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -1157,7 +1157,6 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; using AccumT = ::cuda::std::__accumulator_t, InitValueT>; - constexpr bool ForceInclusive = true; return DispatchScan< InputIteratorT, @@ -1167,14 +1166,14 @@ struct DeviceScan OffsetT, AccumT, detail::scan::policy_hub, - ForceInclusive>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - detail::InputValue(init_value), - num_items, - stream); + ForceInclusive::Yes>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + detail::InputValue(init_value), + num_items, + stream); } //! @rst diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index ae47119bfa3..ef6d2369f22 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -249,19 +249,20 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream); + return DispatchSegmentedRadixSort:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream); } //! 
@rst @@ -423,19 +424,20 @@ public: // Signed integer type for global offsets using OffsetT = int; - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream); + return DispatchSegmentedRadixSort:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream); } //! @rst @@ -597,19 +599,24 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream); + return DispatchSegmentedRadixSort< + SortOrder::Descending, + KeyT, + ValueT, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream); } //! @rst @@ -775,19 +782,24 @@ public: // Signed integer type for global offsets using OffsetT = int; - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream); + return DispatchSegmentedRadixSort< + SortOrder::Descending, + KeyT, + ValueT, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream); } //! @} end member group @@ -936,19 +948,24 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream); + return DispatchSegmentedRadixSort< + SortOrder::Ascending, + KeyT, + NullType, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream); } //! @rst @@ -1102,19 +1119,24 @@ public: // Null value type DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream); + return DispatchSegmentedRadixSort< + SortOrder::Ascending, + KeyT, + NullType, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream); } //! 
@rst @@ -1259,19 +1281,24 @@ public: DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream); + return DispatchSegmentedRadixSort< + SortOrder::Descending, + KeyT, + NullType, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream); } //! @rst @@ -1423,19 +1450,24 @@ public: // Null value type DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream); + return DispatchSegmentedRadixSort< + SortOrder::Descending, + KeyT, + NullType, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream); } //! @} end member group diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 2347666289e..2816cf562ff 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -149,13 +149,12 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = false; constexpr bool is_overwrite_okay = false; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; @@ -325,13 +324,12 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = true; constexpr bool is_overwrite_okay = false; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; @@ -495,12 +493,11 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = false; constexpr bool is_overwrite_okay = true; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_values; @@ -666,12 +663,11 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = true; constexpr bool is_overwrite_okay = true; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_values; @@ -1388,13 +1384,12 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = false; constexpr bool is_overwrite_okay = false; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); @@ -1591,13 +1586,12 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = true; constexpr bool is_overwrite_okay = false; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - 
DispatchSegmentedSort; + DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); @@ -1788,13 +1782,12 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = false; constexpr bool is_overwrite_okay = true; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; return DispatchT::Dispatch( d_temp_storage, @@ -1987,13 +1980,12 @@ private: EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { - constexpr bool is_descending = true; constexpr bool is_overwrite_okay = true; using OffsetT = detail::choose_signed_offset_t>; using DispatchT = - DispatchSegmentedSort; + DispatchSegmentedSort; return DispatchT::Dispatch( d_temp_storage, diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 7d5099ca7e1..3cb0c2aeb73 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -190,8 +190,7 @@ struct DeviceSelect SelectOp, EqualityOp, OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ false>::Dispatch(d_temp_storage, + SelectImpl::Select>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_flags, @@ -298,25 +297,25 @@ struct DeviceSelect using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) - return DispatchSelectIf< - IteratorT, - FlagIterator, - IteratorT, - NumSelectedIteratorT, - SelectOp, - EqualityOp, - OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ true>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_data, // in - d_flags, - d_data, // out - d_num_selected_out, - SelectOp(), - EqualityOp(), - num_items, - stream); + return DispatchSelectIf:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_data, // in + d_flags, + d_data, // out + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream); } //! @rst @@ -445,8 +444,7 @@ struct DeviceSelect SelectOp, EqualityOp, OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ false>::Dispatch(d_temp_storage, + SelectImpl::Select>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, nullptr, @@ -565,27 +563,25 @@ struct DeviceSelect using FlagIterator = NullType*; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) - constexpr bool may_alias = true; - - return DispatchSelectIf< - IteratorT, - FlagIterator, - IteratorT, - NumSelectedIteratorT, - SelectOp, - EqualityOp, - OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ may_alias>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_data, // in - nullptr, - d_data, // out - d_num_selected_out, - select_op, - EqualityOp(), - num_items, - stream); + return DispatchSelectIf:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_data, // in + nullptr, + d_data, // out + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream); } //! 
@rst @@ -694,8 +690,7 @@ struct DeviceSelect SelectOp, EqualityOp, OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ false>::Dispatch(d_temp_storage, + SelectImpl::Select>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_flags, @@ -792,25 +787,25 @@ struct DeviceSelect using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets using EqualityOp = NullType; // Equality operator (not used) - return DispatchSelectIf< - IteratorT, - FlagIterator, - IteratorT, - NumSelectedIteratorT, - SelectOp, - EqualityOp, - OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ true>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_data, // in - d_flags, - d_data, // out - d_num_selected_out, - select_op, - EqualityOp(), - num_items, - stream); + return DispatchSelectIf:: + Dispatch( + d_temp_storage, + temp_storage_bytes, + d_data, // in + d_flags, + d_data, // out + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream); } //! @rst @@ -919,8 +914,7 @@ struct DeviceSelect SelectOp, EqualityOp, OffsetT, - /*KeepRejects*/ false, - /*MayAlias*/ false>::Dispatch(d_temp_storage, + SelectImpl::Select>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, nullptr, diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 7c19fce3f52..89cd804ea79 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -74,10 +74,16 @@ struct DeviceTransform return error; } - return detail::transform:: - dispatch_t, RandomAccessIteratorOut, TransformOp>:: - dispatch( - ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); + return detail::transform::dispatch_t< + detail::transform::requires_stable_address::no, + offset_t, + ::cuda::std::tuple, + RandomAccessIteratorOut, + TransformOp>::dispatch(::cuda::std::move(inputs), + ::cuda::std::move(output), + num_items, + ::cuda::std::move(transform_op), + stream); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -212,10 +218,16 @@ struct DeviceTransform return error; } - return detail::transform:: - dispatch_t, RandomAccessIteratorOut, TransformOp>:: - dispatch( - ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); + return detail::transform::dispatch_t< + detail::transform::requires_stable_address::yes, + offset_t, + ::cuda::std::tuple, + RandomAccessIteratorOut, + TransformOp>::dispatch(::cuda::std::move(inputs), + ::cuda::std::move(output), + num_items, + ::cuda::std::move(transform_op), + stream); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index 11ea8be82b8..8138f7c087d 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -39,6 +39,7 @@ #include #include +#include #include #include #include @@ -106,13 +107,19 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel( } // namespace detail::adjacent_difference +enum class ReadOption +{ + Left, + Right +}; + template > + MayAlias AliasOpt, + ReadOption ReadOpt, + typename PolicyHub = detail::adjacent_difference::policy_hub> struct DispatchAdjacentDifference { using InputT = typename std::iterator_traits::value_type; @@ -155,10 +162,10 @@ struct DispatchAdjacentDifference constexpr int tile_size = AdjacentDifferencePolicyT::ITEMS_PER_TILE; const int num_tiles = 
static_cast(::cuda::ceil_div(num_items, tile_size)); - std::size_t first_tile_previous_size = MayAlias * num_tiles * sizeof(InputT); + std::size_t first_tile_previous_size = (AliasOpt == MayAlias::Yes) * num_tiles * sizeof(InputT); void* allocations[1] = {nullptr}; - std::size_t allocation_sizes[1] = {MayAlias * first_tile_previous_size}; + std::size_t allocation_sizes[1] = {(AliasOpt == MayAlias::Yes) * first_tile_previous_size}; error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); @@ -187,10 +194,10 @@ struct DispatchAdjacentDifference auto first_tile_previous = reinterpret_cast(allocations[0]); - if (MayAlias) + _CCCL_IF_CONSTEXPR (AliasOpt == MayAlias::Yes) { using AgentDifferenceInitT = - detail::adjacent_difference::AgentDifferenceInit; + detail::adjacent_difference::AgentDifferenceInit; constexpr int init_block_size = AgentDifferenceInitT::BLOCK_THREADS; const int init_grid_size = ::cuda::ceil_div(num_tiles, init_block_size); @@ -236,15 +243,14 @@ struct DispatchAdjacentDifference THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream) - .doit(detail::adjacent_difference::DeviceAdjacentDifferenceDifferenceKernel< - typename PolicyHub::MaxPolicy, - InputIteratorT, - OutputIteratorT, - DifferenceOpT, - OffsetT, - InputT, - MayAlias, - ReadLeft>, + .doit(detail::adjacent_difference::DeviceAdjacentDifferenceDifferenceKernel < typename PolicyHub::MaxPolicy, + InputIteratorT, + OutputIteratorT, + DifferenceOpT, + OffsetT, + InputT, + AliasOpt == MayAlias::Yes, + ReadOpt == ReadOption::Left >, d_input, first_tile_previous, d_output, diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 16650f41eef..073105c473c 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -62,6 +62,12 @@ CUB_NAMESPACE_BEGIN +enum class CopyAlg +{ + Memcpy, + Copy +}; + namespace detail { namespace batch_memcpy @@ -92,7 +98,7 @@ template + CopyAlg MemcpyOpt> __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void MultiBlockBatchMemcpyKernel( InputBufferIt input_buffer_it, @@ -106,8 +112,9 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT; using BufferSizeT = value_t; /// Internal load/store type. 
For byte-wise memcpy, a single-byte type - using AliasT = typename ::cuda::std:: - conditional, std::iterator_traits>>::type::value_type; + using AliasT = typename ::cuda::std::conditional, + std::iterator_traits>>::type::value_type; /// Types of the input and output buffers using InputBufferT = value_t; using OutputBufferT = value_t; @@ -164,15 +171,17 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO { if (thread_offset < buffer_sizes[buffer_id]) { - const auto value = read_item(input_buffer_it[buffer_id], thread_offset); - write_item(output_buffer_it[buffer_id], thread_offset, value); + const auto value = read_item < MemcpyOpt == CopyAlg::Memcpy, AliasT, + InputBufferT > (input_buffer_it[buffer_id], thread_offset); + write_item( + output_buffer_it[buffer_id], thread_offset, value); } thread_offset += BLOCK_THREADS; } } else { - copy_items( + copy_items( input_buffer_it[buffer_id], output_buffer_it[buffer_id], (::cuda::std::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE), @@ -216,7 +225,7 @@ template + CopyAlg MemcpyOpt> __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void BatchMemcpyKernel( InputBufferIt input_buffer_it, @@ -250,7 +259,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLO BlockOffsetT, BLevBufferOffsetTileState, BLevBlockOffsetTileState, - IsMemcpy>; + MemcpyOpt == CopyAlg::Memcpy>; // Shared memory for AgentBatchMemcpy __shared__ typename AgentBatchMemcpyT::TempStorage temp_storage; @@ -289,7 +298,7 @@ template , - bool IsMemcpy = true> + CopyAlg MemcpyOpt = CopyAlg::Memcpy> struct DispatchBatchMemcpy { //------------------------------------------------------------------------------ @@ -387,8 +396,10 @@ struct DispatchBatchMemcpy // The number of thread blocks (or tiles) required to process all of the given buffers BlockOffsetT num_tiles = ::cuda::ceil_div(num_buffers, TILE_SIZE); - using BlevBufferSrcsOutT = ::cuda::std::_If>; - using BlevBufferDstOutT = ::cuda::std::_If>; + using BlevBufferSrcsOutT = + ::cuda::std::_If>; + using BlevBufferDstOutT = + ::cuda::std::_If>; using BlevBufferSrcsOutItT = BlevBufferSrcsOutT*; using BlevBufferDstsOutItT = BlevBufferDstOutT*; using BlevBufferSizesOutItT = BufferSizeT*; @@ -479,7 +490,7 @@ struct DispatchBatchMemcpy BlockOffsetT, BLevBufferOffsetTileState, BLevBlockOffsetTileState, - IsMemcpy>; + MemcpyOpt>; auto multi_block_memcpy_kernel = detail::batch_memcpy::MultiBlockBatchMemcpyKernel< typename PolicyHub::MaxPolicy, @@ -490,7 +501,7 @@ struct DispatchBatchMemcpy BlevBufferTileOffsetsOutItT, BLevBufferOffsetTileState, BlockOffsetT, - IsMemcpy>; + MemcpyOpt>; constexpr uint32_t BLEV_BLOCK_THREADS = ActivePolicyT::AgentLargeBufferPolicyT::BLOCK_THREADS; diff --git a/cub/cub/device/dispatch/dispatch_common.cuh b/cub/cub/device/dispatch/dispatch_common.cuh new file mode 100644 index 00000000000..a9ad5ad39ff --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_common.cuh @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +CUB_NAMESPACE_BEGIN + +// Options for specifying memory aliasing +enum class MayAlias +{ + Yes, + No +}; + +// Options for specifying sorting order. +enum class SortOrder +{ + Ascending, + Descending +}; + +// Options for specifying the behavior of the stream compaction algorithm. +enum class SelectImpl +{ + // Stream compaction, discarding rejected items. It's guaranteed that memory of input and output are disjoint. + Select, + // Stream compaction, discarding rejected items. Memory of the input may be identical to the memory of the output. + SelectPotentiallyInPlace, + // Partition, keeping rejected items. It's guaranteed that memory of input and output are disjoint. + Partition +}; + +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index d0a08110618..e6081e56528 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -90,8 +91,8 @@ namespace detail::radix_sort * @tparam ALT_DIGIT_BITS * Whether or not to use the alternate (lower-bits) policy * - * @tparam IS_DESCENDING - * Whether or not the sorted-order is high-to-low + * @tparam SortOrder + * Whether to sort in ascending or descending order * * @tparam KeyT * Key type @@ -120,7 +121,7 @@ namespace detail::radix_sort */ template @@ -168,7 +169,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp __syncthreads(); // Write out digit counts (striped) - upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); + upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); } /** @@ -234,8 +235,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), * @tparam ALT_DIGIT_BITS * Whether or not to use the alternate (lower-bits) policy * - * @tparam IS_DESCENDING - * Whether or not the sorted-order is high-to-low + * @tparam SortOrder + * Whether to sort in ascending or descending order * * @tparam KeyT * Key type @@ -276,7 +277,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), */ template ; + AgentRadixSortDownsweep; // Shared memory storage __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage; @@ -334,8 +335,8 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo * @tparam ChainedPolicyT * Chained tuning policy * - * @tparam IS_DESCENDING - * Whether or not the sorted-order is high-to-low + * @tparam SortOrder + * Whether or not to use the alternate (lower-bits) policy * * @tparam KeyT * Key type @@ -368,7 +369,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? 
int(ChainedPolicyT::ActivePolicy::AltDo * The past-the-end (most-significant) bit index needed for key comparison */ template (default_key_bits); @@ -453,7 +454,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE // Sort tile BlockRadixSortT(temp_storage.sort) .SortBlockedToStriped( - keys, values, current_bit, end_bit, Int2Type(), Int2Type(), decomposer); + keys, values, current_bit, end_bit, Int2Type(), Int2Type(), decomposer); // Store keys and values #pragma unroll @@ -480,8 +481,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE * @tparam ALT_DIGIT_BITS * Whether or not to use the alternate (lower-bits) policy * - * @tparam IS_DESCENDING - * Whether or not the sorted-order is high-to-low + * @tparam SortOrder + * Whether to sort in ascending or descending order * * @tparam KeyT * Key type @@ -533,7 +534,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE */ template ; // Downsweep type - using BlockDownsweepT = - detail::radix_sort::AgentRadixSortDownsweep; + using BlockDownsweepT = detail::radix_sort:: + AgentRadixSortDownsweep; enum { @@ -629,7 +630,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen __syncthreads(); - if (IS_DESCENDING) + if (Order == SortOrder::Descending) { // Reverse bin counts #pragma unroll @@ -668,7 +669,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen bin_offset[track] += segment_begin; } - if (IS_DESCENDING) + if (Order == SortOrder::Descending) { // Reverse bin offsets #pragma unroll @@ -725,7 +726,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen * Histogram kernel */ template @@ -734,15 +735,15 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int start_bit, int end_bit, DecomposerT decomposer = {}) { using HistogramPolicyT = typename ChainedPolicyT::ActivePolicy::HistogramPolicy; - using AgentT = - detail::radix_sort::AgentRadixSortHistogram; + using AgentT = detail::radix_sort:: + AgentRadixSortHistogram; __shared__ typename AgentT::TempStorage temp_storage; AgentT agent(temp_storage, d_bins_out, d_keys_in, num_items, start_bit, end_bit, decomposer); agent.Process(); } template ; + using AgentT = detail::radix_sort::AgentRadixSortOnesweep< + OnesweepPolicyT, + Order == SortOrder::Descending, + KeyT, + ValueT, + OffsetT, + PortionOffsetT, + DecomposerT>; __shared__ typename AgentT::TempStorage s; AgentT agent( @@ -839,8 +846,8 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_b /** * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort * - * @tparam IS_DESCENDING - * Whether or not the sorted-order is high-to-low + * @tparam SortOrder + * Whether to sort in ascending or descending order * * @tparam KeyT * Key type @@ -855,7 +862,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_b * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. 
*/ -template ; + detail::radix_sort::DeviceRadixSortHistogramKernel; error = CubDebug( cudaOccupancyMaxActiveBlocksPerMultiprocessor(&histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0)); @@ -1406,7 +1413,7 @@ struct DispatchRadixSort auto onesweep_kernel = detail::radix_sort::DeviceRadixSortOnesweepKernel< max_policy_t, - IS_DESCENDING, + Order, KeyT, ValueT, OffsetT, @@ -1655,13 +1662,11 @@ struct DispatchRadixSort { // Invoke upsweep-downsweep return InvokePasses( - detail::radix_sort::DeviceRadixSortUpsweepKernel, - detail::radix_sort::DeviceRadixSortUpsweepKernel, + detail::radix_sort::DeviceRadixSortUpsweepKernel, + detail::radix_sort::DeviceRadixSortUpsweepKernel, detail::radix_sort::RadixSortScanBinsKernel, - detail::radix_sort:: - DeviceRadixSortDownsweepKernel, - detail::radix_sort:: - DeviceRadixSortDownsweepKernel); + detail::radix_sort::DeviceRadixSortDownsweepKernel, + detail::radix_sort::DeviceRadixSortDownsweepKernel); } template @@ -1761,8 +1766,7 @@ struct DispatchRadixSort { // Small, single tile size return InvokeSingleTile( - detail::radix_sort:: - DeviceRadixSortSingleTileKernel); + detail::radix_sort::DeviceRadixSortSingleTileKernel); } else { @@ -1866,8 +1870,8 @@ struct DispatchRadixSort * @brief Utility class for dispatching the appropriately-tuned kernels for segmented device-wide * radix sort * - * @tparam IS_DESCENDING - * Whether or not the sorted-order is high-to-low + * @tparam SortOrder + * Whether to sort in ascending or descending order * * @tparam KeyT * Key type @@ -1884,7 +1888,7 @@ struct DispatchRadixSort * @tparam OffsetT * Signed integer type for global offsets */ -template + ForceInclusive EnforceInclusive> struct DeviceScanKernelSource { using ScanTileStateT = typename cub::ScanTileState; @@ -90,7 +95,7 @@ struct DeviceScanKernelSource InitValueT, OffsetT, AccumT, - ForceInclusive>) + EnforceInclusive == ForceInclusive::Yes>) CUB_RUNTIME_FUNCTION static constexpr std::size_t AccumSize() { @@ -124,8 +129,8 @@ struct DeviceScanKernelSource * @tparam OffsetT * Unsigned integer type for global offsets * - * @tparam ForceInclusive - * Boolean flag to force InclusiveScan invocation when true. + * @tparam EnforceInclusive + * Enum flag to specify whether to enforce inclusive scan. 
* */ template , - ::cuda::std::_If<::cuda::std::is_same_v, - cub::detail::value_t, - typename InitValueT::value_type>>, - typename PolicyHub = detail::scan::policy_hub, - bool ForceInclusive = false, - typename KernelSource = detail::scan::DeviceScanKernelSource< - typename PolicyHub::MaxPolicy, - InputIteratorT, - OutputIteratorT, - ScanOpT, - InitValueT, - OffsetT, - AccumT, - ForceInclusive>, + typename AccumT = ::cuda::std::__accumulator_t, + ::cuda::std::_If<::cuda::std::is_same_v, + cub::detail::value_t, + typename InitValueT::value_type>>, + typename PolicyHub = detail::scan::policy_hub, + ForceInclusive EnforceInclusive = ForceInclusive::No, + typename KernelSource = detail::scan::DeviceScanKernelSource< + typename PolicyHub::MaxPolicy, + InputIteratorT, + OutputIteratorT, + ScanOpT, + InitValueT, + OffsetT, + AccumT, + EnforceInclusive>, typename KernelLauncherFactory = detail::TripleChevronFactory> struct DispatchScan diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index e445ebcb5ad..f742914e5b4 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -148,7 +148,7 @@ _CCCL_HOST_DEVICE OffsetIteratorT make_offset_iterator(cons * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. */ -template ; + radix_sort::AgentSegmentedRadixSort; using WarpReduceT = cub::WarpReduce; using AgentWarpMergeSortT = - sub_warp_merge_sort::AgentSubWarpSort; + sub_warp_merge_sort::AgentSubWarpSort; __shared__ union { @@ -325,7 +325,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD * `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the ith is * considered empty. */ -template (SmallPolicyT::WARP_THREADS); using MediumAgentWarpMergeSortT = - sub_warp_merge_sort::AgentSubWarpSort; + sub_warp_merge_sort::AgentSubWarpSort; using SmallAgentWarpMergeSortT = - sub_warp_merge_sort::AgentSubWarpSort; + sub_warp_merge_sort::AgentSubWarpSort; constexpr auto segments_per_medium_block = static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK); @@ -449,7 +449,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic * `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the ith is * considered empty. 
*/ -template ; + radix_sort::AgentSegmentedRadixSort; __shared__ typename AgentSegmentedRadixSortT::TempStorage storage; @@ -748,7 +748,7 @@ __launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContin } // namespace detail::segmented_sort -template ( detail::segmented_sort::DeviceSegmentedSortKernelLarge< - IS_DESCENDING, + Order, MaxPolicyT, KeyT, ValueT, @@ -1086,7 +1086,7 @@ struct DispatchSegmentedSort StreamingEndOffsetIteratorT, OffsetT>, detail::segmented_sort::DeviceSegmentedSortKernelSmall< - IS_DESCENDING, + Order, MaxPolicyT, KeyT, ValueT, @@ -1110,7 +1110,7 @@ struct DispatchSegmentedSort error = SortWithoutPartitioning( detail::segmented_sort::DeviceSegmentedSortFallbackKernel< - IS_DESCENDING, + Order, MaxPolicyT, KeyT, ValueT, diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index c6f2acc993a..a0f37e796b1 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -45,6 +45,7 @@ #endif // no system header #include +#include #include #include #include @@ -191,7 +192,7 @@ public: /** * @brief Wrapper that partially specializes the `AgentSelectIf` on the non-type name parameter `KeepRejects`. */ -template +template struct agent_select_if_wrapper_t { // Using an explicit list of template parameters forwarded to AgentSelectIf, since MSVC complains about a template @@ -213,8 +214,7 @@ struct agent_select_if_wrapper_t EqualityOpT, OffsetT, StreamingContextT, - KeepRejects, - MayAlias> + SelectionOpt> { using AgentSelectIf::AgentSelectIf; + SelectionOpt>::AgentSelectIf; }; }; @@ -321,12 +320,11 @@ template + SelectImpl SelectionOpt> __launch_bounds__(int( vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT, - agent_select_if_wrapper_t::template agent_t, + agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -349,7 +347,7 @@ __launch_bounds__(int( { using VsmemHelperT = vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT, - agent_select_if_wrapper_t::template agent_t, + agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -409,8 +407,9 @@ __launch_bounds__(int( * @tparam OffsetT * Signed integer type for global offsets * - * @tparam KeepRejects - * Whether or not we push rejected items to the back of the output + * @tparam SelectImpl SelectionOpt + * SelectImpl indicating whether to partition, just selection or selection where the memory for the input and + * output may alias each other. 
*/ template , cub::detail::value_t, detail::select::per_partition_offset_t, - MayAlias, - KeepRejects>> + (SelectionOpt == SelectImpl::SelectPotentiallyInPlace), + (SelectionOpt == SelectImpl::Partition)>> struct DispatchSelectIf { /****************************************************************************** @@ -561,7 +559,7 @@ struct DispatchSelectIf using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< Policy, - detail::select::agent_select_if_wrapper_t::template agent_t, + detail::select::agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -764,8 +762,7 @@ struct DispatchSelectIf EqualityOpT, per_partition_offset_t, streaming_context_t, - KeepRejects, - MayAlias>); + SelectionOpt>); } /** diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index 5e796e14e2d..b81eb164c37 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -49,6 +49,13 @@ CUB_NAMESPACE_BEGIN namespace detail::transform { + +enum class requires_stable_address +{ + no, + yes +}; + template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment) { @@ -608,21 +615,21 @@ struct prefetch_config int sm_count; }; -template > + typename PolicyHub = policy_hub> struct dispatch_t; -template -struct dispatch_t, RandomAccessIteratorOut, diff --git a/cub/test/catch2_test_device_segmented_radix_sort_keys.cu b/cub/test/catch2_test_device_segmented_radix_sort_keys.cu index 224caf9a43b..ba2538a4976 100644 --- a/cub/test/catch2_test_device_segmented_radix_sort_keys.cu +++ b/cub/test/catch2_test_device_segmented_radix_sort_keys.cu @@ -50,7 +50,7 @@ // TODO replace with DeviceSegmentedRadixSort::SortKeys interface once https://github.com/NVIDIA/cccl/issues/50 is // addressed Temporary wrapper that allows specializing the DeviceSegmentedRadixSort algorithm for different offset // types -template +template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_radix_sort_wrapper( void* d_temp_storage, size_t& temp_storage_bytes, @@ -69,7 +69,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_rad cub::DoubleBuffer d_values; cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); auto status = cub::DispatchSegmentedRadixSort< - IS_DESCENDING, + Order, KeyT, cub::NullType, BeginOffsetIteratorT, @@ -103,8 +103,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_rad DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeys, sort_keys); DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortKeysDescending, sort_keys_descending); -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper, dispatch_segmented_radix_sort_descending); -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper, dispatch_segmented_radix_sort); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper, + dispatch_segmented_radix_sort_descending); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_wrapper, dispatch_segmented_radix_sort); // TODO: // - int128 diff --git a/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu b/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu index a5b7529f3d1..1c88eb69452 100644 --- a/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu +++ b/cub/test/catch2_test_device_segmented_radix_sort_pairs.cu @@ -46,7 +46,7 @@ // TODO replace with DeviceSegmentedRadixSort::SortPairs 
interface once https://github.com/NVIDIA/cccl/issues/50 is // addressed Temporary wrapper that allows specializing the DeviceSegmentedRadixSort algorithm for different offset // types -template d_keys(const_cast(d_keys_in), d_keys_out); cub::DoubleBuffer d_values(const_cast(d_values_in), d_values_out); auto status = cub::DispatchSegmentedRadixSort< - IS_DESCENDING, + Order, KeyT, ValueT, BeginOffsetIteratorT, @@ -105,9 +105,10 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_segmented_rad DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortPairs, sort_pairs); DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedRadixSort::SortPairsDescending, sort_pairs_descending); -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_pairs_wrapper, +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_pairs_wrapper, dispatch_segmented_radix_sort_pairs_descending); -DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_pairs_wrapper, dispatch_segmented_radix_sort_pairs); +DECLARE_LAUNCH_WRAPPER(dispatch_segmented_radix_sort_pairs_wrapper, + dispatch_segmented_radix_sort_pairs); using custom_value_t = c2h::custom_type_t; using value_types = c2h::type_list; diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu index 95c4794b8cf..1bb00efd1ad 100644 --- a/cub/test/catch2_test_device_transform.cu +++ b/cub/test/catch2_test_device_transform.cu @@ -63,8 +63,7 @@ CUB_RUNTIME_FUNCTION static cudaError_t transform_many_with_alg_entry_point( return cudaSuccess; } - constexpr bool RequiresStableAddress = false; - return cub::detail::transform::dispatch_t, RandomAccessIteratorOut, diff --git a/thrust/thrust/system/cuda/detail/adjacent_difference.h b/thrust/thrust/system/cuda/detail/adjacent_difference.h index 3372b9eaf25..c62aadfc14d 100644 --- a/thrust/thrust/system/cuda/detail/adjacent_difference.h +++ b/thrust/thrust/system/cuda/detail/adjacent_difference.h @@ -73,7 +73,7 @@ namespace cuda_cub namespace __adjacent_difference { -template +template cudaError_t THRUST_RUNTIME_FUNCTION doit_step( void* d_temp_storage, size_t& temp_storage_bytes, @@ -88,11 +88,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( return cudaSuccess; } - constexpr bool may_alias = MayAlias; - constexpr bool read_left = true; + constexpr cub::ReadOption read_left = cub::ReadOption::Left; - using Dispatch32 = cub::DispatchAdjacentDifference; - using Dispatch64 = cub::DispatchAdjacentDifference; + using Dispatch32 = cub::DispatchAdjacentDifference; + using Dispatch64 = cub::DispatchAdjacentDifference; cudaError_t status; THRUST_INDEX_TYPE_DISPATCH2( @@ -115,8 +114,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( cudaStream_t stream, thrust::detail::integral_constant /* comparable */) { - constexpr bool may_alias = true; - return doit_step(d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream); + return doit_step(d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream); } template @@ -135,12 +133,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( // `num_items`. In the latter case, we use an optimized version. 
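// [Editorial sketch, not part of the patch] The branch below replaces the local
// `constexpr bool may_alias` flags with the cub::MayAlias enum introduced in
// dispatch_common.cuh. Assuming the doit_step overload defined earlier in this
// file, the intended call shape is roughly:
//
//   if (first != result)
//   {
//     // input and output are known to be disjoint
//     return doit_step<cub::MayAlias::No>(
//       d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream);
//   }
//   // in-place update: the input may alias the output
//   return doit_step<cub::MayAlias::Yes>(
//     d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream);
//
// The exact template-argument spelling is inferred from the rest of this patch;
// only the enum values themselves are taken verbatim from dispatch_common.cuh.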
if (first != result) { - constexpr bool may_alias = false; - return doit_step(d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream); + return doit_step(d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream); } - constexpr bool may_alias = true; - return doit_step(d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream); + return doit_step(d_temp_storage, temp_storage_bytes, first, result, binary_op, num_items, stream); } template diff --git a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h index a8fa4dd9faf..f723a1e31c0 100644 --- a/thrust/thrust/system/cuda/detail/async/inclusive_scan.h +++ b/thrust/thrust/system/cuda/detail/async/inclusive_scan.h @@ -136,7 +136,6 @@ unique_eager_event async_inclusive_scan_n( using InputValueT = cub::detail::InputValue; using AccumT = typename ::cuda::std:: __accumulator_t::value_type, InitialValueType>; - constexpr bool ForceInclusive = true; using Dispatch32 = cub::DispatchScan, - ForceInclusive>; + cub::ForceInclusive::Yes>; using Dispatch64 = cub::DispatchScan, - ForceInclusive>; + cub::ForceInclusive::Yes>; InputValueT init_value(init); diff --git a/thrust/thrust/system/cuda/detail/copy_if.h b/thrust/thrust/system/cuda/detail/copy_if.h index a0e5f6e254d..4b1577f6bb6 100644 --- a/thrust/thrust/system/cuda/detail/copy_if.h +++ b/thrust/thrust/system/cuda/detail/copy_if.h @@ -87,16 +87,7 @@ namespace cuda_cub namespace detail { -/** - * Enum class to indicate whether the memory of input and output iterators potentially alias one another. - */ -enum class InputMayAliasOutput -{ - no, - yes -}; - -template ::Dispatch(nullptr, - allocation_sizes[0], - first, - stencil, - output, - static_cast(nullptr), - predicate, - equality_op_t{}, - num_items, - stream); + status = cub:: + DispatchSelectIf:: + Dispatch( + nullptr, + allocation_sizes[0], + first, + stencil, + output, + static_cast(nullptr), + predicate, + equality_op_t{}, + num_items, + stream); CUDA_CUB_RET_IF_FAIL(status); status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); @@ -169,25 +150,19 @@ struct DispatchCopyIf OffsetT* d_num_selected_out = thrust::detail::aligned_reinterpret_cast(allocations[1]); // Run algorithm - status = cub::DispatchSelectIf< - InputIt, - StencilIt, - OutputIt, - num_selected_out_it_t, - Predicate, - equality_op_t, - OffsetT, - keep_rejects, - may_alias>::Dispatch(allocations[0], - allocation_sizes[0], - first, - stencil, - output, - d_num_selected_out, - predicate, - equality_op_t{}, - num_items, - stream); + status = cub:: + DispatchSelectIf:: + Dispatch( + allocations[0], + allocation_sizes[0], + first, + stencil, + output, + d_num_selected_out, + predicate, + equality_op_t{}, + num_items, + stream); CUDA_CUB_RET_IF_FAIL(status); // Get number of selected items @@ -199,7 +174,7 @@ struct DispatchCopyIf } }; -template ; + using dispatch64_t = DispatchCopyIf; // Query temporary storage requirements status = dispatch64_t::dispatch( @@ -253,7 +228,7 @@ template & policy, InputIterator first, InputIterator last, OutputIterator result, Predicate pred) { - THRUST_CDP_DISPATCH((return detail::copy_if( + THRUST_CDP_DISPATCH((return detail::copy_if( policy, first, last, static_cast(nullptr), result, pred);), (return thrust::copy_if(cvt_to_seq(derived_cast(policy)), first, last, result, pred);)); } @@ -268,9 +243,8 @@ OutputIterator _CCCL_HOST_DEVICE copy_if( OutputIterator result, Predicate 
pred) { - THRUST_CDP_DISPATCH( - (return detail::copy_if(policy, first, last, stencil, result, pred);), - (return thrust::copy_if(cvt_to_seq(derived_cast(policy)), first, last, stencil, result, pred);)); + THRUST_CDP_DISPATCH((return detail::copy_if(policy, first, last, stencil, result, pred);), + (return thrust::copy_if(cvt_to_seq(derived_cast(policy)), first, last, stencil, result, pred);)); } } // namespace cuda_cub diff --git a/thrust/thrust/system/cuda/detail/partition.h b/thrust/thrust/system/cuda/detail/partition.h index 152381af039..36f69ad7d6e 100644 --- a/thrust/thrust/system/cuda/detail/partition.h +++ b/thrust/thrust/system/cuda/detail/partition.h @@ -87,10 +87,6 @@ struct DispatchPartitionIf std::size_t allocation_sizes[2] = {0, sizeof(OffsetT)}; void* allocations[2] = {nullptr, nullptr}; - // Partitioning algorithm keeps "rejected" items - constexpr bool keep_rejects = true; - constexpr bool may_alias = false; - // Query algorithm memory requirements status = cub::DispatchSelectIf< InputIt, @@ -100,17 +96,16 @@ struct DispatchPartitionIf Predicate, equality_op_t, OffsetT, - keep_rejects, - may_alias>::Dispatch(nullptr, - allocation_sizes[0], - first, - stencil, - output, - static_cast(nullptr), - predicate, - equality_op_t{}, - num_items, - stream); + cub::SelectImpl::Partition>::Dispatch(nullptr, + allocation_sizes[0], + first, + stencil, + output, + static_cast(nullptr), + predicate, + equality_op_t{}, + num_items, + stream); CUDA_CUB_RET_IF_FAIL(status); status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); @@ -141,17 +136,16 @@ struct DispatchPartitionIf Predicate, equality_op_t, OffsetT, - keep_rejects, - may_alias>::Dispatch(allocations[0], - allocation_sizes[0], - first, - stencil, - output, - d_num_selected_out, - predicate, - equality_op_t{}, - num_items, - stream); + cub::SelectImpl::Partition>::Dispatch(allocations[0], + allocation_sizes[0], + first, + stencil, + output, + d_num_selected_out, + predicate, + equality_op_t{}, + num_items, + stream); CUDA_CUB_RET_IF_FAIL(status); // Get number of selected items diff --git a/thrust/thrust/system/cuda/detail/remove.h b/thrust/thrust/system/cuda/detail/remove.h index 99340801941..76d23eeb81d 100644 --- a/thrust/thrust/system/cuda/detail/remove.h +++ b/thrust/thrust/system/cuda/detail/remove.h @@ -50,7 +50,7 @@ template InputIt _CCCL_HOST_DEVICE remove_if(execution_policy& policy, InputIt first, InputIt last, StencilIt stencil, Predicate predicate) { - THRUST_CDP_DISPATCH((return cuda_cub::detail::copy_if( + THRUST_CDP_DISPATCH((return cuda_cub::detail::copy_if( policy, first, last, stencil, first, thrust::not_fn(predicate));), (return thrust::remove_if(cvt_to_seq(derived_cast(policy)), first, last, stencil, predicate);)); } @@ -60,7 +60,7 @@ template InputIt _CCCL_HOST_DEVICE remove_if(execution_policy& policy, InputIt first, InputIt last, Predicate predicate) { THRUST_CDP_DISPATCH( - (return cuda_cub::detail::copy_if( + (return cuda_cub::detail::copy_if( policy, first, last, static_cast(nullptr), first, thrust::not_fn(predicate));), (return thrust::remove_if(cvt_to_seq(derived_cast(policy)), first, last, predicate);)); } diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index 890e02ba198..ff7c0111343 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -121,7 +121,6 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( { using InputValueT = cub::detail::InputValue; using AccumT = 
typename ::cuda::std::__accumulator_t, InitValueT>; - constexpr bool ForceInclusive = true; using Dispatch32 = cub::DispatchScan, - ForceInclusive>; + cub::ForceInclusive::Yes>; using Dispatch64 = cub::DispatchScan, - ForceInclusive>; + cub::ForceInclusive::Yes>; cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; diff --git a/thrust/thrust/system/cuda/detail/transform.h b/thrust/thrust/system/cuda/detail/transform.h index 64154a8f6da..2baf4810550 100644 --- a/thrust/thrust/system/cuda/detail/transform.h +++ b/thrust/thrust/system/cuda/detail/transform.h @@ -255,16 +255,17 @@ OutputIt THRUST_FUNCTION cub_transform_many( return result; } - constexpr auto requires_stable_address = !::cuda::proclaims_copyable_arguments::value; + constexpr auto stable_address = + (::cuda::proclaims_copyable_arguments::value) + ? cub::detail::transform::requires_stable_address::no + : cub::detail::transform::requires_stable_address::yes; cudaError_t status; THRUST_INDEX_TYPE_DISPATCH( status, - (cub::detail::transform::dispatch_t, - OutputIt, - TransformOp>::dispatch), + (cub::detail::transform:: + dispatch_t, OutputIt, TransformOp>:: + dispatch), num_items, (firsts, result, num_items_fixed, transform_op, cuda_cub::stream(policy))); throw_on_error(status, "transform: failed inside CUB");
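[Editorial note, not part of the patch] To make the new requires_stable_address knob concrete, the standalone sketch below mirrors the selection logic from cub_transform_many above. It assumes the CCCL headers named in this diff; square_t and the surrounding scaffolding are hypothetical and exist only to illustrate how the trait and the enum interact.

#include <cuda/functional> // ::cuda::proclaims_copyable_arguments (assumed header)
#include <cub/device/dispatch/dispatch_transform.cuh>

// Hypothetical transform op; an ordinary functor does not proclaim copyable arguments.
struct square_t
{
  __host__ __device__ int operator()(int x) const
  {
    return x * x;
  }
};

// Same compile-time mapping as in cub_transform_many: ops whose arguments are
// proclaimed copyable may be materialized in registers or shared memory, while
// everything else must read its inputs through a stable address.
constexpr auto stable_address =
  ::cuda::proclaims_copyable_arguments<square_t>::value
    ? cub::detail::transform::requires_stable_address::no
    : cub::detail::transform::requires_stable_address::yes;

static_assert(stable_address == cub::detail::transform::requires_stable_address::yes,
              "plain functors are conservatively treated as needing a stable address");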