Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace bool template parameters on Dispatch* class templates with enum classes #3643

Merged
merged 15 commits
Feb 4, 2025
Merged
22 changes: 15 additions & 7 deletions cub/benchmarks/bench/adjacent_difference/subtract_left.cu
Original file line number Diff line number Diff line change
Expand Up @@ -52,20 +52,28 @@ struct policy_hub_t
template <class T, class OffsetT>
void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
constexpr bool may_alias = false;
constexpr bool read_left = true;

using input_it_t = const T*;
using output_it_t = T*;
using difference_op_t = ::cuda::std::minus<>;
using offset_t = cub::detail::choose_offset_t<OffsetT>;

#if !TUNE_BASE
using dispatch_t = cub::
DispatchAdjacentDifference<input_it_t, output_it_t, difference_op_t, offset_t, may_alias, read_left, policy_hub_t>;
using dispatch_t = cub::DispatchAdjacentDifference<
input_it_t,
output_it_t,
difference_op_t,
offset_t,
cub::MayAlias::No,
cub::ReadOption::Left,
policy_hub_t>;
#else
using dispatch_t =
cub::DispatchAdjacentDifference<input_it_t, output_it_t, difference_op_t, offset_t, may_alias, read_left>;
using dispatch_t = cub::DispatchAdjacentDifference<
input_it_t,
output_it_t,
difference_op_t,
offset_t,
cub::MayAlias::No,
cub::ReadOption::Left>;
#endif // TUNE_BASE

const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
Expand Down
6 changes: 2 additions & 4 deletions cub/benchmarks/bench/copy/memcpy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
*
******************************************************************************/

#include <cub/device/device_copy.cuh>
#include <cub/device/device_memcpy.cuh>

// %RANGE% TUNE_THREADS tpb 128:1024:32
// %RANGE% TUNE_BUFFERS_PER_THREAD bpt 1:18:1
Expand Down Expand Up @@ -184,8 +184,6 @@ void copy(nvbench::state& state,
using buffer_offset_t = std::uint32_t;
using block_offset_t = std::uint32_t;

constexpr bool is_memcpy = true;

#if !TUNE_BASE
using policy_t = policy_hub_t;
#else
Expand All @@ -199,7 +197,7 @@ void copy(nvbench::state& state,
buffer_offset_t,
block_offset_t,
policy_t,
is_memcpy>;
cub::CopyAlg::Memcpy>;

thrust::device_vector<T> input_buffer = generate(elements);
thrust::device_vector<T> output_buffer(elements);
Expand Down
6 changes: 1 addition & 5 deletions cub/benchmarks/bench/partition/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,6 @@
// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1
// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5

constexpr bool keep_rejects = true;
constexpr bool may_alias = false;

#if !TUNE_BASE
# if TUNE_TRANSPOSE == 0
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
Expand Down Expand Up @@ -119,8 +116,7 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
select_op_t,
equality_op_t,
offset_t,
keep_rejects,
may_alias
cub::SelectImpl::Partition
#if !TUNE_BASE
,
policy_hub_t<T>
Expand Down
6 changes: 1 addition & 5 deletions cub/benchmarks/bench/partition/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,6 @@
// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1
// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5

constexpr bool keep_rejects = true;
constexpr bool may_alias = false;

#if !TUNE_BASE
# if TUNE_TRANSPOSE == 0
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
Expand Down Expand Up @@ -145,8 +142,7 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
select_op_t,
equality_op_t,
offset_t,
keep_rejects,
may_alias
cub::SelectImpl::Partition
#if !TUNE_BASE
,
policy_hub_t<T>
Expand Down
14 changes: 7 additions & 7 deletions cub/benchmarks/bench/radix_sort/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@

using value_t = cub::NullType;

constexpr bool is_descending = false;
constexpr bool is_overwrite_ok = false;
constexpr cub::SortOrder sort_order = cub::SortOrder::Ascending;
constexpr bool is_overwrite_ok = false;

#if !TUNE_BASE
template <typename KeyT, typename ValueT, typename OffsetT>
Expand Down Expand Up @@ -105,10 +105,10 @@ constexpr std::size_t max_onesweep_temp_storage_size()
using portion_offset = int;
using onesweep_policy = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t::OnesweepPolicy;
using agent_radix_sort_onesweep_t =
cub::AgentRadixSortOnesweep<onesweep_policy, is_descending, KeyT, ValueT, OffsetT, portion_offset>;
cub::AgentRadixSortOnesweep<onesweep_policy, sort_order, KeyT, ValueT, OffsetT, portion_offset>;

using hist_policy = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t::HistogramPolicy;
using hist_agent = cub::AgentRadixSortHistogram<hist_policy, is_descending, KeyT, OffsetT>;
using hist_agent = cub::AgentRadixSortHistogram<hist_policy, sort_order, KeyT, OffsetT>;

return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage),
sizeof(typename hist_agent::TempStorage));
Expand Down Expand Up @@ -144,9 +144,9 @@ void radix_sort_keys(std::integral_constant<bool, true>, nvbench::state& state,
using key_t = T;
#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, value_t, offset_t>;
using dispatch_t = cub::DispatchRadixSort<is_descending, key_t, value_t, offset_t, policy_t>;
using dispatch_t = cub::DispatchRadixSort<sort_order, key_t, value_t, offset_t, policy_t>;
#else // TUNE_BASE
using dispatch_t = cub::DispatchRadixSort<is_descending, key_t, value_t, offset_t>;
using dispatch_t = cub::DispatchRadixSort<sort_order, key_t, value_t, offset_t>;
#endif // TUNE_BASE

constexpr int begin_bit = 0;
Expand Down Expand Up @@ -207,7 +207,7 @@ void radix_sort_keys(std::integral_constant<bool, true>, nvbench::state& state,
template <typename T, typename OffsetT>
void radix_sort_keys(std::integral_constant<bool, false>, nvbench::state&, nvbench::type_list<T, OffsetT>)
{
(void) is_descending;
(void) sort_order;
(void) is_overwrite_ok;
}

Expand Down
14 changes: 7 additions & 7 deletions cub/benchmarks/bench/radix_sort/pairs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@
// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32

constexpr bool is_descending = false;
constexpr bool is_overwrite_ok = false;
constexpr cub::SortOrder sort_order = cub::SortOrder::Ascending;
constexpr bool is_overwrite_ok = false;

#if !TUNE_BASE
template <typename KeyT, typename ValueT, typename OffsetT>
Expand Down Expand Up @@ -103,10 +103,10 @@ constexpr std::size_t max_onesweep_temp_storage_size()
using portion_offset = int;
using onesweep_policy = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t::OnesweepPolicy;
using agent_radix_sort_onesweep_t =
cub::AgentRadixSortOnesweep<onesweep_policy, is_descending, KeyT, ValueT, OffsetT, portion_offset>;
cub::AgentRadixSortOnesweep<onesweep_policy, sort_order, KeyT, ValueT, OffsetT, portion_offset>;

using hist_policy = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t::HistogramPolicy;
using hist_agent = cub::AgentRadixSortHistogram<hist_policy, is_descending, KeyT, OffsetT>;
using hist_agent = cub::AgentRadixSortHistogram<hist_policy, sort_order, KeyT, OffsetT>;

return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage),
sizeof(typename hist_agent::TempStorage));
Expand Down Expand Up @@ -144,9 +144,9 @@ void radix_sort_values(
using value_t = ValueT;
#if !TUNE_BASE
using policy_t = policy_hub_t<key_t, value_t, offset_t>;
using dispatch_t = cub::DispatchRadixSort<is_descending, key_t, value_t, offset_t, policy_t>;
using dispatch_t = cub::DispatchRadixSort<sort_order, key_t, value_t, offset_t, policy_t>;
#else // TUNE_BASE
using dispatch_t = cub::DispatchRadixSort<is_descending, key_t, value_t, offset_t>;
using dispatch_t = cub::DispatchRadixSort<sort_order, key_t, value_t, offset_t>;
#endif // TUNE_BASE

constexpr int begin_bit = 0;
Expand Down Expand Up @@ -212,7 +212,7 @@ void radix_sort_values(
template <typename KeyT, typename ValueT, typename OffsetT>
void radix_sort_values(std::integral_constant<bool, false>, nvbench::state&, nvbench::type_list<KeyT, ValueT, OffsetT>)
{
(void) is_descending;
(void) sort_order;
(void) is_overwrite_ok;
}

Expand Down
8 changes: 4 additions & 4 deletions cub/benchmarks/bench/segmented_sort/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ void seg_sort(nvbench::state& state,
const thrust::device_vector<OffsetT>& offsets,
bit_entropy entropy)
{
constexpr bool is_descending = false;
constexpr bool is_overwrite_ok = false;
constexpr cub::SortOrder sort_order = cub::SortOrder::Ascending;
constexpr bool is_overwrite_ok = false;

using offset_t = OffsetT;
using begin_offset_it_t = const offset_t*;
Expand All @@ -165,10 +165,10 @@ void seg_sort(nvbench::state& state,
#if !TUNE_BASE
using policy_t = device_seg_sort_policy_hub<key_t>;
using dispatch_t = //
cub::DispatchSegmentedSort<is_descending, key_t, value_t, offset_t, begin_offset_it_t, end_offset_it_t, policy_t>;
cub::DispatchSegmentedSort<sort_order, key_t, value_t, offset_t, begin_offset_it_t, end_offset_it_t, policy_t>;
#else
using dispatch_t = //
cub::DispatchSegmentedSort<is_descending, key_t, value_t, offset_t, begin_offset_it_t, end_offset_it_t>;
cub::DispatchSegmentedSort<sort_order, key_t, value_t, offset_t, begin_offset_it_t, end_offset_it_t>;
#endif

const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
Expand Down
59 changes: 28 additions & 31 deletions cub/benchmarks/bench/select/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@
// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1
// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5

constexpr bool keep_rejects = false;

#if !TUNE_BASE
# if TUNE_TRANSPOSE == 0
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
Expand Down Expand Up @@ -78,17 +76,18 @@ struct policy_hub_t
};
#endif // !TUNE_BASE

template <typename T, typename OffsetT, typename MayAlias>
void select(nvbench::state& state, nvbench::type_list<T, OffsetT, MayAlias>)
template <typename T, typename OffsetT, typename InPlace>
void select(nvbench::state& state, nvbench::type_list<T, OffsetT, InPlace>)
{
using input_it_t = const T*;
using flag_it_t = const bool*;
using output_it_t = T*;
using num_selected_it_t = OffsetT*;
using select_op_t = cub::NullType;
using equality_op_t = cub::NullType;
using offset_t = OffsetT;
constexpr bool may_alias = MayAlias::value;
using input_it_t = const T*;
using flag_it_t = const bool*;
using output_it_t = T*;
using num_selected_it_t = OffsetT*;
using select_op_t = cub::NullType;
using equality_op_t = cub::NullType;
using offset_t = OffsetT;
constexpr cub::SelectImpl selection_option =
InPlace::value ? cub::SelectImpl::SelectPotentiallyInPlace : cub::SelectImpl::Select;

#if !TUNE_BASE
using policy_t = policy_hub_t<T>;
Expand All @@ -100,20 +99,18 @@ void select(nvbench::state& state, nvbench::type_list<T, OffsetT, MayAlias>)
select_op_t,
equality_op_t,
offset_t,
keep_rejects,
may_alias,
selection_option,
policy_t>;
#else // TUNE_BASE
using dispatch_t = cub::DispatchSelectIf<
input_it_t,
flag_it_t,
output_it_t,
num_selected_it_t,
select_op_t,
equality_op_t,
offset_t,
keep_rejects,
may_alias>;
using dispatch_t =
cub::DispatchSelectIf<input_it_t,
flag_it_t,
output_it_t,
num_selected_it_t,
select_op_t,
equality_op_t,
offset_t,
selection_option>;
#endif // !TUNE_BASE

// Retrieve axis parameters
Expand Down Expand Up @@ -165,18 +162,18 @@ void select(nvbench::state& state, nvbench::type_list<T, OffsetT, MayAlias>)

using ::cuda::std::false_type;
using ::cuda::std::true_type;
#ifdef TUNE_MayAlias
using may_alias = nvbench::type_list<TUNE_MayAlias>; // expands to "false_type" or "true_type"
#else // !defined(TUNE_MayAlias)
using may_alias = nvbench::type_list<false_type, true_type>;
#endif // TUNE_MayAlias
#ifdef TUNE_InPlace
using is_in_place = nvbench::type_list<TUNE_InPlace>; // expands to "false_type" or "true_type"
#else // !defined(TUNE_InPlace)
using is_in_place = nvbench::type_list<false_type, true_type>;
#endif // TUNE_InPlace

// The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes
// using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI).
using select_offset_types = nvbench::type_list<int64_t>;

NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, may_alias))
NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, select_offset_types, is_in_place))
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}", "MayAlias{ct}"})
.set_type_axes_names({"T{ct}", "OffsetT{ct}", "InPlace{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
Loading
Loading