NVIDIA · bernhardmgruber · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024
@@ -14,6 +14,7 @@
 #endif // no system header
 
 #include <cub/agent/agent_merge.cuh>
+#include <cub/device/dispatch/tuning/tuning_merge.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 #include <cub/util_vsmem.cuh>
@@ -155,56 +156,6 @@ __launch_bounds__(
   vsmem_helper_t::discard_temp_storage(temp_storage);
 }
 
-template <typename KeyT, typename ValueT>
-struct device_merge_policy_hub
-{
-  static constexpr bool has_values = !::cuda::std::is_same<ValueT, NullType>::value;
-
-  using tune_type = char[has_values ? sizeof(KeyT) + sizeof(ValueT) : sizeof(KeyT)];
-
-  struct policy300 : ChainedPolicy<300, policy300, policy300>
-  {
-    using merge_policy =
-      agent_policy_t<128,
-                     Nominal4BItemsToItems<tune_type>(7),
-                     BLOCK_LOAD_WARP_TRANSPOSE,
-                     LOAD_DEFAULT,
-                     BLOCK_STORE_WARP_TRANSPOSE>;
-  };
-
-  struct policy350 : ChainedPolicy<350, policy350, policy300>
-  {
-    using merge_policy =
-      agent_policy_t<256,
-                     Nominal4BItemsToItems<tune_type>(11),
-                     BLOCK_LOAD_WARP_TRANSPOSE,
-                     LOAD_LDG,
-                     BLOCK_STORE_WARP_TRANSPOSE>;
-  };
-
-  struct policy520 : ChainedPolicy<520, policy520, policy350>
-  {
-    using merge_policy =
-      agent_policy_t<512,
-                     Nominal4BItemsToItems<tune_type>(13),
-                     BLOCK_LOAD_WARP_TRANSPOSE,
-                     LOAD_LDG,
-                     BLOCK_STORE_WARP_TRANSPOSE>;
-  };
-
-  struct policy600 : ChainedPolicy<600, policy600, policy520>
-  {
-    using merge_policy =
-      agent_policy_t<512,
-                     Nominal4BItemsToItems<tune_type>(15),
-                     BLOCK_LOAD_WARP_TRANSPOSE,
-                     LOAD_DEFAULT,
-                     BLOCK_STORE_WARP_TRANSPOSE>;
-  };
-
-  using max_policy = policy600;
-};
-
 template <typename KeyIt1,
           typename ValueIt1,
           typename KeyIt2,
@@ -213,7 +164,7 @@ template <typename KeyIt1,
           typename ValueIt3,
           typename Offset,
           typename CompareOp,
-          typename PolicyHub = device_merge_policy_hub<value_t<KeyIt1>, value_t<ValueIt1>>>
+          typename PolicyHub = detail::merge::policy_hub<value_t<KeyIt1>, value_t<ValueIt1>>>
 struct dispatch_t
 {
   void* d_temp_storage;

@@ -38,6 +38,7 @@
 #endif // no system header
 
 #include <cub/agent/agent_merge_sort.cuh>
+#include <cub/device/dispatch/tuning/tuning_merge_sort.cuh>
 #include <cub/util_deprecated.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_math.cuh>
@@ -327,61 +328,13 @@ __launch_bounds__(
  * Policy
  ******************************************************************************/
 
-template <typename KeyIteratorT>
-struct DeviceMergeSortPolicy
-{
-  using KeyT = cub::detail::value_t<KeyIteratorT>;
-
-  //----------------------------------------------------------------------------
-  // Architecture-specific tuning policies
-  //----------------------------------------------------------------------------
-
-  struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
-  {
-    using MergeSortPolicy =
-      AgentMergeSortPolicy<256,
-                           Nominal4BItemsToItems<KeyT>(11),
-                           cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                           cub::LOAD_LDG,
-                           cub::BLOCK_STORE_WARP_TRANSPOSE>;
-  };
-
-// NVBug 3384810
-#if defined(_NVHPC_CUDA)
-  using Policy520 = Policy350;
-#else
-  struct Policy520 : ChainedPolicy<520, Policy520, Policy350>
-  {
-    using MergeSortPolicy =
-      AgentMergeSortPolicy<512,
-                           Nominal4BItemsToItems<KeyT>(15),
-                           cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                           cub::LOAD_LDG,
-                           cub::BLOCK_STORE_WARP_TRANSPOSE>;
-  };
-#endif
-
-  struct Policy600 : ChainedPolicy<600, Policy600, Policy520>
-  {
-    using MergeSortPolicy =
-      AgentMergeSortPolicy<256,
-                           Nominal4BItemsToItems<KeyT>(17),
-                           cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                           cub::LOAD_DEFAULT,
-                           cub::BLOCK_STORE_WARP_TRANSPOSE>;
-  };
-
-  /// MaxPolicy
-  using MaxPolicy = Policy600;
-};
-
 template <typename KeyInputIteratorT,
           typename ValueInputIteratorT,
           typename KeyIteratorT,
           typename ValueIteratorT,
           typename OffsetT,
           typename CompareOpT,
-          typename SelectedPolicy = DeviceMergeSortPolicy<KeyIteratorT>>
+          typename SelectedPolicy = detail::merge_sort::policy_hub<KeyIteratorT>>
 struct DispatchMergeSort : SelectedPolicy
 {
   using KeyT   = cub::detail::value_t<KeyIteratorT>;

@@ -46,6 +46,7 @@
 
 #include <cub/agent/agent_reduce.cuh>
 #include <cub/device/dispatch/kernels/reduce.cuh>
+#include <cub/device/dispatch/tuning/tuning_reduce.cuh>
 #include <cub/grid/grid_even_share.cuh>
 #include <cub/iterator/arg_index_input_iterator.cuh>
 #include <cub/launcher/cuda_runtime.cuh>
@@ -191,130 +192,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   }
 }
 
-/******************************************************************************
- * Policy
- ******************************************************************************/
-
-template <typename PolicyT, typename = void>
-struct ReducePolicyWrapper : PolicyT
-{
-  CUB_RUNTIME_FUNCTION ReducePolicyWrapper(PolicyT base)
-      : PolicyT(base)
-  {}
-};
-
-template <typename StaticPolicyT>
-struct ReducePolicyWrapper<StaticPolicyT,
-                           _CUDA_VSTD::void_t<typename StaticPolicyT::ReducePolicy,
-                                              typename StaticPolicyT::SingleTilePolicy,
-                                              typename StaticPolicyT::SegmentedReducePolicy>> : StaticPolicyT
-{
-  CUB_RUNTIME_FUNCTION ReducePolicyWrapper(StaticPolicyT base)
-      : StaticPolicyT(base)
-  {}
-
-  CUB_DEFINE_SUB_POLICY_GETTER(Reduce)
-  CUB_DEFINE_SUB_POLICY_GETTER(SingleTile)
-  CUB_DEFINE_SUB_POLICY_GETTER(SegmentedReduce)
-};
-
-template <typename PolicyT>
-CUB_RUNTIME_FUNCTION ReducePolicyWrapper<PolicyT> MakeReducePolicyWrapper(PolicyT policy)
-{
-  return ReducePolicyWrapper<PolicyT>{policy};
-}
-
-/**
- * @tparam AccumT
- *   Accumulator data type
- *
- * OffsetT
- *   Signed integer type for global offsets
- *
- * ReductionOpT
- *   Binary reduction functor type having member
- *   `auto operator()(const T &a, const U &b)`
- */
-template <typename AccumT, typename OffsetT, typename ReductionOpT>
-struct DeviceReducePolicy
-{
-  //---------------------------------------------------------------------------
-  // Architecture-specific tuning policies
-  //---------------------------------------------------------------------------
-
-  /// SM30
-  struct Policy300 : ChainedPolicy<300, Policy300, Policy300>
-  {
-    static constexpr int threads_per_block  = 256;
-    static constexpr int items_per_thread   = 20;
-    static constexpr int items_per_vec_load = 2;
-
-    // ReducePolicy (GTX670: 154.0 @ 48M 4B items)
-    using ReducePolicy =
-      AgentReducePolicy<threads_per_block,
-                        items_per_thread,
-                        AccumT,
-                        items_per_vec_load,
-                        BLOCK_REDUCE_WARP_REDUCTIONS,
-                        LOAD_DEFAULT>;
-
-    // SingleTilePolicy
-    using SingleTilePolicy = ReducePolicy;
-
-    // SegmentedReducePolicy
-    using SegmentedReducePolicy = ReducePolicy;
-  };
-
-  /// SM35
-  struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
-  {
-    static constexpr int threads_per_block  = 256;
-    static constexpr int items_per_thread   = 20;
-    static constexpr int items_per_vec_load = 4;
-
-    // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B
-    // items)
-    using ReducePolicy =
-      AgentReducePolicy<threads_per_block,
-                        items_per_thread,
-                        AccumT,
-                        items_per_vec_load,
-                        BLOCK_REDUCE_WARP_REDUCTIONS,
-                        LOAD_LDG>;
-
-    // SingleTilePolicy
-    using SingleTilePolicy = ReducePolicy;
-
-    // SegmentedReducePolicy
-    using SegmentedReducePolicy = ReducePolicy;
-  };
-
-  /// SM60
-  struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
-  {
-    static constexpr int threads_per_block  = 256;
-    static constexpr int items_per_thread   = 16;
-    static constexpr int items_per_vec_load = 4;
-
-    // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
-    using ReducePolicy =
-      AgentReducePolicy<threads_per_block,
-                        items_per_thread,
-                        AccumT,
-                        items_per_vec_load,
-                        BLOCK_REDUCE_WARP_REDUCTIONS,
-                        LOAD_LDG>;
-
-    // SingleTilePolicy
-    using SingleTilePolicy = ReducePolicy;
-
-    // SegmentedReducePolicy
-    using SegmentedReducePolicy = ReducePolicy;
-  };
-
-  using MaxPolicy = Policy600;
-};
-
 template <typename MaxPolicyT,
           typename InputIteratorT,
           typename OutputIteratorT,
@@ -385,7 +262,7 @@ template <typename InputIteratorT,
           typename ReductionOpT,
           typename InitT  = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
           typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::value_t<InputIteratorT>, InitT>,
-          typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>,
+          typename SelectedPolicy = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>,
           typename TransformOpT   = ::cuda::std::__identity,
           typename KernelSource   = DeviceReduceKernelSource<
               typename SelectedPolicy::MaxPolicy,
@@ -867,7 +744,7 @@ template <
   typename InitT,
   typename AccumT = ::cuda::std::
     __accumulator_t<ReductionOpT, cub::detail::invoke_result_t<TransformOpT, cub::detail::value_t<InputIteratorT>>, InitT>,
-  typename SelectedPolicyT = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>,
+  typename SelectedPolicyT = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>,
   typename KernelSource    = DeviceReduceKernelSource<
        typename SelectedPolicyT::MaxPolicy,
        InputIteratorT,
@@ -930,7 +807,7 @@ template <typename InputIteratorT,
           typename ReductionOpT,
           typename InitT  = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
           typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::value_t<InputIteratorT>, InitT>,
-          typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
+          typename SelectedPolicy = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>>
 struct DispatchSegmentedReduce : SelectedPolicy
 {
   //---------------------------------------------------------------------------