Skip to content

Commit

Permalink
Move remaining CUB policy hubs to tuning headers (#3141)
Browse files Browse the repository at this point in the history
* Move merge policy hub to tuning header
* Move merge_sort policy hub to tuning header
* Move reduce policy hub to tuning header
* Move segmented_sort policy hub to tuning header
* Move transform policy hub to tuning header

Fixes: #3097
  • Loading branch information
bernhardmgruber authored Dec 12, 2024
1 parent 650cbad commit 5141553
Show file tree
Hide file tree
Showing 11 changed files with 853 additions and 688 deletions.
53 changes: 2 additions & 51 deletions cub/cub/device/dispatch/dispatch_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#endif // no system header

#include <cub/agent/agent_merge.cuh>
#include <cub/device/dispatch/tuning/tuning_merge.cuh>
#include <cub/util_device.cuh>
#include <cub/util_type.cuh>
#include <cub/util_vsmem.cuh>
Expand Down Expand Up @@ -155,56 +156,6 @@ __launch_bounds__(
vsmem_helper_t::discard_temp_storage(temp_storage);
}

template <typename KeyT, typename ValueT>
struct device_merge_policy_hub
{
static constexpr bool has_values = !::cuda::std::is_same<ValueT, NullType>::value;

using tune_type = char[has_values ? sizeof(KeyT) + sizeof(ValueT) : sizeof(KeyT)];

struct policy300 : ChainedPolicy<300, policy300, policy300>
{
using merge_policy =
agent_policy_t<128,
Nominal4BItemsToItems<tune_type>(7),
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_DEFAULT,
BLOCK_STORE_WARP_TRANSPOSE>;
};

struct policy350 : ChainedPolicy<350, policy350, policy300>
{
using merge_policy =
agent_policy_t<256,
Nominal4BItemsToItems<tune_type>(11),
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_LDG,
BLOCK_STORE_WARP_TRANSPOSE>;
};

struct policy520 : ChainedPolicy<520, policy520, policy350>
{
using merge_policy =
agent_policy_t<512,
Nominal4BItemsToItems<tune_type>(13),
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_LDG,
BLOCK_STORE_WARP_TRANSPOSE>;
};

struct policy600 : ChainedPolicy<600, policy600, policy520>
{
using merge_policy =
agent_policy_t<512,
Nominal4BItemsToItems<tune_type>(15),
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_DEFAULT,
BLOCK_STORE_WARP_TRANSPOSE>;
};

using max_policy = policy600;
};

template <typename KeyIt1,
typename ValueIt1,
typename KeyIt2,
Expand All @@ -213,7 +164,7 @@ template <typename KeyIt1,
typename ValueIt3,
typename Offset,
typename CompareOp,
typename PolicyHub = device_merge_policy_hub<value_t<KeyIt1>, value_t<ValueIt1>>>
typename PolicyHub = detail::merge::policy_hub<value_t<KeyIt1>, value_t<ValueIt1>>>
struct dispatch_t
{
void* d_temp_storage;
Expand Down
51 changes: 2 additions & 49 deletions cub/cub/device/dispatch/dispatch_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#endif // no system header

#include <cub/agent/agent_merge_sort.cuh>
#include <cub/device/dispatch/tuning/tuning_merge_sort.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_device.cuh>
#include <cub/util_math.cuh>
Expand Down Expand Up @@ -327,61 +328,13 @@ __launch_bounds__(
* Policy
******************************************************************************/

template <typename KeyIteratorT>
struct DeviceMergeSortPolicy
{
using KeyT = cub::detail::value_t<KeyIteratorT>;

//----------------------------------------------------------------------------
// Architecture-specific tuning policies
//----------------------------------------------------------------------------

struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
{
using MergeSortPolicy =
AgentMergeSortPolicy<256,
Nominal4BItemsToItems<KeyT>(11),
cub::BLOCK_LOAD_WARP_TRANSPOSE,
cub::LOAD_LDG,
cub::BLOCK_STORE_WARP_TRANSPOSE>;
};

// NVBug 3384810
#if defined(_NVHPC_CUDA)
using Policy520 = Policy350;
#else
struct Policy520 : ChainedPolicy<520, Policy520, Policy350>
{
using MergeSortPolicy =
AgentMergeSortPolicy<512,
Nominal4BItemsToItems<KeyT>(15),
cub::BLOCK_LOAD_WARP_TRANSPOSE,
cub::LOAD_LDG,
cub::BLOCK_STORE_WARP_TRANSPOSE>;
};
#endif

struct Policy600 : ChainedPolicy<600, Policy600, Policy520>
{
using MergeSortPolicy =
AgentMergeSortPolicy<256,
Nominal4BItemsToItems<KeyT>(17),
cub::BLOCK_LOAD_WARP_TRANSPOSE,
cub::LOAD_DEFAULT,
cub::BLOCK_STORE_WARP_TRANSPOSE>;
};

/// MaxPolicy
using MaxPolicy = Policy600;
};

template <typename KeyInputIteratorT,
typename ValueInputIteratorT,
typename KeyIteratorT,
typename ValueIteratorT,
typename OffsetT,
typename CompareOpT,
typename SelectedPolicy = DeviceMergeSortPolicy<KeyIteratorT>>
typename SelectedPolicy = detail::merge_sort::policy_hub<KeyIteratorT>>
struct DispatchMergeSort : SelectedPolicy
{
using KeyT = cub::detail::value_t<KeyIteratorT>;
Expand Down
131 changes: 4 additions & 127 deletions cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@

#include <cub/agent/agent_reduce.cuh>
#include <cub/device/dispatch/kernels/reduce.cuh>
#include <cub/device/dispatch/tuning/tuning_reduce.cuh>
#include <cub/grid/grid_even_share.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cub/launcher/cuda_runtime.cuh>
Expand Down Expand Up @@ -191,130 +192,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
}
}

/******************************************************************************
* Policy
******************************************************************************/

template <typename PolicyT, typename = void>
struct ReducePolicyWrapper : PolicyT
{
CUB_RUNTIME_FUNCTION ReducePolicyWrapper(PolicyT base)
: PolicyT(base)
{}
};

template <typename StaticPolicyT>
struct ReducePolicyWrapper<StaticPolicyT,
_CUDA_VSTD::void_t<typename StaticPolicyT::ReducePolicy,
typename StaticPolicyT::SingleTilePolicy,
typename StaticPolicyT::SegmentedReducePolicy>> : StaticPolicyT
{
CUB_RUNTIME_FUNCTION ReducePolicyWrapper(StaticPolicyT base)
: StaticPolicyT(base)
{}

CUB_DEFINE_SUB_POLICY_GETTER(Reduce)
CUB_DEFINE_SUB_POLICY_GETTER(SingleTile)
CUB_DEFINE_SUB_POLICY_GETTER(SegmentedReduce)
};

template <typename PolicyT>
CUB_RUNTIME_FUNCTION ReducePolicyWrapper<PolicyT> MakeReducePolicyWrapper(PolicyT policy)
{
return ReducePolicyWrapper<PolicyT>{policy};
}

/**
* @tparam AccumT
* Accumulator data type
*
* OffsetT
* Signed integer type for global offsets
*
* ReductionOpT
* Binary reduction functor type having member
* `auto operator()(const T &a, const U &b)`
*/
template <typename AccumT, typename OffsetT, typename ReductionOpT>
struct DeviceReducePolicy
{
//---------------------------------------------------------------------------
// Architecture-specific tuning policies
//---------------------------------------------------------------------------

/// SM30
struct Policy300 : ChainedPolicy<300, Policy300, Policy300>
{
static constexpr int threads_per_block = 256;
static constexpr int items_per_thread = 20;
static constexpr int items_per_vec_load = 2;

// ReducePolicy (GTX670: 154.0 @ 48M 4B items)
using ReducePolicy =
AgentReducePolicy<threads_per_block,
items_per_thread,
AccumT,
items_per_vec_load,
BLOCK_REDUCE_WARP_REDUCTIONS,
LOAD_DEFAULT>;

// SingleTilePolicy
using SingleTilePolicy = ReducePolicy;

// SegmentedReducePolicy
using SegmentedReducePolicy = ReducePolicy;
};

/// SM35
struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
{
static constexpr int threads_per_block = 256;
static constexpr int items_per_thread = 20;
static constexpr int items_per_vec_load = 4;

// ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B
// items)
using ReducePolicy =
AgentReducePolicy<threads_per_block,
items_per_thread,
AccumT,
items_per_vec_load,
BLOCK_REDUCE_WARP_REDUCTIONS,
LOAD_LDG>;

// SingleTilePolicy
using SingleTilePolicy = ReducePolicy;

// SegmentedReducePolicy
using SegmentedReducePolicy = ReducePolicy;
};

/// SM60
struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
{
static constexpr int threads_per_block = 256;
static constexpr int items_per_thread = 16;
static constexpr int items_per_vec_load = 4;

// ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
using ReducePolicy =
AgentReducePolicy<threads_per_block,
items_per_thread,
AccumT,
items_per_vec_load,
BLOCK_REDUCE_WARP_REDUCTIONS,
LOAD_LDG>;

// SingleTilePolicy
using SingleTilePolicy = ReducePolicy;

// SegmentedReducePolicy
using SegmentedReducePolicy = ReducePolicy;
};

using MaxPolicy = Policy600;
};

template <typename MaxPolicyT,
typename InputIteratorT,
typename OutputIteratorT,
Expand Down Expand Up @@ -385,7 +262,7 @@ template <typename InputIteratorT,
typename ReductionOpT,
typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::value_t<InputIteratorT>, InitT>,
typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>,
typename SelectedPolicy = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>,
typename TransformOpT = ::cuda::std::__identity,
typename KernelSource = DeviceReduceKernelSource<
typename SelectedPolicy::MaxPolicy,
Expand Down Expand Up @@ -867,7 +744,7 @@ template <
typename InitT,
typename AccumT = ::cuda::std::
__accumulator_t<ReductionOpT, cub::detail::invoke_result_t<TransformOpT, cub::detail::value_t<InputIteratorT>>, InitT>,
typename SelectedPolicyT = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>,
typename SelectedPolicyT = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>,
typename KernelSource = DeviceReduceKernelSource<
typename SelectedPolicyT::MaxPolicy,
InputIteratorT,
Expand Down Expand Up @@ -930,7 +807,7 @@ template <typename InputIteratorT,
typename ReductionOpT,
typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::value_t<InputIteratorT>, InitT>,
typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
typename SelectedPolicy = detail::reduce::policy_hub<AccumT, OffsetT, ReductionOpT>>
struct DispatchSegmentedReduce : SelectedPolicy
{
//---------------------------------------------------------------------------
Expand Down
Loading

0 comments on commit 5141553

Please sign in to comment.